diff --git a/.autover/autover.json b/.autover/autover.json index 8985c52bb..02f2ad0db 100644 --- a/.autover/autover.json +++ b/.autover/autover.json @@ -47,6 +47,11 @@ "Name": "Amazon.Lambda.Core", "Path": "Libraries/src/Amazon.Lambda.Core/Amazon.Lambda.Core.csproj" }, + { + "Name": "Amazon.Lambda.DurableExecution", + "Path": "Libraries/src/Amazon.Lambda.DurableExecution/Amazon.Lambda.DurableExecution.csproj", + "PrereleaseLabel": "preview" + }, { "Name": "Amazon.Lambda.DynamoDBEvents", "Path": "Libraries/src/Amazon.Lambda.DynamoDBEvents/Amazon.Lambda.DynamoDBEvents.csproj" diff --git a/.autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json b/.autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json new file mode 100644 index 000000000..42a1cec69 --- /dev/null +++ b/.autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Minor", + "ChangelogMessages": [ + "Implement NestingType.Flat for ParallelAsync and MapAsync (previously threw NotSupportedException). Under Flat, each branch/item runs in a virtual context that emits no per-branch CONTEXT checkpoint; per-branch results and errors are recorded inline on the parent operation's payload, reducing checkpoint volume. Operations inside a flat branch (steps, waits) still checkpoint, re-parented to the parallel/map operation. NestingType.Nested remains the default." + ] + } + ] +} diff --git a/.autover/changes/91693d62-b0c7-49b0-a74f-531aa1509864.json b/.autover/changes/91693d62-b0c7-49b0-a74f-531aa1509864.json new file mode 100644 index 000000000..41fab0859 --- /dev/null +++ b/.autover/changes/91693d62-b0c7-49b0-a74f-531aa1509864.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Patch", + "ChangelogMessages": [ + "Initial preview release of the Durable Execution SDK for .NET. 
Build long-running Lambda workflows with automatic checkpointing via `StepAsync`, `WaitAsync`, `RunInChildContextAsync`, `CreateCallbackAsync`, and `WaitForCallbackAsync` on `IDurableContext`." + ] + } + ] +} \ No newline at end of file diff --git a/.autover/changes/durable-mapasync.json b/.autover/changes/durable-mapasync.json new file mode 100644 index 000000000..412e09055 --- /dev/null +++ b/.autover/changes/durable-mapasync.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Patch", + "ChangelogMessages": [ + "Add `MapAsync` to `IDurableContext` for processing a collection in parallel with one child context per item and automatic checkpointing. Supports configurable max concurrency, completion policy, and per-item naming via `MapConfig`, returning an `IBatchResult`." + ] + } + ] +} diff --git a/.autover/changes/durable-parallelasync.json b/.autover/changes/durable-parallelasync.json new file mode 100644 index 000000000..2adf78331 --- /dev/null +++ b/.autover/changes/durable-parallelasync.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Patch", + "ChangelogMessages": [ + "Add `ParallelAsync` to `IDurableContext` for running multiple workflow branches concurrently with automatic checkpointing. Supports configurable max concurrency, failure tolerance, and first-successful completion via `ParallelConfig`, returning an `IBatchResult`." 
+ ] + } + ] +} diff --git a/.autover/changes/ef22a418-1be3-4359-a442-f086667635ec.json b/.autover/changes/ef22a418-1be3-4359-a442-f086667635ec.json new file mode 100644 index 000000000..652d2bca2 --- /dev/null +++ b/.autover/changes/ef22a418-1be3-4359-a442-f086667635ec.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Patch", + "ChangelogMessages": [ + "Add WaitForConditionAsync polling primitive: IDurableContext.WaitForConditionAsync, IConditionCheckContext, WaitForConditionConfig, IWaitStrategy, WaitDecision, WaitStrategy factory (Exponential/Linear/Fixed/FromDelegate), and WaitForConditionException with AttemptsExhausted and LastState" + ] + } + ] +} diff --git a/.gitignore b/.gitignore index 1caae6fe4..f86678d7a 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,6 @@ global.json **/cdk.out/** **/.DS_Store + +# JetBrains Rider per-project cache +**/*.lscache diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 31e288af2..63777c644 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,6 +85,7 @@ The available projects are: * Amazon.Lambda.ConfigEvents * Amazon.Lambda.ConnectEvents * Amazon.Lambda.Core +* Amazon.Lambda.DurableExecution * Amazon.Lambda.DynamoDBEvents * Amazon.Lambda.DynamoDBEvents.SDK.Convertor * Amazon.Lambda.KafkaEvents diff --git a/Docs/durable-execution-design.md b/Docs/durable-execution-design.md new file mode 100644 index 000000000..c5c4da089 --- /dev/null +++ b/Docs/durable-execution-design.md @@ -0,0 +1,2348 @@ +# .NET Lambda Durable Execution SDK Design + +## Table of Contents + +- [Overview](#overview) +- [Motivation](#motivation) +- [How Durable Execution Works](#how-durable-execution-works) +- [User Experience](#user-experience) + - [Quick Start](#quick-start) + - [Steps](#steps) + - [Wait Operations](#wait-operations) + - [Callbacks](#callbacks) + - [Invoke (Chained Functions)](#invoke-chained-functions) + - [Parallel Execution](#parallel-execution) + - [Map 
Operations](#map-operations) + - [Child Contexts](#child-contexts) + - [Error Handling & Retry](#error-handling--retry) + - [Logging](#logging) +- [Internals](#internals) +- [API Reference](#api-reference) + - [IDurableContext](#idurablecontext) + - [Configuration Types](#configuration-types) + - [Result Types](#result-types) + - [Exception Types](#exception-types) +- [Serialization](#serialization) +- [Integration with Existing Libraries](#integration-with-existing-libraries) +- [Testing](#testing) +- [Local development (Test Tool v2 and Aspire)](#local-development-test-tool-v2-and-aspire) +- [Requirements & Constraints](#requirements--constraints) +- [Package Structure](#package-structure) +- [Implementation plan](#implementation-plan) +- [Cross-SDK API comparison](#cross-sdk-api-comparison) +- [Common Patterns](#common-patterns) + +--- + +## Overview + +Lambda Durable Functions let you write multi-step workflows that persist state automatically. They can run for days or months, survive failures, and you only pay for actual compute time. + +This doc covers the **.NET Durable Execution SDK** (`Amazon.Lambda.DurableExecution`). SDKs already exist for [Python](https://github.com/aws/aws-durable-execution-sdk-python) and [JavaScript/TypeScript](https://github.com/aws/aws-durable-execution-sdk-js). + +Related: [GitHub Issue #2216](https://github.com/aws/aws-lambda-dotnet/issues/2216) + +--- + +## Motivation + +### The problem + +Today, building multi-step Lambda workflows in .NET requires one of: + +1. **Step Functions** -- a separate service with its own state machine language (ASL), adding latency between steps and forcing you to learn a second programming model. +2. **Manual state management** -- rolling your own checkpointing with DynamoDB or S3, plus retry logic, idempotency keys, and resumption code. +3. **Event-driven choreography** -- chaining functions through SQS/SNS/EventBridge, scattering a single workflow's logic across half a dozen Lambda functions. 
+ +All three push infrastructure concerns into your business logic. The code gets harder to read and test, and nobody wants to inherit it. + +### What durable functions do instead + +With this SDK, you write sequential code and the runtime handles persistence: +- Checkpoints each step's result +- Suspends when waiting (no compute charges while idle) +- Resumes from the last checkpoint on the next invocation +- Retries failed steps with configurable backoff +- Waits for callbacks from external systems + +Your function reads like a normal async method. The SDK deals with state, replay, and recovery. + +### Why build a .NET SDK + +.NET has a large Lambda user base, especially in enterprise shops running order processing, document pipelines, and (increasingly) AI agent workflows. Today those teams either use Step Functions or build custom state machines. A native .NET SDK removes that tradeoff. + +--- + +## How Durable Execution Works + +### The replay model + +Durable functions use a replay-based execution model. Every invocation runs your code from the top, but previously completed steps return their cached result instead of re-executing. + +1. Lambda invokes your function with a `DurableExecutionInvocationInput` containing: + - `DurableExecutionArn` -- unique execution identifier + - `CheckpointToken` -- for optimistic concurrency + - `InitialExecutionState` -- previously checkpointed operations + +2. Your function code runs **from the beginning** on every invocation. + +3. When a **step** is encountered: + - Previously completed → return cached result (no re-execution) + - New → execute it, checkpoint the result, continue + +4. When a **wait** is encountered: + - Already elapsed → continue + - Still pending → return `PENDING`, Lambda terminates, service re-invokes later + +5. 
The function returns one of: + - `SUCCEEDED` -- workflow completed + - `FAILED` -- workflow failed + - `PENDING` -- workflow suspended (waiting for time or callback) + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ First Invocation (t=0s) │ +│ │ +│ handler(event, context) │ +│ │ │ +│ ├─► context.StepAsync(FetchData) → executes, checkpoints │ +│ │ │ +│ ├─► context.WaitAsync(30 seconds) → returns PENDING │ +│ │ │ +│ └── (Lambda terminates, environment recyclable) │ +└─────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────┐ +│ Second Invocation (t=30s) │ +│ │ +│ handler(event, context) │ +│ │ │ +│ ├─► context.StepAsync(FetchData) → returns cached result │ +│ │ │ +│ ├─► context.WaitAsync(30 seconds) → already elapsed, skip │ +│ │ │ +│ ├─► context.StepAsync(ProcessData) → executes, checkpoints │ +│ │ │ +│ └── return result → SUCCEEDED │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## User Experience + +### Quick Start + +#### Installation + +```shell +dotnet add package Amazon.Lambda.DurableExecution +``` + +#### Minimal Example + +```csharp +using Amazon.Lambda.Annotations; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; + +[assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.SystemTextJson.DefaultLambdaJsonSerializer))] + +namespace MyDurableFunction; + +public class Function +{ + [LambdaFunction] + [DurableExecution] + public async Task Handler(OrderEvent input, IDurableContext context) + { + // Step 1: Validate the order (checkpointed automatically) + var validation = await context.StepAsync( + async (step) => await ValidateOrder(input.OrderId), + name: "validate_order"); + + if (!validation.IsValid) + return new OrderResult { Status = "rejected" }; + + // Step 2: Wait for processing (Lambda is NOT running during this time) + await context.WaitAsync(TimeSpan.FromSeconds(30), name: 
"processing_delay"); + + // Step 3: Process the order + var result = await context.StepAsync( + async (step) => await ProcessOrder(input.OrderId), + name: "process_order"); + + return new OrderResult { Status = "approved", OrderId = result.OrderId }; + } + + private async Task ValidateOrder(string orderId) { /* ... */ } + private async Task ProcessOrder(string orderId) { /* ... */ } +} +``` + +Things to notice: +- `[LambdaFunction]` + `[DurableExecution]` triggers source generation, so you don't wire up the handler yourself +- Each step function receives an `IStepContext` with a step-scoped logger, attempt number, and operation ID +- Each `StepAsync` call checkpoints its result automatically +- `WaitAsync` suspends the function -- Lambda is not running (or billing you) during the wait +- On replay, completed steps return their cached result without re-executing +- The generated wrapper handles checkpoint batching and cleanup + +#### Manual Handler (Without Annotations) + +If you don't use `Amazon.Lambda.Annotations`, use `DurableFunction.WrapAsync` — a static helper (inspired by [OpenTelemetry's `AWSLambdaWrapper.TraceAsync`](https://github.com/open-telemetry/opentelemetry-dotnet-contrib/tree/main/src/OpenTelemetry.Instrumentation.AWSLambda#lambda-function)) that handles the entire durable execution envelope for you: + +```csharp +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; + +[assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.SystemTextJson.DefaultLambdaJsonSerializer))] + +namespace MyDurableFunction; + +public class Function +{ + public Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync(MyWorkflow, invocationInput, context); + + private async Task MyWorkflow(OrderEvent input, IDurableContext context) + { + var validation = await context.StepAsync( + async (step) => await ValidateOrder(input.OrderId), + name: "validate_order"); + + if 
(!validation.IsValid) + return new OrderResult { Status = "rejected" }; + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "processing_delay"); + + var result = await context.StepAsync( + async (step) => await ProcessOrder(input.OrderId), + name: "process_order"); + + return new OrderResult { Status = "approved", OrderId = result.OrderId }; + } + + private async Task ValidateOrder(string orderId) { /* ... */ } + private async Task ProcessOrder(string orderId) { /* ... */ } +} +``` + +`DurableFunction.WrapAsync` handles all the plumbing: +- Hydrates `ExecutionState` from `invocationInput.InitialExecutionState` +- Extracts the user payload from the service envelope +- Runs the workflow through `DurableExecutionHandler.RunAsync` +- Constructs and returns the `DurableExecutionInvocationOutput` envelope (status mapping, JSON serialization) +- Sets execution environment tracking + +For workflows that return no value, use the single-type-parameter overload: + +```csharp +public Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync(MyWorkflow, invocationInput, context); + +private async Task MyWorkflow(OrderEvent input, IDurableContext context) +{ + await context.StepAsync(async (step) => await SendNotification(input.UserId), name: "notify"); + await context.WaitAsync(TimeSpan.FromHours(1), name: "cooldown"); + await context.StepAsync(async (step) => await Cleanup(input.UserId), name: "cleanup"); +} +``` + +For **NativeAOT** deployments, register an AOT-aware `ILambdaSerializer` with the Lambda runtime. `WrapAsync` reads the registered serializer from `ILambdaContext.Serializer` and uses it for both envelope and step-checkpoint (de)serialization — there is no per-call `JsonSerializerContext` argument, and AOT and reflection callers share the same `WrapAsync` overloads. 
+ +In the class library programming model, register via the assembly attribute: + +```csharp +[assembly: LambdaSerializer(typeof(SourceGeneratorLambdaJsonSerializer<MyJsonContext>))] + +// The user's context must include the wire-envelope types (the typed handler +// signature is DurableExecutionInvocationInput → DurableExecutionInvocationOutput, +// so Lambda's runtime needs to deserialize them with this serializer) plus every +// TInput/TOutput/step-result POCO the workflow uses. +[JsonSerializable(typeof(DurableExecutionInvocationInput))] +[JsonSerializable(typeof(DurableExecutionInvocationOutput))] +[JsonSerializable(typeof(OrderEvent))] +[JsonSerializable(typeof(OrderResult))] +public partial class MyJsonContext : JsonSerializerContext { } + +public class Function +{ + public Task<DurableExecutionInvocationOutput> FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync<OrderEvent, OrderResult>(MyWorkflow, invocationInput, context); + + private async Task<OrderResult> MyWorkflow(OrderEvent input, IDurableContext context) + { + // ... + } +} +``` + +In an executable / custom-runtime deployment, pass the serializer to `LambdaBootstrapBuilder.Create(handler, serializer)` instead of using the assembly attribute — `RuntimeSupport` will propagate it onto `ILambdaContext.Serializer` for the SDK to pick up. 
+ +To inject a custom `IAmazonLambda` client (e.g., for VPC endpoints or unit testing), use the overload that accepts one: + +```csharp +public class Function +{ + private readonly IAmazonLambda _lambdaClient; + + public Function(IAmazonLambda lambdaClient) => _lambdaClient = lambdaClient; + + public Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync( + MyWorkflow, invocationInput, context, _lambdaClient); +} +``` + +You'd also need to manually configure the CloudFormation template with `DurableConfig` and managed policies: + +```json +{ + "Resources": { + "MyFunction": { + "Type": "AWS::Serverless::Function", + "Properties": { + "Handler": "MyDurableFunction::MyDurableFunction.Function::FunctionHandler", + "Policies": [ + "AWSLambdaBasicExecutionRole", + "AWSLambdaBasicDurableExecutionRolePolicy" + ], + "DurableConfig": { + "Enabled": true + } + } + } + } +} +``` + +##### What WrapAsync does internally + +For reference, here's the expanded version of what `DurableFunction.WrapAsync` eliminates — this is effectively what the source generator produces for the Annotations path: + +```csharp +public async Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext) +{ + // 1. Hydrate execution state from previously checkpointed operations + var state = new ExecutionState(); + state.LoadFromCheckpoint(invocationInput.InitialExecutionState); + + // 2. Extract user payload from the service envelope (internal) + var userPayload = ExtractUserPayload(invocationInput); + + // 3. Run the user's workflow via DurableExecutionHandler.RunAsync + var result = await DurableExecutionHandler.RunAsync( + state, + async (durableContext) => await MyWorkflow(userPayload, durableContext), + invocationInput.DurableExecutionArn); + + // 4. 
Construct and return the service output envelope + return new DurableExecutionInvocationOutput + { + Status = result.Status, + Result = result.Status == InvocationStatus.Succeeded + ? JsonSerializer.Serialize(result.Result) + : null, + ErrorMessage = result.Message + }; +} +``` + +Key differences between `WrapAsync` and the Annotations approach: +- `WrapAsync` still requires you to define the Lambda entry point signature (`DurableExecutionInvocationInput` → `DurableExecutionInvocationOutput`) +- You configure `DurableConfig` + managed policies in your CloudFormation template manually (not generated) +- No `[LambdaFunction]` or `[DurableExecution]` attributes needed + +With `[LambdaFunction] + [DurableExecution]`, even the entry point and CloudFormation config are generated at compile time — you just write the workflow method. + +--- + +### Steps + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/step.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/step-handler/step-handler.ts) + +A step runs your code and checkpoints the result. On replay, the cached result comes back without re-executing. Each step function receives an `IStepContext` with a step-scoped logger and attempt metadata. 
+ +```csharp +// Basic step +var result = await context.StepAsync(async (step) => await CallExternalApi()); + +// Named step (recommended for debugging/testing) +var user = await context.StepAsync( + async (step) => await FetchUser(userId), + name: "fetch_user"); + +// Using the step-scoped logger (includes step name, attempt number, operation ID) +var order = await context.StepAsync( + async (step) => + { + step.Logger.LogInformation("Fetching order {OrderId}", orderId); + return await orderService.GetOrder(orderId); + }, + name: "get_order"); + +// Step with configuration +var payment = await context.StepAsync( + async (step) => await chargeCard(amount), + name: "charge_card", + config: new StepConfig + { + Semantics = StepSemantics.AtMostOncePerRetry, + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 3, initialDelay: TimeSpan.FromSeconds(1)) + }); +``` + +#### Step Semantics + +| Semantics | Behavior | Use Case | +|-----------|----------|----------| +| `AtLeastOncePerRetry` (default) | Step re-executes on each retry | Idempotent operations (calculations, reads) | +| `AtMostOncePerRetry` | Step executes at most once per retry | Side effects (payments, emails, writes) | + +--- + +### Wait Operations + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/wait.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/wait-handler/wait-handler.ts) + +Waits suspend the function without consuming compute time. Lambda can recycle the execution environment. 
+ +```csharp +// Wait for a specific duration +await context.WaitAsync(TimeSpan.FromSeconds(30)); +await context.WaitAsync(TimeSpan.FromMinutes(5), name: "cooldown"); +await context.WaitAsync(TimeSpan.FromHours(24), name: "daily_check"); +await context.WaitAsync(TimeSpan.FromDays(7), name: "weekly_reminder"); +``` + +> **Validation:** The duration must be at least 1 second. Values less than 1 second throw `ArgumentOutOfRangeException`. Sub-second precision is truncated to whole seconds (the underlying service operates at second granularity). + +#### Wait For Condition + +`WaitForConditionAsync` polls a user-supplied check function until a configured `IWaitStrategy` decides to stop. Between polls the workflow is suspended (no compute charge); the service re-invokes when the strategy's chosen delay elapses. The check function receives the state from the previous iteration, so users can carry per-poll bookkeeping inside the state itself. + +```csharp +// Poll until an order's status reaches a terminal value. +var finalStatus = await context.WaitForConditionAsync( + check: async (state, ctx) => + { + ctx.Logger.LogInformation("Polling order on attempt {Attempt}", ctx.AttemptNumber); + return await orderService.GetStatusAsync(orderId); + }, + config: new WaitForConditionConfig + { + InitialState = OrderStatus.Unknown, + WaitStrategy = WaitStrategy.Exponential( + isDone: s => s == OrderStatus.Completed || s == OrderStatus.Cancelled) + }, + name: "wait_for_order_settle"); +``` + +Built-in strategies live on the `WaitStrategy` factory (`Exponential`, `Linear`, `Fixed`, plus `FromDelegate`) and all accept an optional `isDone` predicate so the common case stays declarative. When the strategy hits its `maxAttempts` limit it throws `WaitForConditionException` (carrying `AttemptsExhausted` and `LastState`); when the check function itself throws, the operation surfaces a `StepException` with the original error type. 
State is checkpointed per-iteration in the operation's payload so polling survives Lambda re-invocations deterministically. + +--- + +### Callbacks + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/callback.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/callback-handler/callback.ts) + +Callbacks let your workflow pause until an external system responds (human approval, a webhook, a third-party API). + +#### Create a Callback (Advanced) + +```csharp +// Create a callback and get the callback ID +var callback = await context.CreateCallbackAsync( + name: "approval_callback", + config: new CallbackConfig + { + Timeout = TimeSpan.FromHours(24), + HeartbeatTimeout = TimeSpan.FromHours(2) + }); + +// Send the callback ID to an external system +await context.StepAsync( + async (step) => await SendApprovalEmail(callback.CallbackId, recipientEmail), + name: "send_approval_email"); + +// Wait for the external system to respond +var result = await callback.GetResultAsync(); +``` + +#### Wait For Callback (Simple) + +```csharp +// Combined pattern: create callback, submit to external system, wait for result +var approval = await context.WaitForCallbackAsync( + async (callbackId, ctx) => + { + await SendApprovalEmail(callbackId, managerEmail); + }, + name: "wait_for_approval", + config: new WaitForCallbackConfig + { + Timeout = TimeSpan.FromHours(24), + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 3) + }); + +if (approval.Approved) +{ + await context.StepAsync(async (step) => await ExecutePlan(), name: "execute"); +} +``` + +**Example `SendApprovalEmail` stub:** +```csharp +private async Task SendApprovalEmail(string callbackId, string recipientEmail) +{ + // Include the callbackId in the approval link so the external system + // can complete the callback via the AWS API + var approvalLink = 
$"https://my-app.example.com/approve?callbackId={callbackId}"; + await emailService.SendAsync(recipientEmail, "Approval Required", $"Please approve: {approvalLink}"); +} +``` + +**External system completes the callback via AWS API:** +```bash +aws lambda send-durable-execution-callback-success \ + --function-name my-function:1 \ + --callback-id "cb-12345" \ + --payload '{"approved": true, "approver": "jane@example.com"}' +``` + +--- + +### Invoke (Chained Functions) + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/invoke.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/invoke-handler/invoke-handler.ts) + +Call another durable function. The invocation is checkpointed, so it survives failures and won't double-fire. + +```csharp +// Invoke another durable function +var paymentResult = await context.InvokeAsync( + functionName: "arn:aws:lambda:us-east-1:123456789012:function:payment-processor:prod", + payload: new PaymentRequest { Amount = 100, Currency = "USD" }, + name: "process_payment", + config: new InvokeConfig + { + TenantId = "tenant-42" + }); +``` + +> **Note:** Durable function invocations require **qualified identifiers** — include a version number, alias, or `$LATEST`: +> - ✅ `arn:aws:lambda:us-east-1:123456789012:function:payment-processor:prod` (alias) +> - ✅ `arn:aws:lambda:us-east-1:123456789012:function:payment-processor:42` (version) +> - ✅ `arn:aws:lambda:us-east-1:123456789012:function:payment-processor:$LATEST` +> - ❌ `arn:aws:lambda:us-east-1:123456789012:function:payment-processor` (unqualified — not supported) + +--- + +### Parallel Execution + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/parallel.py) | 
[JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/parallel-handler/parallel-handler.ts) + +Run independent operations concurrently. The JS SDK uses a `DurablePromise` pattern where operations are deferred until awaited; in .NET that isn't necessary because `ParallelAsync` and `MapAsync` cover the same use case idiomatically. `Task`-returning methods start immediately and `await` retrieves the result, so there's no gap to fill with a lazy wrapper. + +> **Prefer `ParallelAsync` over `Task.WhenAll`:** While `Task.WhenAll` works correctly with durable operations (operation IDs are allocated deterministically), it bypasses completion policies, concurrency limits, branch naming, and `IBatchResult` structured output. Always use `ParallelAsync` or `MapAsync` for concurrent durable operations. A future Roslyn analyzer (DE004) will flag `Task.WhenAll` usage with durable tasks and suggest `ParallelAsync` as a replacement. + +```csharp +// Run multiple operations in parallel +var results = await context.ParallelAsync( + new Func<IDurableContext, Task<object>>[] + { + async (ctx) => await ctx.StepAsync(async (step) => await FetchUserData(userId), name: "fetch_user"), + async (ctx) => await ctx.StepAsync(async (step) => await FetchOrderHistory(userId), name: "fetch_orders"), + async (ctx) => await ctx.StepAsync(async (step) => await FetchPreferences(userId), name: "fetch_prefs"), + }, + name: "parallel_fetch", + config: new ParallelConfig + { + MaxConcurrency = 3, + CompletionConfig = CompletionConfig.AllSuccessful() + }); + +// Access individual results +var userData = results.GetResults()[0]; +var orderHistory = results.GetResults()[1]; +var preferences = results.GetResults()[2]; +``` + +#### Named Parallel Branches + +For better observability, you can name individual branches (matching the JS SDK pattern): + +```csharp +// Named branches for easier debugging and testing +var results = await context.ParallelAsync( + new 
DurableBranch[] + { + new("fetch_user", async (ctx) => await ctx.StepAsync(async (step) => await FetchUserData(userId))), + new("fetch_orders", async (ctx) => await ctx.StepAsync(async (step) => await FetchOrderHistory(userId))), + new("fetch_prefs", async (ctx) => await ctx.StepAsync(async (step) => await FetchPreferences(userId))), + }, + name: "parallel_fetch"); + +// In tests, you can find specific branches by name +var fetchUserBranch = result.GetOperation("fetch_user"); +``` + +#### Completion Configurations + +`ParallelAsync` and `MapAsync` accept a `CompletionConfig` to control when the overall operation is considered complete: + +```csharp +// All must succeed (default) +CompletionConfig.AllSuccessful() + +// Complete when any one succeeds +CompletionConfig.FirstSuccessful() + +// Complete when all finish (regardless of success/failure) +CompletionConfig.AllCompleted() + +// Custom: succeed if at least 3 succeed, tolerate up to 2 failures +new CompletionConfig +{ + MinSuccessful = 3, + ToleratedFailureCount = 2 +} +``` + +--- + +### Map Operations + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/map.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/map-handler/map-handler.ts) + +Process a collection in parallel with configurable concurrency. The `items` parameter accepts any `IReadOnlyList` (arrays, lists, etc.). 
+ +```csharp +var orders = new[] { "order-1", "order-2", "order-3", "order-4", "order-5" }; + +var results = await context.MapAsync( + items: orders, // IReadOnlyList + func: async (ctx, orderId, index, allItems) => + { + return await ctx.StepAsync( + async () => await ProcessOrder(orderId), + name: $"process_order_{index}"); + }, + name: "process_all_orders", + config: new MapConfig + { + MaxConcurrency = 3, + CompletionConfig = CompletionConfig.AllSuccessful(), + ItemNamer = (orderId, index) => $"Order-{orderId}" // Readable names for observability + }); + +// Check results +results.ThrowIfError(); // Throws if any item failed +var processedOrders = results.GetResults(); +``` + +--- + +### Child Contexts + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/child.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/run-in-child-context-handler/run-in-child-context-handler.ts) + +Child contexts group related durable operations into a sub-workflow. Use them when you need waits or multiple steps inside a logical unit (you cannot nest durable calls inside a step directly). + +```csharp +// Group operations into a child context +var enrichedData = await context.RunInChildContextAsync( + async (childCtx) => + { + var validated = await childCtx.StepAsync( + async () => await Validate(data), + name: "validate"); + + await childCtx.WaitAsync(TimeSpan.FromSeconds(1), name: "rate_limit"); + + var enriched = await childCtx.StepAsync( + async () => await Enrich(validated), + name: "enrich"); + + return enriched; + }, + name: "validation_phase"); + +// Use the enriched data in a subsequent step +var finalResult = await context.StepAsync( + async () => await SubmitEnrichedData(enrichedData), + name: "submit"); +``` + +> **Why child contexts?** You cannot nest durable operations inside a step. 
Steps are leaf operations. If you need multiple durable operations grouped together, use a child context. + +--- + +### Error Handling & Retry + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/retries.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/utils/retry/retry-config/index.ts) + +#### Retry Strategies + +```csharp +// Exponential backoff with jitter +var result = await context.StepAsync( + async () => await CallUnreliableApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 5, + initialDelay: TimeSpan.FromSeconds(1), + maxDelay: TimeSpan.FromSeconds(60), + backoffRate: 2.0, + jitter: JitterStrategy.Full) + }); + +// Using presets +var result = await context.StepAsync( + async () => await CallApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Default // 6 attempts, 2x backoff, 5s initial, Full jitter + }); + +// Available presets: +// RetryStrategy.None — maxAttempts: 1 (no retry) +// RetryStrategy.Default — 6 attempts, 2x backoff, 5s initial delay, Full jitter +// RetryStrategy.Transient — 3 attempts, 2x backoff, 1s initial delay, Full jitter + +// Custom retry strategy +var result = await context.StepAsync( + async () => await CallApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = new CustomRetryStrategy((exception, attemptCount) => + { + // Only retry transient errors + if (exception is HttpRequestException httpEx && httpEx.StatusCode >= 500) + return RetryDecision.RetryAfter(TimeSpan.FromSeconds(Math.Pow(2, attemptCount))); + + return RetryDecision.DoNotRetry(); + }) + }); + +// Retry with specific exception types +var result = await context.StepAsync( + async () => await CallApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + 
maxAttempts: 3, + retryableExceptions: new[] { typeof(TimeoutException), typeof(HttpRequestException) }) + }); + +// Retry with message pattern matching (regex) +var result = await context.StepAsync( + async () => await CallApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + retryableExceptions: new[] { typeof(HttpRequestException) }, + retryableMessagePatterns: new[] { "timeout", "throttl", "5\\d{2}" }) + }); +``` + +#### Jitter Strategies + +Jitter prevents thundering-herd scenarios where multiple retrying clients converge on the same backoff schedule. The SDK supports three jitter strategies: + +```csharp +public enum JitterStrategy +{ + /// No randomization — delay is exactly the calculated backoff value. + None, + + /// Random delay between 0 and the calculated backoff value (recommended). + Full, + + /// Random delay between 50% and 100% of the calculated backoff value. + Half +} +``` + +The default jitter for `RetryStrategy.Exponential()` is `JitterStrategy.Full`. All built-in presets (`RetryStrategy.Default`, `RetryStrategy.Transient`) also use `JitterStrategy.Full`. Use `JitterStrategy.None` only when you need deterministic retry timing (e.g., for testing). + +#### Retry Strategy Interface + +```csharp +public interface IRetryStrategy +{ + RetryDecision ShouldRetry(Exception exception, int attemptNumber); +} + +public record RetryDecision +{ + public bool ShouldRetry { get; } + public TimeSpan Delay { get; } + + public static RetryDecision DoNotRetry() => new() { ShouldRetry = false }; + public static RetryDecision RetryAfter(TimeSpan delay) => new() { ShouldRetry = true, Delay = delay }; +} +``` + +`IRetryStrategy` supports implicit conversion from `Func<Exception, int, RetryDecision>`, enabling inline lambdas: + +```csharp +config: new StepConfig +{ + RetryStrategy = (ex, attempt) => + attempt < 3 && ex is HttpRequestException + ? 
RetryDecision.RetryAfter(TimeSpan.FromSeconds(Math.Pow(2, attempt))) + : RetryDecision.DoNotRetry() +} +``` + +#### Saga Pattern (Compensating Transactions) + +```csharp +[DurableExecution] +public async Task Handler(BookingRequest input, IDurableContext context) +{ + var compensations = new List<(string Name, Func Action)>(); + + try + { + var flight = await context.StepAsync( + async () => await BookFlight(input), + name: "book_flight"); + compensations.Add(("cancel_flight", async () => await CancelFlight(flight.Id))); + + var hotel = await context.StepAsync( + async () => await BookHotel(input), + name: "book_hotel"); + compensations.Add(("cancel_hotel", async () => await CancelHotel(hotel.Id))); + + var car = await context.StepAsync( + async () => await BookCar(input), + name: "book_car"); + compensations.Add(("cancel_car", async () => await CancelCar(car.Id))); + + return new BookingResult { Status = "confirmed" }; + } + catch (Exception ex) + { + // Execute compensations in reverse order + foreach (var (name, action) in compensations.AsEnumerable().Reverse()) + { + await context.StepAsync(action, name: name); + } + return new BookingResult { Status = "cancelled", Error = ex.Message }; + } +} +``` + +--- + +### Logging + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/logger.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/utils/logger/logger.ts) + +`context.Logger` is replay-aware: it suppresses duplicate messages that would otherwise repeat on every invocation. Use it instead of `Console.WriteLine`. + +> **Implementation note:** The replay-aware logger is implemented entirely in the durable execution SDK. During replay, the SDK tracks which operations are being restored from checkpoint state vs. executing for the first time, and suppresses log output for replayed operations. 
No changes to `Amazon.Lambda.RuntimeSupport` or the Lambda Runtime API are required. + +```csharp +[DurableExecution] +public async Task Handler(MyEvent input, IDurableContext context) +{ + // ✅ Replay-safe: only logs once even during replay + context.Logger.LogInformation("Starting workflow for {OrderId}", input.OrderId); + + var result = await context.StepAsync( + async () => await ProcessData(input.Data), + name: "process_data"); + + // ✅ Replay-safe + context.Logger.LogInformation("Processing complete: {Result}", result); + + // ❌ NOT replay-safe: will log on every replay + Console.WriteLine("This will repeat!"); + + return result; +} +``` + +The logger integrates with `Microsoft.Extensions.Logging`: + +```csharp +// context.Logger implements ILogger +context.Logger.LogDebug("Debug info"); +context.Logger.LogInformation("Info message"); +context.Logger.LogWarning("Warning: {Detail}", detail); +context.Logger.LogError(exception, "Error occurred"); +``` + +#### Custom Logger Configuration + +You can swap the logger or disable replay-aware filtering (e.g., to see logs during replay for debugging): + +```csharp +// Use a custom logger (e.g., Serilog, AWS Lambda Powertools) +context.ConfigureLogger(new LoggerConfig +{ + CustomLogger = myCustomLogger, + ModeAware = true // true = suppress during replay (default), false = always log +}); + +// Disable replay-aware filtering to see ALL logs (useful for debugging) +context.ConfigureLogger(new LoggerConfig { ModeAware = false }); +``` + +--- + +## Internals + +### AWS APIs used + +| API | Purpose | +|-----|---------| +| `CheckpointDurableExecution` | Persist operation state (step results, waits, etc.) 
| +| `GetDurableExecutionState` | Retrieve previously checkpointed state on replay | +| `SendDurableExecutionCallbackSuccess` | External systems signal callback completion | +| `SendDurableExecutionCallbackFailure` | External systems signal callback failure | +| `SendDurableExecutionCallbackHeartbeat` | External systems send heartbeat signals | + +### How suspension works internally + +This follows the same pattern as the JavaScript SDK's `Promise.race`. The .NET equivalent is `Task.WhenAny`. + +When `RunAsync` starts, it kicks off two tasks in parallel: user code and a termination signal (a `TaskCompletionSource` that starts unresolved). Whoever finishes first wins: + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ DurableExecutionHandler.RunAsync │ +│ │ +│ var userTask = userHandler(context); │ +│ var terminationTask = terminationManager.TerminationTask; │ +│ │ +│ var winner = await Task.WhenAny(userTask, terminationTask); │ +│ │ +│ ┌─── userTask ───────────────────┐ ┌─── terminationTask ────────┐ │ +│ │ StepAsync("fetch") → execute │ │ (unresolved TCS - waiting) │ │ +│ │ WaitAsync("delay") → ... │ │ │ │ +│ │ calls Terminate() ──────────────► SetResult() → resolves! │ │ +│ │ awaits forever (blocked) │ │ │ │ +│ └────────────────────────────────┘ └────────────────────────────┘ │ +│ │ +│ winner == terminationTask → return PENDING │ +│ (userTask is abandoned, GC collects it) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +The `TerminationManager` is a thin wrapper around `TaskCompletionSource`: +- `TerminationTask` -- a Task that hangs forever until `Terminate()` is called +- `Terminate(reason)` -- resolves the TCS, causing the race to pick termination + +When user code hits a pending wait or callback: +1. It checkpoints the operation state +2. Calls `terminationManager.Terminate(WaitScheduled)` +3. Awaits a new never-completing `TaskCompletionSource` (blocks itself permanently) +4. 
`Task.WhenAny` sees the termination task resolved and picks it as the winner +5. `RunAsync` returns PENDING; the abandoned user task is left to be GC'd; Lambda terminates + +### Lifecycle and cleanup + +`RunAsync` manages the full lifecycle internally. When the handler completes (SUCCEEDED/FAILED) or suspends (PENDING), `RunAsync` stops the background checkpoint batcher, flushes any pending checkpoint operations, and disposes internal state. Users never call `Dispose` or wrap anything in `await using`. + +--- + +## API Reference + +### DurableFunction + +Static helper for the non-Annotations handler path. Wraps a workflow function, handling all envelope translation between `DurableExecutionInvocationInput`/`DurableExecutionInvocationOutput` and user types. + +```csharp +/// +/// Static helper that wraps a durable workflow function, handling all envelope +/// translation between DurableExecutionInvocationInput/Output and user types. +/// +/// All four overloads dispatch through the ILambdaSerializer registered on +/// ILambdaContext.Serializer, so AOT-safe and reflection-based callers share a +/// single code path. Callers wire AOT support by registering an AOT-aware +/// serializer with the runtime (e.g., SourceGeneratorLambdaJsonSerializer<TContext>) +/// — there is no per-call JsonSerializerContext argument. +/// +public static class DurableFunction +{ + /// + /// Wrap a workflow (typed input + output). + /// + public static Task WrapAsync( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext); + + /// + /// Wrap a workflow (typed input + output) with explicit Lambda client. + /// + public static Task WrapAsync( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient); + + /// + /// Wrap a void workflow (typed input, no output). 
+ /// + public static Task WrapAsync( + Func workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext); + + /// + /// Wrap a void workflow with explicit Lambda client. + /// + public static Task WrapAsync( + Func workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient); +} +``` + +`WrapAsync` requires an `ILambdaSerializer` on `ILambdaContext.Serializer`. If none is registered the helper throws `InvalidOperationException` with a message that points at the three places to register one (assembly attribute, `LambdaBootstrapBuilder.Create`, or `TestLambdaContext.Serializer` for tests). + +### IDurableContext + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/context.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/types/durable-context.ts) + +The primary interface developers interact with: + +```csharp +public interface IDurableContext +{ + /// + /// Replay-safe logger. Messages are de-duplicated during replay. + /// + ILogger Logger { get; } + + /// + /// Metadata about the current durable execution. + /// + IExecutionContext ExecutionContext { get; } + + /// + /// The underlying Lambda context. + /// + ILambdaContext LambdaContext { get; } + + // ── StepAsync overloads ──────────────────────────────────────────── + // The user's function always receives IStepContext, matching the + // Python and JS SDKs (Java has no-context overloads but deprecated + // them — see https://github.com/aws/aws-durable-execution-sdk-java). + // Step results are serialized via the ILambdaSerializer registered on + // ILambdaContext.Serializer. AOT and reflection callers share one + // overload — the AOT story is determined by the registered serializer. + + /// + /// Execute a step with automatic checkpointing. 
The IStepContext provides + /// a step-scoped logger with operation metadata (step name, attempt number, + /// operation ID) and the current attempt number. + /// + Task StepAsync( + Func> func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute a step that returns no value. + /// + Task StepAsync( + Func func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Suspend execution for the specified duration. + /// Throws ArgumentOutOfRangeException if duration is less than 1 second. + /// + Task WaitAsync( + TimeSpan duration, + string? name = null, + CancellationToken cancellationToken = default); + + /// + /// Create a callback for an external system to complete. Returns an + /// handle exposing the service-allocated + /// (pass to the external system) and + /// + /// (await to suspend until a result arrives). + /// + /// + /// The callback result is deserialized using the + /// registered on . AOT and reflection-based + /// scenarios share this single overload — the AOT story is determined by the + /// registered serializer (e.g., + /// SourceGeneratorLambdaJsonSerializer<TContext>). + /// + /// Errors are deferred to ; + /// CreateCallbackAsync always returns successfully so user code + /// between CreateCallbackAsync and the result-await runs deterministically + /// across replays. + /// + /// + Task> CreateCallbackAsync( + string? name = null, + CallbackConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Composite operation that creates a callback, runs the supplied submitter + /// (which hands the callbackId to an external system), and suspends + /// until the external system delivers a result. Equivalent to manually + /// composing + /// + + /// + + /// inside a child context. + /// + /// + /// Submitter failures (after retries are exhausted) surface as + /// . 
Callback failures and timeouts + /// surface as / + /// . + /// + Task WaitForCallbackAsync( + Func submitter, + string? name = null, + WaitForCallbackConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Invoke another durable function. + /// + Task InvokeAsync( + string functionName, + TPayload payload, + string? name = null, + InvokeConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute multiple operations in parallel (unnamed branches). + /// + Task> ParallelAsync( + IReadOnlyList>> functions, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute multiple named operations in parallel. Named branches appear in + /// execution traces and can be inspected by name in tests. + /// + Task> ParallelAsync( + IReadOnlyList> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Process a collection of items in parallel. + /// + Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Run operations in an isolated child context. + /// + Task RunInChildContextAsync( + Func> func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Poll until a condition is met. The check function returns the next + /// state on each invocation; the configured IWaitStrategy<TState> + /// decides whether to keep polling and how long to wait between calls. + /// State is serialized using the ILambdaSerializer registered on + /// ILambdaContext.Serializer (AOT and reflection-based scenarios + /// share this single overload — the AOT story is determined by the + /// registered serializer). + /// + Task WaitForConditionAsync( + Func> check, + WaitForConditionConfig config, + string? 
name = null, + CancellationToken cancellationToken = default); +} +``` + +#### Supporting Types + +```csharp +/// +/// Context passed to step functions. Provides step-scoped logging and metadata. +/// +public interface IStepContext +{ + /// + /// Logger scoped to this step. Includes step name, operation ID, and attempt + /// number in structured log metadata automatically. + /// + ILogger Logger { get; } + + /// + /// The current retry attempt number (1-based). + /// + int AttemptNumber { get; } + + /// + /// The deterministic operation ID for this step. + /// + string OperationId { get; } +} + +/// +/// Context passed to the submitter delegate of WaitForCallbackAsync. +/// Distinct from so the submitter API can evolve +/// independently. Mirrors WaitForCallbackContext in the Python and +/// JavaScript SDKs (logger-only surface). +/// +public interface IWaitForCallbackContext +{ + /// + /// Logger scoped to the submitter step (replay-safe). + /// + ILogger Logger { get; } +} + +/// +/// A named branch for parallel execution. Named branches appear in execution +/// traces and can be inspected by name in the test runner. +/// +public record DurableBranch(string Name, Func> Func); + +/// +/// Context passed to a WaitForCondition check function on every polling +/// iteration. Mirrors IStepContext minus OperationId (every iteration of a +/// wait-for-condition operation shares the same operation ID, so exposing +/// it here would be misleading). +/// +public interface IConditionCheckContext +{ + /// Logger scoped to this condition-check attempt. + ILogger Logger { get; } + + /// The current 1-based attempt number. + int AttemptNumber { get; } +} + +/// +/// Decides, per polling iteration, whether a WaitForConditionAsync operation +/// should keep polling and how long to wait. Implementations are typically +/// obtained via the WaitStrategy factory; users may also implement +/// directly. 
Built-in implementations throw WaitForConditionException +/// when their max-attempts limit is reached so the operation can produce a +/// failure with the last observed state. +/// +public interface IWaitStrategy +{ + WaitDecision Decide(TState state, int attemptNumber); +} + +/// +/// Decision returned by IWaitStrategy on each polling iteration. Stop() +/// indicates the condition has been met (the operation SUCCEEDs and returns +/// the latest state); ContinueAfter(delay) schedules the next poll. +/// +public readonly record struct WaitDecision +{ + public bool ShouldContinue { get; } + public TimeSpan Delay { get; } + public static WaitDecision Stop(); + public static WaitDecision ContinueAfter(TimeSpan delay); +} + +/// +/// Factory for built-in IWaitStrategy implementations. Each accepts an +/// optional isDone predicate so users can terminate polling declaratively +/// when the latest state satisfies a condition (e.g. state => state.IsReady) +/// without implementing IWaitStrategy themselves. Defaults are intentionally +/// tuned for polling, NOT retry-on-exception: 60 attempts / 5s initial / +/// 300s max / 1.5x backoff / Full jitter. +/// +public static class WaitStrategy +{ + public static IWaitStrategy Exponential(...); + public static IWaitStrategy Linear(...); + public static IWaitStrategy Fixed(TimeSpan delay, ...); + public static IWaitStrategy FromDelegate(Func strategy); +} +``` + +#### CancellationToken behavior + +All methods accept a per-call `CancellationToken` that follows standard .NET semantics: cancellation throws `OperationCanceledException` and the execution fails. Cancellation does **not** trigger suspension — those are separate concepts. + +The durable execution service handles timeout scenarios automatically: if Lambda terminates mid-execution, the next invocation simply replays from the last checkpoint. For advanced users who want to suspend gracefully before timeout, check `context.LambdaContext.RemainingTime` and return early. 
+ +### Configuration Types + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/config.py) | JavaScript: [step](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/types/step.ts) | [batch](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/types/batch.ts) + +```csharp +/// +/// Configuration for step execution. +/// +public class StepConfig +{ + /// + /// Retry strategy for failed steps. Default is no retry. + /// Accepts IRetryStrategy implementations (RetryStrategy.Exponential, etc.) + /// or an inline function via implicit conversion from + /// Func<Exception, int, RetryDecision>. + /// + public IRetryStrategy? RetryStrategy { get; set; } + + /// + /// Execution semantics. Default is AtLeastOncePerRetry. + /// + public StepSemantics Semantics { get; set; } = StepSemantics.AtLeastOncePerRetry; + + // Note: there is no Serializer property here. Step result serialization + // is delegated to the ILambdaSerializer registered on + // ILambdaContext.Serializer (assembly attribute or + // LambdaBootstrapBuilder.Create). +} + +public enum StepSemantics +{ + /// + /// Step re-executes on each retry attempt. Safe for idempotent operations. + /// + AtLeastOncePerRetry, + + /// + /// Step executes at most once per retry attempt. Use for side effects. + /// + AtMostOncePerRetry +} + +/// +/// Configuration for callback operations. +/// +public class CallbackConfig +{ + /// + /// Maximum time to wait for callback response. Default (TimeSpan.Zero) means no timeout. + /// + public TimeSpan Timeout { get; set; } = TimeSpan.Zero; + + /// + /// Maximum time between heartbeat signals before timeout. Default (TimeSpan.Zero) means no heartbeat timeout. + /// + public TimeSpan HeartbeatTimeout { get; set; } = TimeSpan.Zero; + + // Note: there is no Serializer property here. 
Callback result + // serialization flows through the ILambdaSerializer registered on + // ILambdaContext.Serializer, the same as StepAsync. +} + +/// +/// Configuration for wait-for-callback operations. +/// +public class WaitForCallbackConfig : CallbackConfig +{ + /// + /// Retry strategy for the submitter function. + /// + public IRetryStrategy? RetryStrategy { get; set; } +} + +/// +/// Configuration for invoke operations. +/// +public class InvokeConfig +{ + /// + /// Optional tenant identifier propagated to the chained invocation. + /// Matches the tenantId field on Python/JS/Java InvokeConfig. + /// + public string? TenantId { get; set; } + + // Note: there are no payload/result serializer properties here. Both + // flow through the ILambdaSerializer registered on + // ILambdaContext.Serializer, the same as StepAsync. +} + +/// +/// Controls how branches are represented in the checkpoint graph. +/// +public enum NestingType +{ + /// + /// Each branch creates a full isolated CONTEXT operation. Higher observability + /// in execution traces but more checkpoint operations (default). + /// + Nested, + + /// + /// Branches use virtual contexts sharing the parent. Reduces checkpoint cost + /// by ~30% at the expense of less granular execution traces. + /// + Flat +} + +/// +/// Configuration for parallel execution. +/// +public class ParallelConfig +{ + /// + /// Maximum concurrent branches. Null = unlimited. + /// + public int? MaxConcurrency { get; set; } + + /// + /// When to consider the operation complete. + /// + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful(); + + /// + /// How branches are represented in the checkpoint graph. + /// Nested = full isolated context per branch (default). + /// Flat = virtual contexts sharing parent (~30% fewer checkpoint operations). + /// + public NestingType NestingType { get; set; } = NestingType.Nested; +} + +/// +/// Configuration for map operations. 
+/// +public class MapConfig +{ + /// + /// Maximum concurrent items. Null = unlimited. + /// + public int? MaxConcurrency { get; set; } + + /// + /// When to consider the operation complete. Defaults to AllCompleted() — + /// every item runs regardless of per-item failures, which surface via + /// IBatchResult<T>.Failed rather than throwing. This permissive default + /// matches the Python and Java SDKs' map operation. It differs intentionally + /// from ParallelConfig.CompletionConfig, which defaults to AllSuccessful() + /// (fail-fast). For fail-fast map behavior, set this to + /// CompletionConfig.AllSuccessful() or call IBatchResult<T>.ThrowIfError(). + /// + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted(); + + /// + /// How item branches are represented in the checkpoint graph. + /// + public NestingType NestingType { get; set; } = NestingType.Nested; + + /// + /// Optional function to generate a custom name for each item's branch. + /// Improves observability in execution traces. Receives the item and its index. + /// If null, branches are named by index (e.g., "0", "1", "2"). + /// + public Func? ItemNamer { get; set; } +} + +/// +/// Defines completion criteria for parallel/map operations. +/// +public class CompletionConfig +{ + public int? MinSuccessful { get; set; } + public int? ToleratedFailureCount { get; set; } + /// + /// Maximum tolerated failure ratio, expressed as a value in the range + /// 0.0 to 1.0 (inclusive). For example, 0.25 means + /// "tolerate up to 25% failures; fail when the failure ratio strictly + /// exceeds 25%". null = no ratio-based threshold. Validated by the + /// setter; out-of-range values throw . + /// + public double? 
ToleratedFailurePercentage { get; set; } + + public static CompletionConfig AllSuccessful() => new() { ToleratedFailureCount = 0 }; + public static CompletionConfig FirstSuccessful() => new() { MinSuccessful = 1 }; + public static CompletionConfig AllCompleted() => new(); +} + +/// +/// Configuration for child context operations. +/// +public class ChildContextConfig +{ + // Note: there is no Serializer property here. The child context's + // return value is serialized via the ILambdaSerializer registered on + // ILambdaContext.Serializer, the same as StepAsync. + + /// + /// Operation sub-type label for observability (e.g., in test runner output). + /// + public string? SubType { get; set; } + + /// + /// Optional function to transform exceptions from the child context before + /// surfacing them to the parent. Useful for wrapping low-level errors into + /// domain-specific exceptions. + /// + public Func? ErrorMapping { get; set; } +} + +/// +/// Configuration for wait-for-condition (polling). +/// +public class WaitForConditionConfig +{ + /// + /// Initial state passed to the first check invocation. + /// + public required TState InitialState { get; set; } + + /// + /// Strategy controlling how long to wait between checks. + /// + public required IWaitStrategy WaitStrategy { get; set; } +} +``` + +### Result Types + +```csharp +/// +/// Result of a parallel or map operation. +/// +public interface IBatchResult +{ + /// + /// All items, in original index order. + /// + IReadOnlyList> All { get; } + + /// + /// Items whose Status is Succeeded. + /// + IReadOnlyList> Succeeded { get; } + + /// + /// Items whose Status is Failed. + /// + IReadOnlyList> Failed { get; } + + /// + /// Items still in flight when the batch resolved (CompletionConfig short-circuit). + /// + IReadOnlyList> Started { get; } + + /// + /// Get all successful results in original index order. Throws if any failed. 
+ /// + IReadOnlyList GetResults(); + + /// + /// Get all errors from failed items. + /// + IReadOnlyList GetErrors(); + + /// + /// Throw a single aggregated exception if any item failed. + /// + void ThrowIfError(); + + /// + /// True if any item is in the Failed state. + /// + bool HasFailure { get; } + + /// + /// Why the batch resolved. + /// + CompletionReason CompletionReason { get; } + + int SuccessCount { get; } + int FailureCount { get; } + int StartedCount { get; } + int TotalCount { get; } +} + +public interface IBatchItem +{ + int Index { get; } + BatchItemStatus Status { get; } + T? Result { get; } + DurableExecutionException? Error { get; } +} + +/// +/// Status of an individual item in a batch result. +/// Mirrors the wire-state observed at the time the batch resolved — items still +/// running when a CompletionConfig short-circuits remain in Started. +/// +public enum BatchItemStatus +{ + /// + /// The branch ran to completion and produced a result. + /// + Succeeded, + + /// + /// The branch ran to completion and threw. + /// + Failed, + + /// + /// The branch was still in flight when the batch's CompletionConfig + /// resolved (e.g., FirstSuccessful returned before this branch finished). + /// + Started +} +public enum CompletionReason { AllCompleted, MinSuccessfulReached, FailureToleranceExceeded } + +/// +/// Represents a pending callback. +/// +public interface ICallback +{ + /// + /// The callback ID to send to external systems. + /// + string CallbackId { get; } + + /// + /// Wait for and return the callback result. + /// Suspends execution until the result is available. + /// + /// External system reported failure. + /// Service marked the callback TIMED_OUT. + Task GetResultAsync(CancellationToken cancellationToken = default); +} + +/// +/// Metadata about the current execution. +/// +public interface IExecutionContext +{ + /// + /// The ARN of the current durable execution. 
+ /// + string DurableExecutionArn { get; } +} +``` + +### Exception Types + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/exceptions.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/errors/durable-error/durable-error.ts) + +```csharp +/// +/// Base exception for all durable execution errors. +/// +public class DurableExecutionException : Exception { } + +/// +/// Thrown when user code inside a step fails (after retries exhausted). +/// Contains the original error details from the checkpoint. +/// +public class StepException : DurableExecutionException +{ + public string? ErrorType { get; } + public string? ErrorData { get; } + public IReadOnlyList? StackTrace { get; } +} + +/// +/// Base exception for callback failures. Concrete subclasses distinguish +/// failure modes — pattern-match the subclass type rather than inspecting +/// a flag. +/// +public class CallbackException : DurableExecutionException +{ + public string? CallbackId { get; init; } + public string? ErrorType { get; init; } + public string? ErrorData { get; init; } + public IReadOnlyList? OriginalStackTrace { get; init; } +} + +/// External system reported a failure result for the callback. +public class CallbackFailedException : CallbackException { } + +/// Service marked the callback TIMED_OUT (overall or heartbeat). +public class CallbackTimeoutException : CallbackException { } + +/// +/// Submitter step (the inner step inside WaitForCallbackAsync) failed +/// after retries are exhausted. Wraps the underlying StepException. +/// Only thrown from WaitForCallbackAsync. +/// +public class CallbackSubmitterException : CallbackException { } + +/// +/// Base exception for chained-invoke failures. 
Catch InvokeException +/// to handle every non-success terminal state uniformly, or pattern-match the +/// concrete subclasses (InvokeFailedException, InvokeTimedOutException, +/// InvokeStoppedException) to react differently to specific outcomes. +/// Mirrors the Java SDK's invoke exception tree. +/// +public class InvokeException : DurableExecutionException +{ + public string? FunctionName { get; init; } + public string? ErrorType { get; init; } + public string? ErrorData { get; init; } + public IReadOnlyList? OriginalStackTrace { get; init; } +} + +/// The chained function ran and threw. +public class InvokeFailedException : InvokeException { } + +/// The chained invocation reached the service-side TIMED_OUT terminal state. +public class InvokeTimedOutException : InvokeException { } + +/// The chained execution was stopped by the service before reaching a normal terminal state. +public class InvokeStoppedException : InvokeException { } + +/// +/// Thrown when a child context operation fails. +/// +public class ChildContextException : DurableExecutionException +{ + public string? SubType { get; } +} + +/// +/// Thrown when a wait-for-condition operation exhausts all attempts +/// without the condition being met. Subclassable: future failure modes +/// (e.g. timeout) should add derived exceptions rather than discriminator +/// flags so callers can catch by static type. +/// +public class WaitForConditionException : DurableExecutionException +{ + public int AttemptsExhausted { get; } + + /// The most recent state observed by the check function before + /// the strategy gave up. Boxed because the exception type is not generic; + /// callers cast to the workflow's known state type. + public object? LastState { get; } +} + +/// +/// Thrown when the operation sequence during replay does not match +/// the previously checkpointed history. Indicates non-deterministic code. +/// +public class NonDeterministicException : DurableExecutionException +{ + public string? 
ExpectedOperationId { get; } + public string? ActualOperationId { get; } +} + +/// +/// Thrown when a step is interrupted mid-execution (e.g., Lambda timeout or +/// runtime termination). The step did not complete and its result was not +/// checkpointed. On the next invocation, the step will re-execute from scratch. +/// +public class StepInterruptedException : DurableExecutionException +{ + public string? StepName { get; } + public int AttemptNumber { get; } +} + +/// +/// Thrown when checkpoint serialization or deserialization fails. +/// +public class SerializationException : DurableExecutionException { } + +/// +/// Thrown when input validation fails. +/// +public class DurableValidationException : DurableExecutionException { } + +/// +/// Thrown when the checkpoint API call fails. +/// +public class CheckpointException : DurableExecutionException +{ + public bool IsRetriable { get; } +} +``` + +--- + +## Serialization + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/serdes.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/utils/serdes/serdes.ts) + +### Default behavior + +Step results are serialized to JSON (via `System.Text.Json`) before checkpointing. Your return types need to be JSON-serializable. + +```csharp +// ✅ GOOD: JSON-serializable types +public record OrderResult(string OrderId, decimal Total, bool IsCompleted); + +// ❌ BAD: Non-serializable types +public class BadResult +{ + public Stream DataStream { get; set; } // Not serializable + public HttpClient Client { get; set; } // Not serializable +} +``` + +### Custom Serialization + +There is no per-call serializer override on any durable-execution API. Every checkpoint — step results, callback results, invoke payloads/results, child-context results — is serialized via the `ILambdaSerializer` registered on `ILambdaContext.Serializer`. 
To customize, register a different `ILambdaSerializer` for the function: + +```csharp +// Class library mode — register via the assembly attribute. +[assembly: LambdaSerializer(typeof(MyCustomSerializer))] + +// Executable / custom runtime — pass to LambdaBootstrapBuilder.Create. +using var bootstrap = LambdaBootstrapBuilder.Create(handler, new MyCustomSerializer()).Build(); +``` + +The customization applies uniformly to the whole function — there is no way today to swap the format for a single step or a single result type. See [NativeAOT compatibility](#nativeaot-compatibility) for how the registration flows in JIT vs. AOT. + +### Class library vs. executable output + +All samples in this doc use the class library pattern (no `Main` method). This is the default for Lambda functions. To turn a durable function project into an executable (required for NativeAOT or custom runtimes): + +**With Annotations** — add the global attribute to auto-generate a `Main` method: +```csharp +[assembly: LambdaGlobalProperties(GenerateMain = true)] +``` + +**Without Annotations** — provide your own `Main` method: +```csharp +public static async Task Main(string[] args) +{ + using var bootstrap = new LambdaBootstrap( + new Function().FunctionHandler, + new DefaultLambdaJsonSerializer()); + await bootstrap.RunAsync(); +} +``` + +Both approaches produce a self-contained executable that the Lambda custom runtime can invoke. + +### NativeAOT compatibility + +The SDK is AOT-friendly but does not require AOT. The default JSON serialization uses reflection (standard `System.Text.Json` behavior), which works in JIT mode. **AOT safety is determined entirely by which `ILambdaSerializer` the user registers with the Lambda runtime** — there is no separate AOT-only API surface in the SDK, and no per-call `JsonSerializerContext` argument anywhere on `WrapAsync` or `IDurableContext`. 
The same overloads work in JIT and AOT; the difference is whether `ILambdaContext.Serializer` resolves to `DefaultLambdaJsonSerializer` (reflection) or `SourceGeneratorLambdaJsonSerializer` (AOT). + +The SDK itself avoids `Activator.CreateInstance`, `Type.GetType()`, and other reflection patterns, and uses `[DynamicallyAccessedMembers]` trimming annotations where needed. + +#### What the user registers in their `JsonSerializerContext` + +For AOT, the user's source-generated context must include: + +1. **Wire-envelope types** — `DurableExecutionInvocationInput` and `DurableExecutionInvocationOutput`. The handler signature is typed against these, so Lambda's runtime calls `serializer.Deserialize(...)` on each invoke and the source generator needs `JsonTypeInfo` for both. +2. **Workflow input / output POCOs** — every `TInput` / `TOutput` that appears in a `WrapAsync` call. +3. **Step result types** — every `T` that appears in `context.StepAsync(...)`. The SDK serializes step results via the same `ILambdaSerializer`, so each result type needs source-gen registration too. + +```csharp +// Class library mode — register via the assembly attribute. +[assembly: LambdaSerializer(typeof(SourceGeneratorLambdaJsonSerializer))] + +[JsonSerializable(typeof(DurableExecutionInvocationInput))] +[JsonSerializable(typeof(DurableExecutionInvocationOutput))] +[JsonSerializable(typeof(OrderEvent))] +[JsonSerializable(typeof(OrderResult))] +[JsonSerializable(typeof(Order))] // step result +public partial class MyJsonContext : JsonSerializerContext { } + +public class Function +{ + public Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync(MyWorkflow, invocationInput, context); + + private async Task MyWorkflow(OrderEvent input, IDurableContext context) + { + // Same StepAsync overload in JIT and AOT — the registered serializer decides. 
+ var order = await context.StepAsync(async (step) => await GetOrder(), name: "get_order"); + // ... + } +} +``` + +For executable / custom-runtime deployments (no class library attribute), the same context is registered by passing the serializer to `LambdaBootstrapBuilder.Create(handler, serializer)` — see the [Manual Handler](#manual-handler-without-annotations) section. + +### Large payload and checkpoint overflow + +The durable execution service imposes size limits: + +- **256 KB** per individual operation checkpoint +- **6 MB** maximum Lambda response payload + +The SDK handles overflow transparently: + +**Step results exceeding 256 KB:** When a step's serialized result exceeds the checkpoint size limit, the SDK splits the checkpoint into a START operation (before execution) and a separate result checkpoint (after execution). On replay, the SDK fetches the result via the paginated `GetDurableExecutionState` API rather than reading it inline from the operation record. + +**Batch results (map/parallel) exceeding limits:** For large map/parallel operations, the SDK generates a compact summary for the parent operation's checkpoint. The summary includes item count, success/failure counts, and completion reason — but not individual item results. During replay, the SDK sets `ReplayChildren = true` on the state request, which causes the service to return child operation records so full results can be reconstructed. + +**Lambda response exceeding 6 MB:** If the final orchestration result exceeds the response payload limit, the SDK checkpoints the result before returning the `DurableExecutionInvocationOutput`. The service reads the result from the checkpoint rather than from the response body. + +**Guidance for very large results:** For results that are inherently large (multi-MB payloads), do the offload yourself inside the step — write the payload to external storage (S3, DynamoDB) and return a reference (e.g. an S3 key) from the step. 
The reference is what the SDK serializes and checkpoints, so the checkpoint stays small and pagination is avoided. Subsequent steps fetch the payload from external storage on demand. + +--- + +## Integration with Existing Libraries + +### Amazon.Lambda.Core + +The SDK uses existing Lambda core interfaces: +- `ILambdaContext` -- available via `context.LambdaContext` +- `ILambdaSerializer` -- used for event deserialization and, via `ILambdaContext.Serializer`, for serializing every checkpoint payload (see [Custom Serialization](#custom-serialization)) + +### Amazon.Lambda.RuntimeSupport + +The durable execution handler integrates with the existing runtime support bootstrap: + +```csharp +// The [DurableExecution] attribute signals that the handler +// receives DurableExecutionInvocationInput and returns DurableExecutionInvocationOutput +// The SDK handles the translation to/from the user's handler signature +``` + +### Amazon.Lambda.Annotations (optional) + +`Amazon.Lambda.Annotations` is an **optional** dependency. Users can write durable functions without it (see [Manual Handler](#manual-handler-without-annotations) above), but adding Annotations to the project reduces boilerplate significantly. + +When both packages are referenced, the Annotations source generator detects `[DurableExecution]` by fully-qualified name and, at compile time: + +1. Generates a handler wrapper that translates `DurableExecutionInvocationInput` to/from your types +2. Manages context lifecycle (creation, checkpoint batching, cleanup) +3. Adds `DurableConfig` to the CloudFormation template +4. Adds the `AWSLambdaBasicDurableExecutionRolePolicy` managed policy + +```csharp +public class Functions +{ + [LambdaFunction] + [DurableExecution(ExecutionTimeout = 3600, RetentionPeriodInDays = 7)] + public async Task ProcessOrder( + [FromBody] OrderRequest request, + IDurableContext context) + { + var validated = await context.StepAsync( + async (step) => await Validate(request), + name: "validate"); + // ... 
+ } +} +``` + +#### Custom Lambda Client + +For VPC endpoints, custom retry policies, or testing with mocked clients, inject a custom `IAmazonLambda` client via the `[DurableExecution]` attribute: + +```csharp +public class Functions +{ + private readonly IAmazonLambda _lambdaClient; + + public Functions(IAmazonLambda lambdaClient) + { + _lambdaClient = lambdaClient; + } + + [LambdaFunction] + [DurableExecution(LambdaClientFactory = nameof(_lambdaClient))] + public async Task ProcessOrder( + [FromBody] OrderRequest request, + IDurableContext context) + { + // ... + } +} +``` + +When no `LambdaClientFactory` is specified, the generated code creates a default `AmazonLambdaClient`. For the manual handler path (`DurableFunction.WrapAsync`), pass the client directly via the `IAmazonLambda lambdaClient` overload. + +> **Dependency boundaries:** `Amazon.Lambda.Annotations` has **no dependency** on the AWS SDK or on `Amazon.Lambda.DurableExecution`. The Annotations source generator references durable execution types by fully-qualified name strings only — it never takes a compile-time dependency on the durable package. The `[DurableExecution]` attribute is defined in `Amazon.Lambda.DurableExecution`, and the generated code resolves against the user's project references. There is only one source generator (Annotations) — no coordination between multiple generators is needed. + +### AWSSDK.Lambda + +The `Amazon.Lambda.DurableExecution` package depends on the AWS SDK for .NET Lambda client to make checkpoint API calls. This dependency is confined to the durable execution package — `Amazon.Lambda.Annotations` does not depend on the AWS SDK. 
+ + +- `CheckpointDurableExecutionAsync` +- `GetDurableExecutionStateAsync` + +--- + +## Testing (customer-facing package) + +> **Implementations:** [JavaScript (local runner)](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-testing/src/test-runner/local/local-durable-test-runner.ts) | [JavaScript (cloud runner)](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-testing/src/test-runner/cloud/cloud-durable-test-runner.ts) + +We ship a separate NuGet package (`Amazon.Lambda.DurableExecution.Testing`) that lets developers test their durable functions locally without deploying to AWS. + +**Why this needs to exist:** A durable function requires multiple Lambda invocations to complete (invoke → PENDING → wait → re-invoke → SUCCEEDED). You can't test that with a normal unit test because there's no Lambda service orchestrating the re-invocations. The test runner simulates this loop in-process: it calls your handler, gets PENDING, marks waits as elapsed, calls your handler again with the prior checkpoint state, and repeats until the workflow completes. + +```csharp +var runner = new DurableTestRunner( + handler: new Function().Handler, + options: new TestRunnerOptions + { + SkipTime = true, // Waits complete instantly (no real delays) + MaxInvocations = 10 // Safety limit to prevent infinite loops + }); + +var result = await runner.RunAsync( + input: new OrderEvent { OrderId = "order-123" }, + timeout: TimeSpan.FromSeconds(30)); + +Assert.Equal(InvocationStatus.Succeeded, result.Status); +Assert.Equal("approved", result.Result.Status); + +// Inspect individual steps +var validateStep = result.GetStep("validate_order"); +Assert.True(validateStep.GetResult().IsValid); +``` + +The Python and JS SDKs both ship equivalent test runner packages. 
+ +### Cloud Test Runner + +For integration testing against deployed functions, the testing package also ships a `CloudDurableTestRunner` with the same API as the local runner. This lets developers run the exact same assertions against a real Lambda function: + +```csharp +var runner = new CloudDurableTestRunner( + functionArn: "arn:aws:lambda:us-east-1:123456789012:function:process-order:$LATEST"); + +var result = await runner.RunAsync( + input: new OrderEvent { OrderId = "order-123" }, + timeout: TimeSpan.FromSeconds(60)); + +Assert.Equal(InvocationStatus.Succeeded, result.Status); +var validateStep = result.GetStep("validate_order"); +Assert.True(validateStep.GetResult().IsValid); +``` + +The cloud runner invokes the deployed function and polls `GetDurableExecutionState` until the execution reaches a terminal state, then reconstructs the same `TestResult` structure as the local runner. + +### Function Registration for Invoke Testing + +To test workflows that use `InvokeAsync` without deploying, register sibling functions with the local test runner: + +```csharp +var paymentHandler = new PaymentFunction().Handler; + +var runner = new DurableTestRunner( + handler: new OrderFunction().Handler, + options: new TestRunnerOptions { SkipTime = true }); + +runner.RegisterFunction("process-payment", paymentHandler); +runner.RegisterFunction( + "arn:aws:lambda:us-east-1:123:function:process-payment:$LATEST", + paymentHandler); + +var result = await runner.RunAsync(input: new OrderEvent { OrderId = "123" }); +``` + +When the workflow calls `context.InvokeAsync("process-payment", payload)`, the test runner routes to the registered handler instead of making an AWS API call. + +--- + +## Local development (Test Tool v2 and Aspire) + +The Lambda Test Tool v2 and the Aspire Lambda integration currently emulate single-invocation Lambda functions. Durable functions require a multi-invocation loop that neither tool supports today. 
To add support, the local emulator needs three things: + +### Checkpoint API endpoints + +The SDK calls these during execution. The emulator would serve them locally with in-memory storage: + +- `POST /checkpoint-durable-execution` -- store step results, wait records +- `GET /durable-execution-state` -- return accumulated state for replay + +### An orchestration loop + +When the function returns `PENDING`, the emulator needs to: +- Parse the checkpoint to determine what's pending (timer, callback, retry) +- Wait for that condition (or skip it in fast mode) +- Re-invoke the function with the accumulated `DurableExecutionInvocationInput` +- Repeat until `SUCCEEDED` or `FAILED` + +### Callback delivery + +An endpoint that external tools (or the developer via the UI) can call to deliver callback results: + +- `POST /send-durable-execution-callback-success` +- This triggers a re-invocation of the waiting execution + +### How this relates to the testing SDK + +The `DurableTestRunner` in the testing package implements the same orchestration loop programmatically. The test tool / Aspire enhancement would reuse this engine and wrap it in a web UI or Aspire dashboard, giving developers a visual way to see execution state, deliver callbacks manually, skip timers, and inspect checkpoint history. + +### Priority + +This is post-v1 work. For the initial release, developers test durable functions using the programmatic `DurableTestRunner` or by deploying to AWS. Test tool and Aspire support are a fast-follow once the core SDK is stable. + +--- + +## Requirements & Constraints + +- **Target framework:** `net8.0` only. .NET 6 is EOL and not supported. Durable functions are a new feature — adopters will be on the latest managed runtime. Targeting .NET 8 gives access to `required` properties, improved `System.Text.Json` source generation, and better NativeAOT support. 
+- **Lambda runtime:** Requires the managed .NET 8 runtime or a custom runtime (`provided.al2023`) for NativeAOT deployments. +- **Durable execution service:** The function must be configured with `DurableConfig` (handled automatically by the `[DurableExecution]` source generator). +- **Qualified function identifiers:** `InvokeAsync` requires a version number, alias, or `$LATEST` — unqualified ARNs are not supported for durable invocations. +- **Serializable results:** All step return types must be serializable by the `ILambdaSerializer` registered on `ILambdaContext.Serializer` (default: `System.Text.Json`). + +--- + +## Package Structure + +### Amazon.Lambda.DurableExecution (Runtime) + +The core SDK that runs in Lambda. Minimal dependencies. + +**Dependencies:** +- `Amazon.Lambda.Core` (existing) +- `AWSSDK.Lambda` (for checkpoint/state APIs) +- `Microsoft.Extensions.Logging.Abstractions` (for ILogger) + +### Amazon.Lambda.DurableExecution.Testing (Dev-only) + +Test runner and helpers for local/cloud testing. + +**Dependencies:** +- `Amazon.Lambda.DurableExecution` +- `Amazon.Lambda.TestUtilities` (existing) + +### Blueprints (`dotnet new` Templates) + +New `dotnet new` templates ship as part of the existing `Amazon.Lambda.Templates` NuGet package (same as all other Lambda blueprints in this repo under `Blueprints/BlueprintDefinitions/`). + +**Templates to ship:** + +| Template short name | Description | +|---------------------|-------------| +| `lambda.DurableFunction` | Minimal durable function with a single step and wait. Includes test project with `DurableTestRunner`. | +| `lambda.DurableFunction.Agentic` | GenAI agentic loop pattern (invoke model → check tool call → execute tool → repeat). | +| `lambda.DurableFunction.HumanInTheLoop` | Callback-based human approval workflow. 
| + +Each template includes: +- `.csproj` with correct NuGet references (`Amazon.Lambda.DurableExecution`, `Amazon.Lambda.Annotations`) +- Handler class with `[LambdaFunction]` + `[DurableExecution]` attributes +- `serverless.template` (auto-generated by source generator on build) +- Test project with `DurableTestRunner` and a passing test +- `aws-lambda-tools-defaults.json` for deployment via `dotnet lambda deploy-function` + +Running `dotnet new lambda.DurableFunction` should produce a buildable, testable, deployable project in under 30 seconds. + +--- + +## Implementation plan + +| Workstream | Scope | Estimate | +|------------|-------|----------| +| **Durable execution runtime** | Core SDK: replay engine, all context operations (step, wait, callback, invoke, parallel, map), checkpoint batching, retry, logging | ~5-6 weeks | +| **Annotations / source generator** | `[DurableExecution]` attribute, handler wrapper codegen, CloudFormation DurableConfig + IAM policy generation | ~2 weeks | +| **Testing SDK** | Local test runner (in-memory, time-skipping), cloud test runner, step inspection API | ~1.5 weeks | +| **Blueprints, docs, examples** | `dotnet new` project templates, developer guide, API reference, sample projects | ~2 weeks | +| **Roslyn analyzers** (P1 follow-up) | Static analysis detecting non-determinism, nesting violations, closure mutations | ~2 weeks | + +**Total: ~10-11 weeks (1 engineer familiar with the Python/JS SDKs)** + Roslyn analyzers as follow-up + +### Roslyn Analyzers (P1 Follow-up) + +> **Reference implementation:** JavaScript ESLint plugin — [no-non-deterministic-outside-step](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-eslint-plugin/src/rules/no-non-deterministic-outside-step/no-non-deterministic-outside-step.ts) | 
[no-nested-durable-operations](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-eslint-plugin/src/rules/no-nested-durable-operations/no-nested-durable-operations.ts) | [no-closure-in-durable-operations](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-eslint-plugin/src/rules/no-closure-in-durable-operations/no-closure-in-durable-operations.ts) + +Ship as a separate NuGet package: `Amazon.Lambda.DurableExecution.Analyzers` + +The JavaScript SDK ships an ESLint plugin (`@aws/durable-execution-sdk-js-eslint-plugin`) with three rules that catch the most common durable execution mistakes at author time. The .NET equivalent uses Roslyn diagnostic analyzers: + +| Diagnostic ID | Severity | Rule | Rationale | +|---------------|----------|------|-----------| +| DE001 | Warning | `DateTime.Now`, `DateTime.UtcNow`, `Guid.NewGuid()`, `Random.Next()`, `Random.Shared`, `Environment.TickCount` used outside a `StepAsync` body | Non-deterministic values produce different results on replay, breaking checkpoint consistency | +| DE002 | Error | Calling `context.StepAsync`, `WaitAsync`, `ParallelAsync`, `MapAsync`, `InvokeAsync`, `RunInChildContextAsync`, `CreateCallbackAsync`, or `WaitForCallbackAsync` inside a `StepAsync` lambda | Steps are leaf operations — nesting durable operations inside a step produces unpredictable behavior | +| DE003 | Warning | Mutable variable captured by a `StepAsync` lambda and written to inside the lambda body | On replay the step returns cached result without executing, so the write never happens — the outer variable has stale state | +| DE004 | Info | `Task.WhenAll` or `Task.WhenAny` called with tasks returned by durable context methods | Suggest using `ParallelAsync` for completion policies, nesting control, and observability | + +These analyzers run at compile time in the IDE (IntelliSense squiggles) and during `dotnet build`, preventing the most 
confusing class of runtime failures. + +--- + +## Cross-SDK API comparison + +All four SDKs expose the same core operations. The differences are naming conventions, parameter ordering, and concurrency model. + +| Operation | .NET | Python | JavaScript | Java | +|-----------|------|--------|------------|------| +| Step | `context.StepAsync(func, name?, config?)` | `context.step(func, name?, config?)` | `context.step(name?, fn, config?)` → `DurablePromise` | `context.step(name, type, func, config?)` (blocking) / `context.stepAsync(...)` → `DurableFuture` | +| Wait | `context.WaitAsync(duration, name?)` | `context.wait(duration, name?)` | `context.wait(name?, duration)` → `DurablePromise` | +| Create callback | `context.CreateCallbackAsync(name?, config?)` | `context.create_callback(name?, config?)` | `context.createCallback(name?, config?)` | +| Wait for callback | `context.WaitForCallbackAsync(submitter, name?, config?)` | `context.wait_for_callback(submitter, name?, config?)` | `context.waitForCallback(name?, submitter, config?)` | +| Invoke | `context.InvokeAsync(funcName, payload, name?, config?)` | `context.invoke(func_name, payload, name?, config?)` | `context.invoke(name?, funcId, input, config?)` → `DurablePromise` | +| Parallel | `context.ParallelAsync(functions, name?, config?)` | `context.parallel(functions, name?, config?)` | `context.parallel(name?, branches, config?)` | +| Map | `context.MapAsync(items, func, name?, config?)` | `context.map(inputs, func, name?, config?)` | `context.map(name?, items, mapFunc, config?)` | +| Child context | `context.RunInChildContextAsync(func, name?, config?)` | `context.run_in_child_context(func, name?, config?)` | `context.runInChildContext(name?, fn, config?)` | +| Wait for condition | `context.WaitForConditionAsync(check, config, name?)` | `context.wait_for_condition(check, config, name?)` | `context.waitForCondition(name?, checkFunc, config?)` | +| Logger | `context.Logger` (ILogger) | `context.logger` (Logger) | 
`context.logger` (DurableContextLogger) | +| Lambda context | `context.LambdaContext` | `context.lambda_context` | `context.lambdaContext` | +| Execution context | `context.ExecutionContext` | `context.execution_context` | *(via logger metadata)* | +| Promise combinators | `CompletionConfig` on `ParallelAsync` | `CompletionConfig` on `parallel`/`map` | `context.promise.all/allSettled/any/race` | +| Configure logger | `context.ConfigureLogger(config)` | `context.set_logger(logger)` | `context.configureLogger(config)` | +| Cancellation | `CancellationToken` on all methods | *(N/A)* | *(N/A)* | +| Jitter strategy | `JitterStrategy` enum on `Exponential()` | `jitter_strategy` on `RetryStrategyConfig` | `jitter` on `createRetryStrategy()` | +| Retry presets | `RetryStrategy.None/Default/Transient` | `RetryPresets.none()/default()/transient()` | `retryPresets.default/linear/noRetry` | +| Nesting type | `NestingType` on `ParallelConfig`/`MapConfig` | `NestingType` on parallel/map config | `NestingType` on parallel/map config | +| Item namer | `ItemNamer` on `MapConfig` | Item naming function on `MapConfig` | `itemNamer` on `MapConfig` | +| Error mapping | `ErrorMapping` on `ChildContextConfig` | *(typed exception wrapping)* | `errorMapping` on child context config | +| Message-based retry filter | `retryableMessagePatterns` (regex) | `retryable_errors` (regex) | `retryableErrors` (RegExp[]) | +| Step context / scoped logger | `IStepContext` with `Logger`, `AttemptNumber` | `StepContext` with `logger` | `ctx` with `logger` in step callback | +| Named parallel branches | `DurableBranch(name, func)` | Function `__name__` | `{ name, func }` objects | +| Inline retry lambda | `Func` | `Callable[[Exception, int], RetryDecision]` | `(error, attempt) => RetryDecision` | +| Static analysis | Roslyn analyzers (P1 follow-up) | *(N/A)* | ESLint plugin (3 rules) | +| Cloud test runner | `CloudDurableTestRunner` | `pytest --runner-mode=cloud` | `CloudDurableTestRunner` | + +**Key 
differences:** + +- **Concurrency model:** JS returns `DurablePromise` (lazy, deferred until awaited). Python is synchronous (blocks the thread). Java exposes both `step` (blocking) and `stepAsync` (returns `DurableFuture`). .NET returns `Task` (standard async/await). Note: `Task.WhenAll` works with durable operations but `ParallelAsync`/`MapAsync` are preferred for completion policies and observability. +- **Why .NET ships only the async form:** Java's two-API split exists because Java has no language-level `await` — `step` is the simple blocking ergonomic, `stepAsync` is the composable form. In .NET, `Task` is *already* both: `await context.StepAsync(...)` reads as sequential code, and `Task.WhenAll(...)` composes concurrently. A `Step` (blocking, returns `T`) overload would do nothing except call `.GetAwaiter().GetResult()` on the async version, which is also a Lambda-thread anti-pattern (deadlock-prone, blocks a thread the runtime needs). So .NET intentionally has one shape — `*Async` — matching the rest of `IAmazonLambda` and the broader .NET async convention. Python is single-shape for the same reason in reverse: no async runtime in scope, so blocking is the only ergonomic shape. +- **Step function signature:** Python and JS only expose `Func` — the user always receives a step context. Java has both `Function` and `Supplier` overloads, but the `Supplier` ones are deprecated (*"use the variants accepting StepContext instead"*). .NET follows Python/JS: `IStepContext` is always passed. +- **Name parameter position:** JS puts `name` first; Python, Java, and .NET put it after the function/duration. +- **Parallel semantics in JS:** JS uses `context.promise.all/any/race/allSettled` to combine DurablePromises. .NET, Python, and Java use `CompletionConfig` on the `Parallel`/`Map` operations instead. +- **.NET-only:** `CancellationToken` on every method (standard .NET pattern). +- **Jitter default:** All four SDKs default to full jitter on retry strategies. 
+ +--- + +## Common Patterns + +### GenAI Agentic Loop + +```csharp +[DurableExecution] +public async Task<string> AgentHandler(AgentRequest input, IDurableContext context) +{ + var messages = new List<Message> + { + new Message { Role = "user", Content = input.Prompt } + }; + + while (true) + { + var response = await context.StepAsync( + async (step) => await InvokeModel(messages), + name: "invoke_model"); + + if (response.ToolCall == null) + return response.Content; + + var toolResult = await context.StepAsync( + async (step) => await ExecuteTool(response.ToolCall), + name: $"tool_{response.ToolCall.Name}"); + + messages.Add(new Message { Role = "assistant", Content = toolResult }); + } +} +``` + +### Human-in-the-Loop + +```csharp +[DurableExecution] +public async Task<ReviewResult> ReviewHandler(ReviewRequest input, IDurableContext context) +{ + var analysis = await context.StepAsync( + async (step) => await AnalyzeDocument(input.DocumentUrl), + name: "analyze_document"); + + context.Logger.LogInformation("Analysis complete, requesting human review"); + + var review = await context.WaitForCallbackAsync( + async (callbackId, ctx) => + { + await NotifyReviewer(input.ReviewerEmail, callbackId, analysis); + }, + name: "human_review", + config: new WaitForCallbackConfig + { + Timeout = TimeSpan.FromDays(7), + HeartbeatTimeout = TimeSpan.FromHours(24) + }); + + if (review.Approved) + { + await context.StepAsync( + async (step) => await PublishDocument(input.DocumentUrl), + name: "publish"); + } + + return new ReviewResult { Status = review.Approved ? 
"published" : "rejected" }; +} +``` + +### Scheduled Pipeline with Retries + +```csharp +[DurableExecution] +public async Task<PipelineResult> DataPipeline(PipelineInput input, IDurableContext context) +{ + // Extract + var rawData = await context.StepAsync( + async (step) => await ExtractFromSource(input.SourceId), + name: "extract", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 5, initialDelay: TimeSpan.FromSeconds(2)) + }); + + // Transform (fan-out) + var transformed = await context.MapAsync( + items: rawData.Chunks, + func: async (ctx, chunk, index, _) => + { + return await ctx.StepAsync( + async (step) => await TransformChunk(chunk), + name: $"transform_{index}"); + }, + name: "transform_all", + config: new MapConfig { MaxConcurrency = 10 }); + + transformed.ThrowIfError(); + + // Load + var loadResult = await context.StepAsync( + async (step) => await LoadToDestination(transformed.GetResults()), + name: "load", + config: new StepConfig + { + Semantics = StepSemantics.AtMostOncePerRetry + }); + + // Wait before next run + await context.WaitAsync(TimeSpan.FromHours(1), name: "schedule_delay"); + + return new PipelineResult { RecordsProcessed = loadResult.Count }; +} +``` + +--- + +## References + +- [AWS Blog: Build multi-step applications and AI workflows with AWS Lambda durable functions](https://aws.amazon.com/blogs/aws/build-multi-step-applications-and-ai-workflows-with-aws-lambda-durable-functions/) +- [AWS Documentation: Lambda Durable Functions](https://docs.aws.amazon.com/lambda/latest/dg/durable-functions.html) +- [Python SDK Repository](https://github.com/aws/aws-durable-execution-sdk-python) +- [JavaScript/TypeScript SDK Repository](https://github.com/aws/aws-durable-execution-sdk-js) +- [GitHub Issue #2216: .NET Durable Functions Support](https://github.com/aws/aws-lambda-dotnet/issues/2216) +- [Existing .NET Annotations Design Doc](lambda-annotations-design.md) diff --git a/Libraries/Libraries.sln b/Libraries/Libraries.sln 
index e42c40045..65b4cd9e0 100644 --- a/Libraries/Libraries.sln +++ b/Libraries/Libraries.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 18 -VisualStudioVersion = 18.5.11709.299 stable +VisualStudioVersion = 18.5.11709.299 MinimumVisualStudioVersion = 10.0.40219.1 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{AAB54E74-20B1-42ED-BC3D-CE9F7BC7FD12}" EndProject @@ -155,6 +155,14 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ResponseStreamingFunctionHa EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AspNetCoreStreamingApiGatewayTest", "test\Amazon.Lambda.RuntimeSupport.Tests\AspNetCoreStreamingApiGatewayTest\AspNetCoreStreamingApiGatewayTest.csproj", "{0768FA72-CF49-2B59-BC4C-E4CE579E5D93}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Amazon.Lambda.DurableExecution", "src\Amazon.Lambda.DurableExecution\Amazon.Lambda.DurableExecution.csproj", "{9097B5A4-E100-47FD-A676-0B666A36FAFF}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Amazon.Lambda.DurableExecution.Tests", "test\Amazon.Lambda.DurableExecution.Tests\Amazon.Lambda.DurableExecution.Tests.csproj", "{57150BA6-3826-431F-8F58-B1D11FAFC5D4}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Amazon.Lambda.DurableExecution.IntegrationTests", "test\Amazon.Lambda.DurableExecution.IntegrationTests\Amazon.Lambda.DurableExecution.IntegrationTests.csproj", "{CA132CAB-FF4F-4312-B3A3-66DE9D360F27}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Amazon.Lambda.DurableExecution.AotPublishTest", "test\Amazon.Lambda.DurableExecution.AotPublishTest\Amazon.Lambda.DurableExecution.AotPublishTest.csproj", "{16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -969,6 +977,54 @@ Global {0768FA72-CF49-2B59-BC4C-E4CE579E5D93}.Release|x64.Build.0 = Release|Any CPU 
{0768FA72-CF49-2B59-BC4C-E4CE579E5D93}.Release|x86.ActiveCfg = Release|Any CPU {0768FA72-CF49-2B59-BC4C-E4CE579E5D93}.Release|x86.Build.0 = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|x64.ActiveCfg = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|x64.Build.0 = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|x86.ActiveCfg = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|x86.Build.0 = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|Any CPU.Build.0 = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|x64.ActiveCfg = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|x64.Build.0 = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|x86.ActiveCfg = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|x86.Build.0 = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|Any CPU.Build.0 = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|x64.ActiveCfg = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|x64.Build.0 = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|x86.ActiveCfg = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|x86.Build.0 = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|Any CPU.ActiveCfg = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|Any CPU.Build.0 = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|x64.ActiveCfg = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|x64.Build.0 = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|x86.ActiveCfg = Release|Any CPU + 
{57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|x86.Build.0 = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|x64.ActiveCfg = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|x64.Build.0 = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|x86.ActiveCfg = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|x86.Build.0 = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|Any CPU.Build.0 = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|x64.ActiveCfg = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|x64.Build.0 = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|x86.ActiveCfg = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|x86.Build.0 = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|Any CPU.Build.0 = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|x64.ActiveCfg = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|x64.Build.0 = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|x86.ActiveCfg = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|x86.Build.0 = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|Any CPU.ActiveCfg = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|Any CPU.Build.0 = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|x64.ActiveCfg = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|x64.Build.0 = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|x86.ActiveCfg = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection 
GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -1045,6 +1101,10 @@ Global {80594C21-C6EB-469E-83CC-68F9F661CA5E} = {1DE4EE60-45BA-4EF7-BE00-B9EB861E4C69} {E404A7AC-812B-BC03-CA76-02C0BC2BA7F9} = {B5BD0336-7D08-492C-8489-42C987E29B39} {0768FA72-CF49-2B59-BC4C-E4CE579E5D93} = {B5BD0336-7D08-492C-8489-42C987E29B39} + {9097B5A4-E100-47FD-A676-0B666A36FAFF} = {AAB54E74-20B1-42ED-BC3D-CE9F7BC7FD12} + {57150BA6-3826-431F-8F58-B1D11FAFC5D4} = {1DE4EE60-45BA-4EF7-BE00-B9EB861E4C69} + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27} = {1DE4EE60-45BA-4EF7-BE00-B9EB861E4C69} + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2} = {1DE4EE60-45BA-4EF7-BE00-B9EB861E4C69} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {503678A4-B8D1-4486-8915-405A3E9CF0EB} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Amazon.Lambda.DurableExecution.csproj b/Libraries/src/Amazon.Lambda.DurableExecution/Amazon.Lambda.DurableExecution.csproj new file mode 100644 index 000000000..ae173e365 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Amazon.Lambda.DurableExecution.csproj @@ -0,0 +1,40 @@ + + + + + + $(DefaultPackageTargets) + Amazon Lambda .NET SDK for Durable Execution - write multi-step workflows that persist state automatically. 
+ Amazon.Lambda.DurableExecution + 0.0.1 + Amazon.Lambda.DurableExecution + Amazon.Lambda.DurableExecution + AWS;Amazon;Lambda;Durable;Workflow + true + true + enable + enable + true + IL2026,IL2067,IL2075,IL3050 + + $(NoWarn);AWSLAMBDA001 + + + + + <_Parameter1>Amazon.Lambda.DurableExecution.Tests, PublicKey="0024000004800000940000000602000000240000525341310004000001000100db5f59f098d27276c7833875a6263a3cc74ab17ba9a9df0b52aedbe7252745db7274d5271fd79c1f08f668ecfa8eaab5626fa76adc811d3c8fc55859b0d09d3bc0a84eecd0ba891f2b8a2fc55141cdcc37c2053d53491e650a479967c3622762977900eddbf1252ed08a2413f00a28f3a0752a81203f03ccb7f684db373518b4" + + + + + + + + + + + + + diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs b/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs new file mode 100644 index 000000000..fdba62d64 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs @@ -0,0 +1,31 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Status of an individual item in a . +/// +/// +/// Mirrors the wire-state of the per-branch checkpoint at the moment the batch +/// resolved. Items that finished produce or +/// ; items that were not dispatched because a +/// short-circuit fired are reported as +/// . +/// +public enum BatchItemStatus +{ + /// + /// The branch ran to completion and produced a result. + /// + Succeeded, + + /// + /// The branch ran to completion and threw. + /// + Failed, + + /// + /// The branch was not dispatched before the batch's + /// resolved (e.g., short-circuited + /// before this branch was started), or no per-branch checkpoint exists on replay. 
+ /// + Started +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md b/Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md new file mode 100644 index 000000000..b825300bd --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md @@ -0,0 +1,151 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What this is + +`Amazon.Lambda.DurableExecution` is the .NET SDK (preview, 0.x) for resilient, long-running AWS Lambda +workflows that checkpoint progress after each step and resume after failures or waits. A workflow can run +for up to ~1 year (the WAIT cap is 31,622,400 seconds) and is only billed for active compute. The SDK is +client-side glue: the *durable execution service* (part of Lambda) owns the checkpoint store, fires timers, +and re-invokes the function; this library re-derives in-memory workflow position from the checkpoint history +the service sends on each invocation. See sibling SDKs (Python/JS/Java) listed in `README.md` for the shared +model — this SDK deliberately mirrors their semantics. + +## Build & test + +Targets `net8.0;net10.0` (`DefaultPackageTargets` in `buildtools/common.props`). `TreatWarningsAsErrors` is on +everywhere, and the main library is `IsTrimmable` with the trim analyzer enabled — keep new code AOT/trim-clean. + +```bash +# Build the library (run from this directory) +dotnet build + +# Unit tests (fast, no AWS). 
Project: Libraries/test/Amazon.Lambda.DurableExecution.Tests +dotnet test ../../test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj + +# A single test +dotnet test ../../test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj \ + --filter "FullyQualifiedName~StepOperationTests" + +# Coverage report (requires reportgenerator tool) +../../test/Amazon.Lambda.DurableExecution.Tests/coverage.sh +``` + +Unit tests reach `internal` types via `InternalsVisibleTo` (declared in the `.csproj`). They use +`Amazon.Lambda.TestUtilities` (`TestLambdaContext`) and the real `SourceGeneratorLambdaJsonSerializer` — +set `TestLambdaContext.Serializer` so `LambdaSerializerHelper.GetRequired` finds one. + +### Integration tests (expensive, real AWS) + +`Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests` deploys real Lambdas. Each test builds a +`TestFunctions/<Name>/` project into a container image via **`dotnet publish` + `docker build`**, pushes to ECR, +creates an IAM role + Lambda (`DurableFunctionDeployment`), invokes it, and tears everything down on dispose. +Requires Docker, AWS creds (us-east-1), and is slow. Every behavior in `docs/` should have a paired +integration test under that project. Prefix AWS commands with `unset AWS_PROFILE` to use `[default]` creds. + +**Run integration tests against `net10.0`.** The project multi-targets `net8.0;net10.0`; `dotnet test` +without a framework spins up one testhost per TFM and runs them concurrently, which races two processes on +the same `TestFunctions/<Name>/` build dir. Pin the framework: + +```bash +dotnet test ../../test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj \ + -f net10.0 --filter "FullyQualifiedName~MultipleStepsTest" +``` + +## Architecture: the replay model + +This is the part you must understand before changing anything. 
Read these together: +`DurableFunction.cs`, `DurableExecutionHandler.cs`, `DurableContext.cs`, `Internal/DurableOperation.cs`, +`Internal/ExecutionState.cs`, `Internal/OperationIdGenerator.cs`, `Internal/TerminationManager.cs`. + +**Entry point.** The user's Lambda handler delegates to `DurableFunction.WrapAsync`, which: +hydrates `ExecutionState` from `invocationInput.InitialExecutionState` (paging the service via `NextMarker`), +extracts the user payload from the `EXECUTION`-type op, builds a `CheckpointBatcher` + `DurableContext`, runs +the workflow through `DurableExecutionHandler.RunAsync`, drains checkpoints, and maps the result to a +`DurableExecutionInvocationOutput` with status **Succeeded / Failed / Pending**. + +**Every invocation re-runs the same workflow code from the top.** There is no persisted program counter. +On re-invocation the user function executes from the top again; each durable call (`StepAsync`, `WaitAsync`, +etc.) looks up its own checkpoint and either replays the cached result or runs fresh. This is why workflow +code **must be deterministic** — same operations, same order, same names across deployments. + +**Deterministic operation IDs** (`OperationIdGenerator`). Each durable call gets an ID = SHA-256 of +`"<parentId>-<counter>"`, where the counter is per-context and pre-incremented. The same workflow position +yields the same opaque ID across replays, so a checkpoint correlates to a call by *position*, not by name — +renaming a step does **not** change its ID (the human name rides separately on `OperationUpdate.Name`). +Reordering or adding/removing calls *does* shift IDs and breaks replay. `ValidateReplayConsistency` enforces determinism and throws +`NonDeterministicExecutionException` on type/name drift. + +**Suspension is implemented by never completing a Task** (`TerminationManager` + `DurableExecutionHandler`). 
+When an op must suspend (wait timer, scheduled retry, pending callback/invoke) it calls +`Termination.SuspendAndAwait()`, which trips a one-shot signal and returns a Task that *never resolves*. +`RunAsync` runs the user code via `Task.Run` and races it against `TerminationTask` with `Task.WhenAny`: +- user task wins → **Succeeded** (or **Failed** if it threw) +- termination wins → **Pending**; the abandoned user task is GC'd, checkpoints flush, the service fires the + timer and re-invokes. On replay the suspended op sees its now-terminal checkpoint and returns normally. + +**Operation classes** (`Internal/*Operation.cs`) all extend `DurableOperation`. The base's +`ExecuteAsync` does: `ValidateReplayConsistency` → `TrackReplay` → look up checkpoint → dispatch to +`StartAsync` (no prior checkpoint) or `ReplayAsync` (checkpoint exists). `StepOperation` is the canonical +example — read its class doc comment for the full status decision table (Succeeded→cached, Failed→rethrow, +Pending→re-suspend if retry timer hasn't fired, Started→crash-recovery under `AtMostOncePerRetry`, +Ready→run next attempt). `DurableContext` is a thin dispatcher: it allocates the op ID, pulls the serializer +off `ILambdaContext.Serializer`, constructs the right `*Operation`, and calls `ExecuteAsync`. + +**Checkpointing** (`CheckpointBatcher`). Outbound `OperationUpdate`s (START/SUCCEED/FAIL/RETRY) are enqueued +to a background channel worker that batches and flushes them via `LambdaDurableServiceClient` (which wraps +the `AWSSDK.Lambda` `Checkpoint`/`GetExecutionState` calls). `EnqueueAsync` awaits its batch's flush +(sync semantics); fire-and-forget callers (e.g. the START checkpoint under the default +`AtLeastOncePerRetry`) don't await but must observe the Task's exception. Flush errors become a terminal +error rethrown by the next `EnqueueAsync`/`DrainAsync`. 
`DurableFunction.IsTerminalCheckpointError` +classifies SDK errors on the final drain: 4xx (except 429 and stale-token) → **Failed** envelope; 429/5xx/ +network → let it escape so Lambda retries the whole invocation. + +**Replay-mode tracking** (`ExecutionState`). `IsReplaying` starts true iff any completed non-`EXECUTION` op +exists; `TrackReplay` decrements as each is visited and flips to false once the workflow catches up to the +frontier. `ReplayAwareLogger` uses this to suppress log lines emitted during replay so a 30-step workflow +re-invoked 30 times logs each line once — **always use `ctx.Logger`**, never `Console.WriteLine`. +`ExecutionState` is lock-guarded because the batcher worker thread and concurrent parallel/map branches all +touch it. + +### Operations surface (`IDurableContext`) + +`StepAsync` (checkpointed code + retries), `WaitAsync` (1s–~1yr timer), `RunInChildContextAsync` (isolated +sub-workflow checkpointed as one `CONTEXT` op), `CreateCallbackAsync` / `WaitForCallbackAsync` (external +events; `WaitForCallback` is *composed* from child-context + callback + submitter step — see +`DurableContext.RunWaitForCallback`), `InvokeAsync` (durable-to-durable chained invoke, qualified ARN +required), and `ParallelAsync` / `MapAsync` (concurrent branches → `IBatchResult`). + +**Nesting (`NestingType`)** matters for parallel/map. `Nested` (default) gives each branch a full `CONTEXT` +checkpoint. `Flat` runs branches in *virtual* contexts that emit no `CONTEXT` op — inner ops re-parent to the +parallel/map op via `OperationIdGenerator.CreateVirtualChild(operationId, reportedParentId)`, trading trace +granularity for fewer checkpoints. The `idPrefix` vs `reportedParentId` split is the subtle part: inner IDs +always derive from the branch's own op ID (so siblings never collide), but are *reported* under the nearest +non-virtual ancestor (so they reference a parent that actually exists in the checkpoint store). 
+ +### Wire format (`Operation.cs`) + +`Operation` and its `*Details` types mirror the service envelope JSON exactly (`[JsonPropertyName]`). +String constants live in `OperationTypes` (STEP/WAIT/CALLBACK/CHAINED_INVOKE/CONTEXT/EXECUTION), +`OperationStatuses` (STARTED/SUCCEEDED/FAILED/PENDING/READY/CANCELLED/STOPPED/TIMED_OUT), and +`OperationSubTypes` (PascalCase finer classifier). Plural type names (`OperationTypes`, not `OperationType`) +intentionally avoid collision with `AWSSDK.Lambda` model enums. + +## Conventions + +- **Programming model:** preview supports only the *executable* model — `Main` builds a `LambdaBootstrap` + with a handler wrapper and an `ILambdaSerializer`. The serializer is read off `ILambdaContext.Serializer` + (a preview API; the project-wide `AWSLAMBDA001` suppression in the `.csproj` is intentional for that + reason). All step/result/payload (de)serialization flows through that one registered serializer, so AOT + and reflection callers share a single code path — there is no per-call `JsonSerializerContext` argument. +- **Errors:** durable exceptions carry `ErrorType`/`ErrorData`/`OriginalStackTrace` so a failure can be + reconstructed on replay when the live exception object is gone. `StepException`, `ChildContextException`, + `CallbackFailedException`/`CallbackTimeoutException`/`CallbackSubmitterException`, `ParallelException`, + `MapException`, and `NonDeterministicExecutionException` all derive from `DurableExecutionException`. + When adding error-mapping logic, handle *both* the fresh path (`InnerException` is the live exception) and + the replay path (`InnerException` is null, `ErrorType` carries the type string) — see + `DurableContext.MapWaitForCallbackException` for the pattern. +- **Public config types** (`StepConfig`, `WaitForCallbackConfig`, `ParallelConfig`, `MapConfig`, + `CompletionConfig`, etc.) are nullable optional args; resolve to an effective config inside the dispatcher. 
+- Inclusive language is enforced repo-wide (see the user's global rules): no master/slave, whitelist/blacklist. diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CallbackConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/CallbackConfig.cs new file mode 100644 index 000000000..e565ddb06 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CallbackConfig.cs @@ -0,0 +1,80 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for callback operations created via +/// . +/// +public class CallbackConfig +{ + private TimeSpan _timeout = TimeSpan.Zero; + private TimeSpan _heartbeatTimeout = TimeSpan.Zero; + + /// + /// Maximum total time the service will wait for the external system to + /// complete the callback. (default) means no + /// overall timeout — only applies (if set). + /// + /// + /// The service's timer granularity is 1 second, so values strictly between + /// and 1 second are rejected to avoid silent + /// rounding. Use to disable the timeout, or a + /// value of at least 1 second. + /// + /// + /// Thrown when set to a positive value less than 1 second. + /// + public TimeSpan Timeout + { + get => _timeout; + set + { + ValidateTimeout(value, nameof(Timeout)); + _timeout = value; + } + } + + /// + /// Maximum gap between heartbeat signals from the external system before + /// the service marks the callback as timed-out. + /// (default) means no heartbeat timeout. + /// + /// + /// The service's timer granularity is 1 second, so values strictly between + /// and 1 second are rejected to avoid silent + /// rounding. Use to disable the heartbeat + /// timeout, or a value of at least 1 second. + /// + /// + /// Thrown when set to a positive value less than 1 second. 
+ /// + public TimeSpan HeartbeatTimeout + { + get => _heartbeatTimeout; + set + { + ValidateTimeout(value, nameof(HeartbeatTimeout)); + _heartbeatTimeout = value; + } + } + + private static void ValidateTimeout(TimeSpan value, string paramName) + { + // Allow Zero (means "not set"); reject negative; reject sub-second + // positive values to mirror WaitAsync's behavior and prevent silent + // rounding-up inside BuildCallbackOptions. + if (value < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException( + paramName, value, $"{paramName} must be non-negative."); + } + if (value > TimeSpan.Zero && value < TimeSpan.FromSeconds(1)) + { + throw new ArgumentOutOfRangeException( + paramName, value, + $"{paramName} must be at least 1 second (or TimeSpan.Zero to disable)."); + } + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CallbackException.cs b/Libraries/src/Amazon.Lambda.DurableExecution/CallbackException.cs new file mode 100644 index 000000000..2d1244b2b --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CallbackException.cs @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Base exception type for callback failures surfaced from +/// +/// or +/// . +/// Concrete subclasses distinguish failure modes — pattern-match +/// , , +/// or in catch clauses. +/// +public class CallbackException : DurableExecutionException +{ + /// The callback ID associated with the failure (if known). + public string? CallbackId { get; init; } + + /// The fully-qualified type name of the original error, if known. + public string? ErrorType { get; init; } + + /// Optional structured error data attached by the external system. + public string? ErrorData { get; init; } + + /// Stack trace of the original error, captured before serialization. + public IReadOnlyList? OriginalStackTrace { get; init; } + + /// Creates an empty . 
+ public CallbackException() { } + + /// Creates a with the given message. + public CallbackException(string message) : base(message) { } + + /// Creates a wrapping an inner exception. + public CallbackException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when the external system reports a failure result for a callback +/// (via SendDurableExecutionCallbackFailure). +/// +public class CallbackFailedException : CallbackException +{ + /// Creates an empty . + public CallbackFailedException() { } + + /// Creates a with the given message. + public CallbackFailedException(string message) : base(message) { } + + /// Creates a wrapping an inner exception. + public CallbackFailedException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when the durable execution service marks a callback as timed-out — +/// either the overall or the +/// elapsed. +/// +public class CallbackTimeoutException : CallbackException +{ + /// Creates an empty . + public CallbackTimeoutException() { } + + /// Creates a with the given message. + public CallbackTimeoutException(string message) : base(message) { } + + /// Creates a wrapping an inner exception. + public CallbackTimeoutException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown only from +/// +/// when the user-supplied submitter delegate (the step that hands the callback +/// ID to the external system) fails after retries are exhausted. Wraps the +/// underlying as . +/// +public class CallbackSubmitterException : CallbackException +{ + /// Creates an empty . + public CallbackSubmitterException() { } + + /// Creates a with the given message. + public CallbackSubmitterException(string message) : base(message) { } + + /// Creates a wrapping an inner exception. 
+ public CallbackSubmitterException(string message, Exception innerException) : base(message, innerException) { } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/ChildContextConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/ChildContextConfig.cs new file mode 100644 index 000000000..c97418a6a --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/ChildContextConfig.cs @@ -0,0 +1,35 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for a child context. +/// +/// +/// A child context is a logical sub-workflow with its own deterministic +/// operation-ID space, persisted as a CONTEXT operation. Use +/// +/// (and overloads) to run code inside one. +/// +public sealed class ChildContextConfig +{ + /// + /// Operation sub-type label for observability (e.g. "WaitForCallback"). + /// Surfaces on the wire OperationUpdate.SubType field. + /// + public string? SubType { get; set; } + + /// + /// Optional function to transform exceptions thrown by the child context's + /// user function before they surface to the caller. Useful for wrapping + /// low-level errors into domain-specific exceptions. + /// + /// + /// Applied when the user function throws (the mapped exception propagates + /// to the caller of RunInChildContextAsync) and on replay of a + /// FAILED child context (the constructed + /// is mapped before being thrown). + /// + public Func? ErrorMapping { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs new file mode 100644 index 000000000..b31873f67 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs @@ -0,0 +1,111 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Defines completion criteria for parallel/map operations. 
+/// +/// +/// Construct via the static factories (, +/// , ) or set the +/// individual properties directly. Multiple criteria combine: the operation +/// resolves as soon as any criterion is met (success short-circuit) or violated +/// (failure short-circuit). +/// +public sealed class CompletionConfig +{ + private int? _minSuccessful; + private int? _toleratedFailureCount; + private double? _toleratedFailurePercentage; + + /// + /// Minimum number of items required + /// before the operation resolves successfully. null = no minimum. + /// + /// + /// Thrown by the setter if the value is less than 1. A minimum of + /// zero (or negative) would resolve the operation immediately without + /// dispatching any branch. + /// + public int? MinSuccessful + { + get => _minSuccessful; + set + { + if (value is { } v && v < 1) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "MinSuccessful must be at least 1."); + } + _minSuccessful = value; + } + } + + /// + /// Maximum tolerated count. When the + /// failure count strictly exceeds this value, the operation resolves + /// with . + /// null = no count-based failure threshold. + /// + /// + /// Thrown by the setter if the value is negative. A negative tolerance + /// would fail the operation immediately without dispatching any branch. + /// + public int? ToleratedFailureCount + { + get => _toleratedFailureCount; + set + { + if (value is { } v && v < 0) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "ToleratedFailureCount must be zero or greater."); + } + _toleratedFailureCount = value; + } + } + + /// + /// Maximum tolerated failure ratio, expressed as a value in the range + /// 0.0 to 1.0 (inclusive). For example, 0.25 means + /// "tolerate up to 25% failures; fail when the failure ratio strictly + /// exceeds 25%". null = no ratio-based failure threshold. + /// + /// + /// Thrown by the setter if the value is outside [0.0, 1.0]. + /// + public double? 
ToleratedFailurePercentage + { + get => _toleratedFailurePercentage; + set + { + if (value is { } v && (v < 0.0 || v > 1.0)) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "ToleratedFailurePercentage must be a ratio in [0.0, 1.0]."); + } + _toleratedFailurePercentage = value; + } + } + + /// + /// All items must succeed. Equivalent to + /// = 0. The default for + /// . + /// + public static CompletionConfig AllSuccessful() => new() { ToleratedFailureCount = 0 }; + + /// + /// Run every branch regardless of failures; surface failures per-item via + /// . Resolution does not auto-throw — + /// the caller can inspect the result and call + /// if they want strict-success + /// behavior. + /// + public static CompletionConfig AllCompleted() => new(); + + /// + /// Resolve once at least one branch has succeeded. Branches that were not + /// dispatched before the completion criteria was met are reported as + /// . + /// + public static CompletionConfig FirstSuccessful() => new() { MinSuccessful = 1 }; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs new file mode 100644 index 000000000..ed40a1fc8 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs @@ -0,0 +1,29 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Why a batch operation ( +/// or future Map) resolved. +/// +public enum CompletionReason +{ + /// + /// Every branch finished — no short-circuit + /// was triggered. Branches may be a mix of + /// and . + /// + AllCompleted, + + /// + /// branches succeeded; remaining + /// branches were left in . + /// + MinSuccessfulReached, + + /// + /// or + /// was exceeded. + /// The batch is considered failed and surfaces a + /// when awaited. 
+ /// + FailureToleranceExceeded +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs new file mode 100644 index 000000000..c6e1cb6f0 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs @@ -0,0 +1,13 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// A named branch for +/// . +/// Names appear in execution traces and on the wire OperationUpdate.Name +/// field, and surface on . +/// +/// The branch's result type. +/// Human-readable branch name. Required. +/// The user function executed inside the branch's +/// child context. +public sealed record DurableBranch(string Name, Func> Func); diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs new file mode 100644 index 000000000..2f5d699c5 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs @@ -0,0 +1,565 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution.Internal; +using Microsoft.Extensions.Logging; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Implementation of . Constructs and dispatches +/// per-operation classes (, ); +/// the replay logic lives in those classes. +/// +internal sealed class DurableContext : IDurableContext +{ + private readonly ExecutionState _state; + private readonly TerminationManager _terminationManager; + private readonly OperationIdGenerator _idGenerator; + private readonly string _durableExecutionArn; + private readonly CheckpointBatcher? _batcher; + private ILogger _logger; + + public DurableContext( + ExecutionState state, + TerminationManager terminationManager, + OperationIdGenerator idGenerator, + string durableExecutionArn, + ILambdaContext lambdaContext, + CheckpointBatcher? 
batcher = null) + { + _state = state; + _terminationManager = terminationManager; + _idGenerator = idGenerator; + _durableExecutionArn = durableExecutionArn; + _batcher = batcher; + LambdaContext = lambdaContext; + _logger = new ReplayAwareLogger(new LambdaCoreLogger(), state, modeAware: true); + } + + public ILogger Logger => _logger; + public IExecutionContext ExecutionContext => new DurableExecutionContext(_durableExecutionArn); + public ILambdaContext LambdaContext { get; } + + public void ConfigureLogger(LoggerConfig config) + { + if (config == null) throw new ArgumentNullException(nameof(config)); + + // If the user supplies a CustomLogger, wrap it. Otherwise re-wrap the + // existing inner logger (unwrapping if it was already a ReplayAwareLogger) + // so toggling ModeAware works without losing the previous custom logger. + var inner = config.CustomLogger + ?? (_logger is ReplayAwareLogger existing ? existing.Inner : _logger); + _logger = new ReplayAwareLogger(inner, _state, config.ModeAware); + } + + public Task StepAsync( + Func> func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default) + => RunStep(func, name, config, cancellationToken); + + public async Task StepAsync( + Func func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default) + { + // Void steps don't carry a meaningful payload — wrap with an object?-typed + // step that always returns null. The serializer isn't actually invoked + // with a non-null value, so any registered ILambdaSerializer suffices. + await RunStep( + async (ctx) => { await func(ctx); return null; }, + name, config, cancellationToken); + } + + private Task RunStep( + Func> func, + string? name, + StepConfig? 
config, + CancellationToken cancellationToken) + { + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + var op = new StepOperation( + operationId, name, _idGenerator.ParentId, func, config, serializer, Logger, + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task WaitAsync( + TimeSpan duration, + string? name = null, + CancellationToken cancellationToken = default) + { + // Service timer granularity is 1 second; sub-second waits would round to 0. + // WaitOptions.WaitSeconds is integer in [1, 31_622_400] (1 second to ~1 year). + if (duration < TimeSpan.FromSeconds(1)) + throw new ArgumentOutOfRangeException(nameof(duration), duration, "Wait duration must be at least 1 second."); + + if (duration > TimeSpan.FromSeconds(31_622_400)) + throw new ArgumentOutOfRangeException(nameof(duration), duration, "Wait duration must be at most 31,622,400 seconds (~1 year)."); + + cancellationToken.ThrowIfCancellationRequested(); + + var operationId = _idGenerator.NextId(); + var waitSeconds = (int)Math.Max(1, Math.Ceiling(duration.TotalSeconds)); + var op = new WaitOperation( + operationId, name, _idGenerator.ParentId, waitSeconds, + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task RunInChildContextAsync( + Func> func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default) + => RunChildContext(func, name, config, cancellationToken); + + public async Task RunInChildContextAsync( + Func func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default) + { + // Void child contexts don't carry a meaningful payload; the wrapper + // returns null so the registered ILambdaSerializer is never asked to + // serialize a real value. 
+ await RunChildContext( + async (ctx) => { await func(ctx); return null; }, + name, config, cancellationToken); + } + + public Task WaitForConditionAsync( + Func> check, + WaitForConditionConfig config, + string? name = null, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(check); + ArgumentNullException.ThrowIfNull(config); + ArgumentNullException.ThrowIfNull(config.WaitStrategy); + + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + var operationId = _idGenerator.NextId(); + var op = new WaitForConditionOperation( + operationId, name, _idGenerator.ParentId, check, config, serializer, Logger, + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + private Task RunChildContext( + Func> func, + string? name, + ChildContextConfig? config, + CancellationToken cancellationToken) + { + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + + var op = new ChildContextOperation( + operationId, name, _idGenerator.ParentId, func, config, serializer, MakeChildFactory(), + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task> CreateCallbackAsync( + string? name = null, + CallbackConfig? config = null, + CancellationToken cancellationToken = default) + => RunCallback(name, config, cancellationToken); + + private Task> RunCallback( + string? name, + CallbackConfig? config, + CancellationToken cancellationToken) + { + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + var op = new CallbackOperation( + operationId, name, _idGenerator.ParentId, config, serializer, + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task> ParallelAsync( + IReadOnlyList>> branches, + string? 
name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default) + => RunParallel(WrapToDurableBranches(branches), name, config, cancellationToken); + + public Task> ParallelAsync( + IReadOnlyList> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default) + => RunParallel(branches, name, config, cancellationToken); + + private static IReadOnlyList> WrapToDurableBranches( + IReadOnlyList>> branches) + { + if (branches == null) throw new ArgumentNullException(nameof(branches)); + + var result = new DurableBranch[branches.Count]; + for (var i = 0; i < branches.Count; i++) + { + var func = branches[i]; + if (func == null) + throw new ArgumentException($"Branch at index {i} is null.", nameof(branches)); + // Default name is the index — surfaces in execution traces and on + // IBatchItem.Name. Users wanting custom names use the + // DurableBranch overload. + result[i] = new DurableBranch(i.ToString(System.Globalization.CultureInfo.InvariantCulture), func); + } + return result; + } + + private Task> RunParallel( + IReadOnlyList> branches, + string? name, + ParallelConfig? config, + CancellationToken cancellationToken) + { + if (branches == null) throw new ArgumentNullException(nameof(branches)); + for (var i = 0; i < branches.Count; i++) + { + if (branches[i] == null) + throw new ArgumentException($"Branch at index {i} is null.", nameof(branches)); + if (branches[i].Func == null) + throw new ArgumentException($"Branch at index {i} has a null Func.", nameof(branches)); + } + + var effectiveConfig = config ?? new ParallelConfig(); + + var serializer = LambdaContext.Serializer + ?? throw new InvalidOperationException( + "No ILambdaSerializer is registered on ILambdaContext.Serializer. 
" + + "Register a serializer via LambdaBootstrapBuilder.Create(handler, serializer) " + + "(or in tests, set TestLambdaContext.Serializer)."); + + var operationId = _idGenerator.NextId(); + var op = new Internal.ParallelOperation( + operationId, name, _idGenerator.ParentId, branches, effectiveConfig, serializer, MakeChildFactory(), + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? config = null, + CancellationToken cancellationToken = default) + => RunMap(items, func, name, config, cancellationToken); + + private Task> RunMap( + IReadOnlyList items, + Func, Task> func, + string? name, + MapConfig? config, + CancellationToken cancellationToken) + { + if (items == null) throw new ArgumentNullException(nameof(items)); + if (func == null) throw new ArgumentNullException(nameof(func)); + + var effectiveConfig = config ?? new MapConfig(); + + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + var op = new Internal.MapOperation( + operationId, name, _idGenerator.ParentId, items, func, effectiveConfig, serializer, MakeChildFactory(), + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task WaitForCallbackAsync( + Func submitter, + string? name = null, + WaitForCallbackConfig? config = null, + CancellationToken cancellationToken = default) + => RunWaitForCallback(submitter, name, config, cancellationToken); + + /// + /// Composes WaitForCallback over RunInChildContextAsync + CreateCallbackAsync + /// + StepAsync(submitter) + callback.GetResultAsync. + /// + /// + /// Sub-operation naming follows kebab-style: "{name}-callback" and + /// "{name}-submitter". When the parent is null, + /// the inner ops are also nameless (no leading hyphen). 
+ /// + /// remaps a submitter + /// to . + /// Callback errors () pass through unchanged. + /// + /// + private Task RunWaitForCallback( + Func submitter, + string? name, + WaitForCallbackConfig? config, + CancellationToken cancellationToken) + { + var callbackName = name == null ? null : $"{name}-callback"; + var submitterName = name == null ? null : $"{name}-submitter"; + + var callbackConfig = config == null ? null : new CallbackConfig + { + Timeout = config.Timeout, + HeartbeatTimeout = config.HeartbeatTimeout, + }; + + var stepConfig = config?.RetryStrategy == null + ? null + : new StepConfig { RetryStrategy = config.RetryStrategy }; + + // Delegate to RunInChildContextAsync; the inner CreateCallbackAsync and + // StepAsync calls each pull the registered ILambdaSerializer from + // ILambdaContext.Serializer, so AOT and reflection-based scenarios share + // the same code path. + return RunInChildContextAsync( + async childCtx => + { + var callback = await childCtx.CreateCallbackAsync( + name: callbackName, + config: callbackConfig, + cancellationToken: cancellationToken); + + await childCtx.StepAsync( + async (stepCtx) => + { + var submitterCtx = new WaitForCallbackContext(stepCtx.Logger); + await submitter(callback.CallbackId, submitterCtx); + }, + name: submitterName, + config: stepConfig, + cancellationToken: cancellationToken); + + return await callback.GetResultAsync(cancellationToken); + }, + name, + new ChildContextConfig + { + SubType = OperationSubTypes.WaitForCallback, + ErrorMapping = MapWaitForCallbackException, + }, + cancellationToken); + } + + private static Exception MapWaitForCallbackException(Exception ex) + { + // Callback errors are already user-meaningful (CallbackFailed/Timeout + // from inside the callback await). Pass through. + if (ex is CallbackException) return ex; + + // The ChildContextOperation wraps thrown exceptions in + // ChildContextException; unwrap to surface the underlying cause. 
+ if (ex is ChildContextException childEx) + { + // CallbackException thrown from GetResultAsync (callback completed + // with FAILED/TIMED_OUT) — surface directly. + // + // Fresh-execution path: InnerException is the live exception object. + // Replay path: InnerException is null but ErrorType carries the string. + if (childEx.InnerException is CallbackException nestedLive) + return nestedLive; + if (IsCallbackErrorTypeString(childEx.ErrorType)) + { + // Replay-side reconstruction: preserve subclass fidelity by + // dispatching on the stored ErrorType FullName so a stored + // CallbackTimeoutException remaps to CallbackTimeoutException + // (not the more generic CallbackFailedException). + return BuildCallbackExceptionForReplay(childEx); + } + + // Submitter step exhausted retries → wrap as CallbackSubmitterException. + // Fresh path: InnerException is the live StepException. + if (childEx.InnerException is StepException stepLive) + { + return new CallbackSubmitterException(stepLive.Message, stepLive) + { + ErrorType = stepLive.ErrorType, + ErrorData = stepLive.ErrorData, + OriginalStackTrace = stepLive.OriginalStackTrace, + }; + } + // Replay path: InnerException is null; ErrorType is the type string. + if (childEx.ErrorType == typeof(StepException).FullName) + { + return new CallbackSubmitterException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + } + + // Anything else — surface unchanged so the user sees the original cause. + return ex; + } + + private static CallbackException BuildCallbackExceptionForReplay(ChildContextException childEx) + { + // Dispatch on the stored ErrorType FullName to preserve the original + // subclass across replays. Caller has already verified + // IsCallbackErrorTypeString(childEx.ErrorType) is true. 
+ if (childEx.ErrorType == typeof(CallbackTimeoutException).FullName) + { + return new CallbackTimeoutException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + if (childEx.ErrorType == typeof(CallbackSubmitterException).FullName) + { + return new CallbackSubmitterException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + if (childEx.ErrorType == typeof(CallbackException).FullName) + { + return new CallbackException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + // CallbackFailedException.FullName (or any future callback subtype not + // listed above) defaults to CallbackFailedException — the most general + // "callback failed" surface that preserves user-catchable behavior. + return new CallbackFailedException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + + private static bool IsCallbackErrorTypeString(string? errorType) => + errorType == typeof(CallbackFailedException).FullName + || errorType == typeof(CallbackTimeoutException).FullName + || errorType == typeof(CallbackSubmitterException).FullName + || errorType == typeof(CallbackException).FullName; + + public Task InvokeAsync( + string functionName, + TPayload payload, + string? name = null, + InvokeConfig? config = null, + CancellationToken cancellationToken = default) + => RunInvoke( + functionName, payload, + name, config, cancellationToken); + + private Task RunInvoke( + string functionName, + TPayload payload, + string? name, + InvokeConfig? 
config, + CancellationToken cancellationToken) + { + // Argument validation runs synchronously at the call site (matches the + // .NET convention of failing fast for misuse). Match Python/JS/Java + // parity: only check for null/empty here; the durable execution service + // enforces the qualified-ARN rule and surfaces a precise error when an + // unqualified identifier is used. + ArgumentNullException.ThrowIfNull(functionName); + if (string.IsNullOrWhiteSpace(functionName)) + throw new ArgumentException("Function name must not be empty or whitespace.", nameof(functionName)); + + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + cancellationToken.ThrowIfCancellationRequested(); + + var operationId = _idGenerator.NextId(); + var op = new InvokeOperation( + operationId, name, _idGenerator.ParentId, functionName, payload, config, + serializer, + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + /// + /// Builds the factory used by (and + /// each branch) to construct + /// the inner . The child shares state, + /// termination, batcher, ARN, and Lambda context — but uses a child + /// so its operation IDs are + /// deterministically namespaced under the parent op ID. + /// + /// + /// Builds the factory each operation uses to create the inner + /// its user function runs against. + /// + /// + /// The delegate takes (operationId, reportedParentId, isVirtual): + /// + /// isVirtual == false (the default child-context case): the + /// inner context's ID space and reported parent both root at + /// operationId via ; + /// reportedParentId is ignored. + /// isVirtual == true (a branch): + /// inner-op IDs still root at operationId (so sibling branches + /// never collide), but inner ops report reportedParentId — the + /// parallel/map operation — as their parent, since the virtual branch + /// emits no CONTEXT checkpoint to reference. 
/// <summary>
/// <see cref="IWaitForCallbackContext"/> implementation that only carries the logger it was given.
/// </summary>
internal sealed class WaitForCallbackContext : IWaitForCallbackContext
{
    /// <summary>The logger supplied at construction.</summary>
    public ILogger Logger { get; }

    public WaitForCallbackContext(ILogger logger) => Logger = logger;
}

/// <summary>
/// <see cref="IExecutionContext"/> implementation that only carries the durable execution ARN.
/// </summary>
internal sealed class DurableExecutionContext : IExecutionContext
{
    /// <summary>The ARN supplied at construction.</summary>
    public string DurableExecutionArn { get; }

    public DurableExecutionContext(string durableExecutionArn) => DurableExecutionArn = durableExecutionArn;
}

/// <summary>
/// <see cref="IStepContext"/> implementation: an immutable holder for the
/// operation ID, attempt number and logger it was constructed with.
/// </summary>
internal sealed class StepContext : IStepContext
{
    /// <summary>The operation ID supplied at construction.</summary>
    public string OperationId { get; }

    /// <summary>The attempt number supplied at construction.</summary>
    public int AttemptNumber { get; }

    /// <summary>The logger supplied at construction.</summary>
    public ILogger Logger { get; }

    public StepContext(string operationId, int attemptNumber, ILogger logger)
        => (OperationId, AttemptNumber, Logger) = (operationId, attemptNumber, logger);
}
+ public DurableExecutionException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when code has changed between invocations, causing a replay mismatch. +/// For example, a step at index 0 was previously a WAIT but is now a STEP. +/// +public class NonDeterministicExecutionException : DurableExecutionException +{ + /// Creates an empty . + public NonDeterministicExecutionException() { } + /// Creates a with the given message. + public NonDeterministicExecutionException(string message) : base(message) { } + /// Creates a wrapping an inner exception. + public NonDeterministicExecutionException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when user code inside a step fails (after retries exhausted). +/// Contains the original error details from the checkpoint. +/// +public class StepException : DurableExecutionException +{ + /// The fully-qualified type name of the original exception. + public string? ErrorType { get; init; } + /// Optional structured error data attached by the user. + public string? ErrorData { get; init; } + /// Stack trace of the original exception, captured before serialization. + public IReadOnlyList? OriginalStackTrace { get; init; } + + /// Creates an empty . + public StepException() { } + /// Creates a with the given message. + public StepException(string message) : base(message) { } + /// Creates a wrapping an inner exception. + public StepException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when a step under is +/// detected to have been interrupted mid-execution on a prior invocation +/// (replay sees a STARTED checkpoint with no terminal record). +/// +/// +/// Surfaces in so user-supplied +/// strategies can distinguish "my code threw" from "a previous attempt +/// crashed before it could record a result". 
+/// +public class StepInterruptedException : StepException +{ + /// Creates an empty . + public StepInterruptedException() { } + /// Creates a with the given message. + public StepInterruptedException(string message) : base(message) { } + /// Creates a wrapping an inner exception. + public StepInterruptedException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when a child context's user function fails. Surfaces from +/// RunInChildContextAsync; the underlying error is preserved on the +/// // +/// fields. Use to remap into a +/// domain-specific exception. +/// +public class ChildContextException : DurableExecutionException +{ + /// + /// The child context's , if any. + /// + public string? SubType { get; init; } + /// The fully-qualified type name of the original exception. + public string? ErrorType { get; init; } + /// Optional structured error data attached by the user. + public string? ErrorData { get; init; } + /// Stack trace of the original exception, captured before serialization. + public IReadOnlyList? OriginalStackTrace { get; init; } + + /// Creates an empty . + public ChildContextException() { } + /// Creates a with the given message. + public ChildContextException(string message) : base(message) { } + /// Creates a wrapping an inner exception. + public ChildContextException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when a parallel operation resolves with +/// . The aggregate +/// is preserved on so callers +/// can inspect per-branch outcomes. +/// +/// +/// This is the base type for parallel failures. Subclasses may be added in +/// future releases (for example, a dedicated +/// ParallelFailureToleranceExceededException); catching +/// remains forward-compatible. +/// +public class ParallelException : DurableExecutionException +{ + /// + /// The aggregate result of the parallel operation. 
Type-erased — cast to + /// IBatchResult<T> if the per-branch result type is known. + /// + public IBatchResult? Result { get; init; } + + /// + /// Why the parallel operation resolved. + /// + public CompletionReason CompletionReason { get; init; } + + /// Creates an empty . + public ParallelException() { } + /// Creates a with the given message. + public ParallelException(string message) : base(message) { } + /// Creates a wrapping an inner exception. + public ParallelException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when a map operation resolves with +/// . The aggregate +/// is preserved on so callers +/// can inspect per-item outcomes. +/// +/// +/// This is the base type for map failures. Subclasses may be added in future +/// releases; catching remains forward-compatible. +/// A dedicated type (rather than reusing ) lets +/// callers pattern-match which concurrent operation failed. +/// +public class MapException : DurableExecutionException +{ + /// + /// The aggregate result of the map operation. Type-erased — cast to + /// IBatchResult<T> if the per-item result type is known. + /// + public IBatchResult? Result { get; init; } + + /// + /// Why the map operation resolved. + /// + public CompletionReason CompletionReason { get; init; } + + /// Creates an empty . + public MapException() { } + /// Creates a with the given message. + public MapException(string message) : base(message) { } + /// Creates a wrapping an inner exception. + public MapException(string message, Exception innerException) : base(message, innerException) { } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionHandler.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionHandler.cs new file mode 100644 index 000000000..971ac6f64 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionHandler.cs @@ -0,0 +1,122 @@ +// Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Internal; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// The result of running a durable execution handler. +/// +internal sealed class HandlerResult +{ + public required InvocationStatus Status { get; init; } + public TResult? Result { get; init; } + public string? Message { get; init; } + public Exception? Exception { get; init; } +} + +/// +/// Core orchestration engine for durable execution. Races user code against +/// a termination signal using Task.WhenAny. When user code completes, returns +/// SUCCEEDED/FAILED. When termination wins (wait, callback, invoke), returns PENDING. +/// +internal static class DurableExecutionHandler +{ + /// + /// Runs the user's workflow function within the durable execution engine. + /// + /// + /// + /// Suspension flow — example: await ctx.WaitAsync(TimeSpan.FromSeconds(5)): + /// + /// + /// user code DurableContext TerminationMgr RunAsync + /// ───────── ────────────── ────────────── ──────── + /// WaitAsync(5s) ─────► queue WAIT START + /// checkpoint + /// Terminate() ──────► TerminationTask + /// completes + /// ◄────── new TCS().Task + /// (never completes) + /// await blocks + /// forever WhenAny: + /// ── termination wins + /// ── userTask abandoned + /// ── return Pending + /// + /// + /// Key insight: WaitAsync never returns a completed Task — it hands back + /// a TaskCompletionSource that is never resolved. The user's await blocks + /// indefinitely. The escape signal is terminationManager.Terminate(), + /// which Task.WhenAny picks up. We return Pending; the dangling user + /// Task is GC'd. The service flushes checkpoints, fires the wait timer, then + /// re-invokes Lambda — on replay, WaitAsync sees the matching SUCCEED + /// checkpoint and returns Task.CompletedTask normally. 
+ /// + /// + /// The same pattern applies to retries (RetryScheduled), callbacks + /// (CallbackPending), and chained invokes (InvokePending). + /// + /// + /// The workflow return type. + /// Hydrated execution state from prior invocations. + /// Manages the suspension signal. + /// The user's workflow function receiving a DurableContext. + /// The handler result indicating SUCCEEDED, FAILED, or PENDING. + internal static async Task> RunAsync( + ExecutionState executionState, + TerminationManager terminationManager, + Func> userHandler) + { + // Run user code on a threadpool thread so it executes independently of + // the termination signal. When TerminationManager fires (e.g., WaitAsync), + // we need the WhenAny race below to resolve immediately without waiting + // for the user task to reach an await point. + var userTask = Task.Run(userHandler); + + // Race: user code completing vs. termination signal (wait/callback/retry). + // If termination wins, we return PENDING and the abandoned userTask is never awaited. 
+ var winner = await Task.WhenAny(userTask, terminationManager.TerminationTask); + + if (winner == terminationManager.TerminationTask) + { + var terminationResult = await terminationManager.TerminationTask; + + if (terminationResult.Exception != null) + { + return new HandlerResult + { + Status = InvocationStatus.Failed, + Message = terminationResult.Exception.Message, + Exception = terminationResult.Exception + }; + } + + return new HandlerResult + { + Status = InvocationStatus.Pending, + Message = terminationResult.Message + }; + } + + try + { + var result = await userTask; + return new HandlerResult + { + Status = InvocationStatus.Succeeded, + Result = result + }; + } + catch (Exception ex) + { + return new HandlerResult + { + Status = InvocationStatus.Failed, + Message = ex.Message, + Exception = ex + }; + } + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationInput.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationInput.cs new file mode 100644 index 000000000..9c2b22b41 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationInput.cs @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// The service envelope input for a durable execution invocation. +/// This is what Lambda receives from the durable execution service. +/// +public sealed class DurableExecutionInvocationInput +{ + /// + /// The unique ARN identifying this durable execution. + /// + [JsonPropertyName("DurableExecutionArn")] + public required string DurableExecutionArn { get; set; } + + /// + /// Token for optimistic concurrency on checkpoint operations. + /// + [JsonPropertyName("CheckpointToken")] + public string? CheckpointToken { get; set; } + + /// + /// Previously checkpointed operation state for replay. 
Consumed by + /// DurableFunction.WrapAsync for replay correlation; user code + /// should not modify this on a live invocation envelope. + /// + [JsonPropertyName("InitialExecutionState")] + public InitialExecutionState? InitialExecutionState { get; set; } +} + +/// +/// The previously checkpointed execution state provided on replay invocations. +/// +public sealed class InitialExecutionState +{ + /// + /// The list of operations from prior invocations. + /// + [JsonPropertyName("Operations")] + public IReadOnlyList? Operations { get; set; } + + /// + /// If present, indicates that more operations are available. Use this value + /// with GetDurableExecutionState to fetch the next page. + /// + [JsonPropertyName("NextMarker")] + public string? NextMarker { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationOutput.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationOutput.cs new file mode 100644 index 000000000..f02e38a99 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationOutput.cs @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// The service envelope output returned by a durable execution invocation. +/// +public sealed class DurableExecutionInvocationOutput +{ + /// + /// The terminal status of this invocation. + /// + [JsonPropertyName("Status")] + [JsonConverter(typeof(UpperSnakeCaseEnumConverter))] + public required InvocationStatus Status { get; set; } + + /// + /// The serialized result (only present when Status is Succeeded). + /// + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// + /// Error details (only present when Status is Failed). + /// + [JsonPropertyName("Error")] + public ErrorObject? 
Error { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableFunction.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableFunction.cs new file mode 100644 index 000000000..cb5a7a297 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableFunction.cs @@ -0,0 +1,259 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using System.Threading; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.DurableExecution.Services; +using Amazon.Lambda.Model; +using Amazon.Runtime; +using Microsoft.Extensions.Logging; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Static helper that wraps a durable workflow function, handling all envelope +/// translation between DurableExecutionInvocationInput/Output and user types. +/// +/// All four overloads dispatch through the registered +/// on , so AOT-safe and reflection-based +/// callers share a single code path. Callers wire AOT support by registering an +/// AOT-aware serializer with the runtime +/// (e.g., SourceGeneratorLambdaJsonSerializer<TContext>) — no per-call +/// JsonSerializerContext argument is required. +/// +public static class DurableFunction +{ + private static readonly Lazy _cachedLambdaClient = + new(() => new AmazonLambdaClient(), LazyThreadSafetyMode.ExecutionAndPublication); + + /// + /// Wrap a workflow (typed input + output). + /// + public static Task WrapAsync( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext) + => WrapAsyncCore(workflow, invocationInput, lambdaContext, _cachedLambdaClient.Value); + + /// + /// Wrap a workflow (typed input + output) with explicit Lambda client. 
+ /// + public static Task WrapAsync( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient) + => WrapAsyncCore(workflow, invocationInput, lambdaContext, lambdaClient); + + /// + /// Wrap a void workflow (typed input, no output). + /// + public static Task WrapAsync( + Func workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext) + => WrapAsync(workflow, invocationInput, lambdaContext, _cachedLambdaClient.Value); + + /// + /// Wrap a void workflow with explicit Lambda client. + /// + public static Task WrapAsync( + Func workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient) + => WrapAsyncCore( + async (input, ctx) => { await workflow(input, ctx); return null; }, + invocationInput, lambdaContext, lambdaClient); + + private static async Task WrapAsyncCore( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient) + { + var serializer = LambdaSerializerHelper.GetRequired(lambdaContext); + + var state = new ExecutionState(); + state.LoadFromCheckpoint(invocationInput.InitialExecutionState); + + var serviceClient = new LambdaDurableServiceClient(lambdaClient); + var checkpointToken = invocationInput.CheckpointToken; + + var nextMarker = invocationInput.InitialExecutionState?.NextMarker; + while (!string.IsNullOrEmpty(nextMarker)) + { + var (operations, marker) = await serviceClient.GetExecutionStateAsync( + invocationInput.DurableExecutionArn, checkpointToken, nextMarker); + state.AddOperations(operations); + nextMarker = marker; + } + + var userPayload = ExtractUserPayload(invocationInput, serializer); + var terminationManager = new TerminationManager(); + var idGenerator = new OperationIdGenerator(); + + await using var batcher = new CheckpointBatcher( + checkpointToken, + (token, ops, ct) => 
serviceClient.CheckpointAsync( + invocationInput.DurableExecutionArn, token, ops, + // The service stamps a freshly-allocated CallbackId onto a started + // CALLBACK op (and may emit terminal-state callbacks/timers); merge + // those back into ExecutionState so the next ExecuteAsync sees them. + onNewOperations: state.AddOperations, + cancellationToken: ct)); + + var context = new DurableContext( + state, terminationManager, idGenerator, + invocationInput.DurableExecutionArn, lambdaContext, batcher); + + HandlerResult result; + try + { + // Push execution-level metadata into a logging scope so structured + // providers (the runtime's JSON formatter, Serilog, Powertools, + // etc.) tag every log line emitted by user code with the + // execution ARN and request id. + using (context.Logger.BeginScope(new Dictionary + { + ["durableExecutionArn"] = invocationInput.DurableExecutionArn, + ["awsRequestId"] = lambdaContext.AwsRequestId ?? string.Empty, + })) + { + result = await DurableExecutionHandler.RunAsync( + state, terminationManager, + async () => await workflow(userPayload, context)); + } + + await batcher.DrainAsync(); + } + catch (DurableExecutionException ex) when (ex.InnerException is AmazonServiceException sdkEx && IsTerminalCheckpointError(sdkEx)) + { + return new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Failed, + Error = ErrorObject.FromException(ex) + }; + } + + return MapToOutput(result, serializer); + } + + /// + /// Returns true for checkpoint-flush SDK errors that should fail the workflow + /// (Failed envelope) instead of escaping to the host (Lambda retry). The catch + /// site unwraps a first because + /// wraps every SDK error so + /// user logs show durable-execution context — this method then classifies the + /// inner . + /// + /// + /// Classification rule: + /// - 4xx (except 429) → terminal: permanent caller-side failure (missing ARN/KMS key, + /// IAM denial, validation). Retrying will not fix it, so return Failed. 
+ /// - 429 / 5xx / no status (network or SDK-internal) → not terminal: transient, + /// allow the exception to escape so Lambda retries the invocation. + /// - Carve-out: InvalidParameterValueException with a message starting with + /// "Invalid Checkpoint Token" is treated as transient — the service rejects a + /// stale token but a retry with a fresh token will succeed. + /// + /// Only checkpoint-flush errors flow through this catch. There are two paths: + /// 1. A flush triggered synchronously from inside a user StepAsync call + /// (the user awaits EnqueueAsync → batch flush → SDK throws → service client + /// wraps). + /// 2. The final after the workflow returns. + /// + /// State-hydration errors (GetExecutionStateAsync) propagate as + /// too, but they are NOT caught here — they + /// flow up to the host so Lambda retries. + /// + /// User-code SDK errors (e.g. an SDK call inside a Step body) are caught by + /// StepRunner and surfaced as StepException for the workflow's normal + /// step-failure handling. + /// + private static bool IsTerminalCheckpointError(AmazonServiceException ex) + { + var status = (int)ex.StatusCode; + if (status < 400 || status >= 500 || status == 429) + return false; + + if (ex.ErrorCode == "InvalidParameterValueException" + && ex.Message != null + && ex.Message.StartsWith("Invalid Checkpoint Token", StringComparison.Ordinal)) + { + return false; + } + + return true; + } + + // The user's input payload is stored inside the service envelope as an EXECUTION-type + // operation. This is part of the durable execution wire format — each invocation includes + // its input as a checkpoint record so the service can validate replay consistency. + // A missing EXECUTION op is a malformed envelope: surfacing it as a typed exception here + // gives a clear error instead of letting default!/null bubble into user code as an opaque + // NullReferenceException. 
+ private static TInput ExtractUserPayload( + DurableExecutionInvocationInput input, + ILambdaSerializer serializer) + { + if (input.InitialExecutionState?.Operations != null) + { + foreach (var op in input.InitialExecutionState.Operations) + { + if (op.Type != OperationTypes.Execution || op.ExecutionDetails?.InputPayload == null) + continue; + + var payload = op.ExecutionDetails.InputPayload; + var bytes = Encoding.UTF8.GetBytes(payload); + using var ms = new MemoryStream(bytes); + return serializer.Deserialize(ms); + } + } + + throw new DurableExecutionException( + "Durable execution envelope is malformed: no EXECUTION-type operation with an input payload was found. " + + "The service must include an EXECUTION op carrying the workflow's input on every invocation."); + } + + private static DurableExecutionInvocationOutput MapToOutput( + HandlerResult result, + ILambdaSerializer serializer) + { + return result.Status switch + { + InvocationStatus.Succeeded => new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Succeeded, + Result = SerializeOutput(result.Result, serializer) + }, + InvocationStatus.Failed => new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Failed, + Error = result.Exception != null + ? ErrorObject.FromException(result.Exception) + : new ErrorObject { ErrorMessage = result.Message } + }, + // Pending = workflow suspended (wait/retry/callback). No Result or Error — + // the service will re-invoke with accumulated checkpoints when ready. + InvocationStatus.Pending => new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Pending + }, + _ => throw new InvalidOperationException($"Unexpected status: {result.Status}") + }; + } + + private static string? SerializeOutput(TOutput? 
value, ILambdaSerializer serializer) + { + if (value == null) return null; + + using var ms = new MemoryStream(); + serializer.Serialize(value, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Enums.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Enums.cs new file mode 100644 index 000000000..7b8c02402 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Enums.cs @@ -0,0 +1,17 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// The terminal status of a durable execution invocation. +/// +public enum InvocationStatus +{ + /// The workflow completed successfully. + Succeeded, + /// The workflow failed with an unhandled exception. + Failed, + /// The workflow suspended (waiting for time, callback, or invocation). + Pending +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/ErrorObject.cs b/Libraries/src/Amazon.Lambda.DurableExecution/ErrorObject.cs new file mode 100644 index 000000000..88618f2cb --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/ErrorObject.cs @@ -0,0 +1,91 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Serializable error representation stored in checkpoint state. +/// +public sealed class ErrorObject +{ + /// + /// The fully-qualified exception type name. + /// + [JsonPropertyName("ErrorType")] + public string? ErrorType { get; set; } + + /// + /// The exception message. + /// + [JsonPropertyName("ErrorMessage")] + public string? ErrorMessage { get; set; } + + /// + /// Stack trace frames. + /// + [JsonPropertyName("StackTrace")] + public IReadOnlyList? StackTrace { get; set; } + + /// + /// Additional serialized error data. 
+ /// + [JsonPropertyName("ErrorData")] + public string? ErrorData { get; set; } + + /// + /// Creates an ErrorObject from an exception. + /// + /// + /// SDK operation wrappers (, + /// , , + /// ) unwrap to the original error captured + /// from the failed operation — preserving the user-visible + /// ErrorType/ErrorData/StackTrace instead of recording + /// the wrapper's type. This way a chained invoker sees the originating + /// exception (e.g. System.InvalidOperationException) rather than + /// Amazon.Lambda.DurableExecution.StepException. Mirrors the Java + /// SDK's DurableExecutor.buildErrorObject behavior. + /// + public static ErrorObject FromException(Exception exception) + { + return exception switch + { + StepException step => new ErrorObject + { + ErrorType = step.ErrorType, + ErrorMessage = step.Message, + StackTrace = step.OriginalStackTrace, + ErrorData = step.ErrorData + }, + ChildContextException child => new ErrorObject + { + ErrorType = child.ErrorType, + ErrorMessage = child.Message, + StackTrace = child.OriginalStackTrace, + ErrorData = child.ErrorData + }, + InvokeException invoke => new ErrorObject + { + ErrorType = invoke.ErrorType, + ErrorMessage = invoke.Message, + StackTrace = invoke.OriginalStackTrace, + ErrorData = invoke.ErrorData + }, + CallbackException callback => new ErrorObject + { + ErrorType = callback.ErrorType, + ErrorMessage = callback.Message, + StackTrace = callback.OriginalStackTrace, + ErrorData = callback.ErrorData + }, + _ => new ErrorObject + { + ErrorType = exception.GetType().FullName, + ErrorMessage = exception.Message, + StackTrace = exception.StackTrace?.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries) + } + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs new file mode 100644 index 000000000..62814fd62 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs @@ -0,0 +1,38 @@ 
+namespace Amazon.Lambda.DurableExecution; + +/// +/// One item inside an — the outcome of a single +/// branch (parallel) or item (map). +/// +/// The branch/item result type. +public interface IBatchItem +{ + /// + /// Zero-based position in the original branches/items list. Stable across + /// replays. + /// + int Index { get; } + + /// + /// Optional human-readable name for this branch/item. + /// Surfaces on the wire OperationUpdate.Name field for observability. + /// + string? Name { get; } + + /// + /// Status of this item at the moment the batch resolved. + /// + BatchItemStatus Status { get; } + + /// + /// The branch/item result. Populated only when is + /// . + /// + T? Result { get; } + + /// + /// The branch/item failure. Populated only when is + /// . + /// + DurableExecutionException? Error { get; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs new file mode 100644 index 000000000..90d7e14b7 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs @@ -0,0 +1,90 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Non-generic marker for . Used by +/// so callers can hold a reference to +/// the aggregate result without knowing the per-branch type at compile time. +/// +public interface IBatchResult +{ + /// + /// Why the batch resolved. + /// + CompletionReason CompletionReason { get; } + + /// True if any item is in . + bool HasFailure { get; } + + /// Number of items in . + int SuccessCount { get; } + + /// Number of items in . + int FailureCount { get; } + + /// Number of items in . + int StartedCount { get; } + + /// Total number of items. + int TotalCount { get; } +} + +/// +/// Result of a parallel or map operation. Aggregates the per-branch +/// outcomes, completion bookkeeping, and convenience accessors. +/// +/// The per-branch/per-item result type.
+/// +/// The result is reconstructed from per-branch checkpoints — the aggregate is +/// never serialized as a single blob in user T. Per-branch results live on +/// ParallelBranch child-context checkpoints; this type assembles them. +/// +public interface IBatchResult : IBatchResult +{ + /// + /// All items, in original index order. + /// + IReadOnlyList> All { get; } + + /// + /// Items whose is + /// , in original index order. + /// + IReadOnlyList> Succeeded { get; } + + /// + /// Items whose is + /// , in original index order. + /// + IReadOnlyList> Failed { get; } + + /// + /// Items that were not dispatched when the batch resolved (a + /// short-circuit fired before they were started), + /// in original index order. + /// + IReadOnlyList> Started { get; } + + /// + /// Returns the results of every successful item, in original index order. + /// + /// + /// Items in or are skipped — this + /// method never throws on partial-failure batches. Use + /// if you want a strict-success accessor. + /// + IReadOnlyList GetResults(); + + /// + /// Returns the errors for every failed item, in original index order. + /// + IReadOnlyList GetErrors(); + + /// + /// Throws the first failed item's if any + /// item failed; no-op otherwise. + /// + /// + /// The first failed item's error. + /// + void ThrowIfError(); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/ICallback.cs b/Libraries/src/Amazon.Lambda.DurableExecution/ICallback.cs new file mode 100644 index 000000000..a6484a480 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/ICallback.cs @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// A pending callback created by +/// . +/// Hands back a for external systems to use, plus a +/// hook that +/// suspends the workflow until the external system completes the callback. +/// +/// The callback result type. 
+public interface ICallback +{ + /// + /// The callback ID generated by the durable execution service. External + /// systems pass this ID to SendDurableExecutionCallbackSuccess / + /// SendDurableExecutionCallbackFailure / + /// SendDurableExecutionCallbackHeartbeat to deliver a result. + /// + string CallbackId { get; } + + /// + /// Suspends the workflow until the callback is completed, then returns the + /// deserialized result. + /// + /// + /// On the first invocation that reaches this call, the workflow suspends + /// (Lambda terminates). When the external system completes the callback + /// the service re-invokes Lambda; this call then returns the cached result + /// without re-executing user code. + /// + /// + /// Thrown when the external system reported a failure result. + /// + /// + /// Thrown when the service timed out the callback (overall timeout or + /// heartbeat timeout elapsed). + /// + Task GetResultAsync(CancellationToken cancellationToken = default); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IConditionCheckContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IConditionCheckContext.cs new file mode 100644 index 000000000..cd1d605b6 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IConditionCheckContext.cs @@ -0,0 +1,26 @@ +using Microsoft.Extensions.Logging; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Context passed to a WaitForConditionAsync check function on every +/// polling iteration. Provides a logger scoped to the current attempt and the +/// 1-based attempt number, mirroring the surface of +/// (minus OperationId: every iteration of a +/// wait-for-condition operation shares the same operation ID, so exposing it +/// here would be misleading — see DESIGN-QUESTIONS.md#Q6). +/// +public interface IConditionCheckContext +{ + /// + /// Logger scoped to this condition-check attempt. + /// + ILogger Logger { get; } + + /// + /// The current 1-based attempt number. 
Increments on every polling + /// iteration; on replay, equals the number of attempts already + /// checkpointed plus one. + /// + int AttemptNumber { get; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs new file mode 100644 index 000000000..2f25b553f --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs @@ -0,0 +1,477 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Microsoft.Extensions.Logging; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// The primary interface for durable execution operations. +/// Passed to user workflow functions to access checkpointed steps, waits, +/// child contexts, callbacks, function invocations, condition polling, and +/// parallel/map operations. +/// +public interface IDurableContext +{ + /// + /// Replay-safe logger. Messages emitted while the workflow is re-deriving + /// prior operations from checkpointed state are suppressed by default, so + /// a 30-step workflow re-invoked 30 times still emits each line once. + /// Use this instead of Console.WriteLine or other ambient loggers, + /// which will repeat on every replay. Replace the underlying logger or + /// disable replay-aware filtering via . + /// + ILogger Logger { get; } + + /// + /// Swap the underlying logger or toggle replay-aware filtering. Idempotent — + /// later calls overwrite earlier configuration. + /// + /// + /// The logger configuration specifying the underlying logger and whether + /// replay-aware filtering is enabled. + /// + void ConfigureLogger(LoggerConfig config); + + /// + /// Metadata about the current durable execution. + /// + IExecutionContext ExecutionContext { get; } + + /// + /// The underlying Lambda context. + /// + ILambdaContext LambdaContext { get; } + + /// + /// Execute a step with automatic checkpointing.
The step result is serialized + /// to a checkpoint using the registered on + /// . AOT and reflection-based scenarios + /// share this single overload — the AOT story is determined by the registered + /// serializer (e.g., SourceGeneratorLambdaJsonSerializer<TContext>). + /// + /// The type of the step's result. + /// + /// The step body to execute. Receives an exposing + /// the step's logger, attempt number, and operation ID. + /// + /// + /// An optional name for the step, used for observability and to derive the + /// deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// + /// Optional step configuration (e.g. retry policy). Defaults are used when null. + /// + /// A token to observe for cancellation. + /// The deserialized result of the step. + Task StepAsync( + Func> func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute a step that returns no value. + /// + /// + /// The step body to execute. Receives an exposing + /// the step's logger, attempt number, and operation ID. + /// + /// + /// An optional name for the step, used for observability and to derive the + /// deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// + /// Optional step configuration (e.g. retry policy). Defaults are used when null. + /// + /// A token to observe for cancellation. + Task StepAsync( + Func func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Suspend execution for the specified duration without consuming compute time. + /// The Lambda is suspended and the service re-invokes it after the wait elapses. + /// Duration must be at least 1 second (service timer granularity). + /// + /// + /// How long to suspend execution. Must be at least 1 second. + /// + /// + /// An optional name for the wait, used for observability and to derive the + /// deterministic operation ID. 
Defaults to a name inferred from the call site. + /// + /// A token to observe for cancellation. + Task WaitAsync( + TimeSpan duration, + string? name = null, + CancellationToken cancellationToken = default); + + /// + /// Run a user function inside a logical sub-workflow (a "child context"). + /// The child has its own deterministic operation-ID space; its result is + /// checkpointed as a CONTEXT operation so subsequent invocations + /// replay the cached value without re-executing the func. + /// + /// + /// Use child contexts to group related durable operations (e.g. a step plus + /// a wait plus a step) into a single observability/error-handling boundary. + /// On failure, surfaces as ; supply + /// to remap into a + /// domain-specific exception. + /// The child context's return value is serialized to a checkpoint using the + /// registered on + /// . + /// + /// The type of the child context's result. + /// + /// The user function to run inside the child context. Receives a nested + /// with its own deterministic operation-ID space. + /// + /// + /// An optional name for the child context, used for observability and to derive + /// the deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// + /// Optional child context configuration (e.g. + /// ). Defaults are used when null. + /// + /// A token to observe for cancellation. + /// The deserialized result of the child context. + Task RunInChildContextAsync( + Func> func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Run a user function inside a logical sub-workflow (a "child context") + /// that returns no value. The child has its own deterministic operation-ID + /// space and is checkpointed as a CONTEXT operation so subsequent + /// invocations skip re-executing the func. + /// + /// + /// Use child contexts to group related durable operations (e.g. 
a step plus + /// a wait plus a step) into a single observability/error-handling boundary. + /// On failure, surfaces as ; supply + /// to remap into a + /// domain-specific exception. + /// + /// + /// The user function to run inside the child context. Receives a nested + /// with its own deterministic operation-ID space. + /// + /// + /// An optional name for the child context, used for observability and to derive + /// the deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// + /// Optional child context configuration (e.g. + /// ). Defaults are used when null. + /// + /// A token to observe for cancellation. + Task RunInChildContextAsync( + Func func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Create a callback for an external system to complete. Returns an + /// handle exposing the service-allocated + /// (pass to the external system) and + /// + /// (await to suspend until a result arrives). + /// + /// + /// The callback result is deserialized using the + /// registered on . AOT and reflection-based + /// scenarios share this single overload — the AOT story is determined by the + /// registered serializer (e.g., + /// SourceGeneratorLambdaJsonSerializer<TContext>). + /// + /// Errors are deferred to ; + /// CreateCallbackAsync always returns successfully so user code + /// between CreateCallbackAsync and the result-await runs deterministically + /// across replays. + /// + /// + /// The type of the result the callback will deliver. + /// + /// An optional name for the callback, used for observability and to derive the + /// deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// + /// Optional callback configuration (e.g. timeout). Defaults are used when null. + /// + /// A token to observe for cancellation. + /// + /// An handle exposing the service-allocated callback + /// ID and a method to await the result. 
+ /// + Task> CreateCallbackAsync( + string? name = null, + CallbackConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Composite operation that creates a callback, runs the supplied submitter + /// (which hands the callbackId to an external system), and suspends + /// until the external system delivers a result. Equivalent to manually + /// composing + /// + + /// + + /// inside a child context. + /// + /// + /// Submitter failures (after retries are exhausted) surface as + /// . Callback failures and timeouts + /// surface as / + /// . + /// + /// The type of the result the callback will deliver. + /// + /// A function that hands the service-allocated callbackId to the external + /// system. Receives the callback ID and an . + /// + /// + /// An optional name for the operation, used for observability and to derive the + /// deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// + /// Optional configuration (e.g. submitter retry policy and callback timeout). + /// Defaults are used when null. + /// + /// A token to observe for cancellation. + /// The deserialized result delivered by the external system. + Task WaitForCallbackAsync( + Func submitter, + string? name = null, + WaitForCallbackConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Invoke another durable Lambda function and await its result. The + /// invocation is checkpointed so it survives parent failures and is not + /// double-fired on replay. The payload and result are serialized to/from + /// a checkpoint using the registered on + /// . + /// + /// + /// must be a qualified identifier (version, + /// alias, or $LATEST); unqualified ARNs are rejected by the durable + /// execution service. + /// + /// The type of the payload sent to the target function. + /// The type of the result returned by the target function. 
+ /// + /// The qualified identifier (version, alias, or $LATEST) of the durable + /// Lambda function to invoke. Unqualified ARNs are rejected. + /// + /// The payload to pass to the target function. + /// + /// An optional name for the invocation, used for observability and to derive the + /// deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// + /// Optional invocation configuration (e.g. retry policy). Defaults are used when null. + /// + /// A token to observe for cancellation. + /// The deserialized result returned by the target function. + Task InvokeAsync( + string functionName, + TPayload payload, + string? name = null, + InvokeConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Poll a condition by repeatedly invoking until + /// the configured decides to stop. + /// Between polls the workflow is suspended (no compute charge); the + /// service re-invokes the Lambda when the strategy's chosen delay elapses. + /// + /// + /// On every iteration the function receives the + /// state returned by the previous invocation (seeded by + /// on the very + /// first call), so users can carry per-poll bookkeeping (e.g. a cursor or + /// retry counter) inside the state itself. If the strategy stops because + /// of 's max-attempts limit (rather + /// than because the condition is met), a + /// is thrown carrying the last observed state. + /// The check function's return value is serialized to a checkpoint using + /// the registered on + /// . AOT and reflection-based + /// scenarios share this single overload — the AOT story is determined by + /// the registered serializer (e.g., + /// SourceGeneratorLambdaJsonSerializer<TContext>). + /// + /// + /// The type of the per-poll state carried between condition checks. + /// + /// + /// The condition check invoked on each poll. 
Receives the state returned by the + /// previous invocation (seeded by + /// on the first call) + /// and an , and returns the next state. + /// + /// + /// The configuration controlling polling, including the + /// and the initial state. + /// + /// + /// An optional name for the operation, used for observability and to derive the + /// deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// A token to observe for cancellation. + /// The final state observed when the strategy decides to stop. + Task WaitForConditionAsync( + Func> check, + WaitForConditionConfig config, + string? name = null, + CancellationToken cancellationToken = default); + + /// + /// Execute multiple branches concurrently. Each branch runs inside its own + /// child context; per-branch results are aggregated into an + /// . Branches are dispatched up to + /// ; the aggregate resolves + /// according to . + /// + /// + /// On per-branch failure (a branch's user function throws), the failure is + /// captured on the corresponding instead of + /// aborting the parallel. The parallel only throws + /// when + /// criteria are violated. Use + /// for explicit strict-success + /// semantics. Per-branch results are serialized to checkpoints using the + /// registered on + /// (typically configured via + /// LambdaBootstrapBuilder.Create(handler, serializer)). + /// + /// The type of the result produced by each branch. + /// + /// The branches to execute concurrently. Each branch receives its own + /// and returns a result of type . + /// + /// + /// An optional name for the parallel operation, used for observability and to derive + /// the deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// + /// Optional parallel configuration (e.g. + /// and ). Defaults are used when null. + /// + /// A token to observe for cancellation. + /// + /// An aggregating the per-branch results, resolved + /// according to . 
+ /// + Task> ParallelAsync( + IReadOnlyList>> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute multiple named branches concurrently. Names appear in execution + /// traces and on . + /// + /// + /// Per-branch results are serialized to checkpoints using the + /// registered on + /// . + /// + /// The type of the result produced by each branch. + /// + /// The named branches to execute concurrently. Each + /// carries a name (surfaced on ) and the function to run. + /// + /// + /// An optional name for the parallel operation, used for observability and to derive + /// the deterministic operation ID. Defaults to a name inferred from the call site. + /// + /// + /// Optional parallel configuration (e.g. + /// and ). Defaults are used when null. + /// + /// A token to observe for cancellation. + /// + /// An aggregating the per-branch results, resolved + /// according to . + /// + Task> ParallelAsync( + IReadOnlyList> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Process a collection of items concurrently, running + /// once per item. Each item runs inside its own child context; per-item + /// results are aggregated into an . Items + /// are dispatched up to ; the aggregate + /// resolves according to . + /// + /// + /// The per-item function receives the durable context, the item, its + /// zero-based index, and the full source list (matching the Python and + /// JavaScript SDKs). On per-item failure (the user function throws), the + /// failure is captured on the corresponding + /// instead of aborting the map. By default + /// () every item runs and failures + /// surface via ; the map throws + /// only when + /// criteria are violated. Use + /// for explicit + /// strict-success semantics. Per-item results are serialized to checkpoints + /// using the registered on + /// . 
+ /// + Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? config = null, + CancellationToken cancellationToken = default); +} + +/// +/// Context passed to step functions. +/// +public interface IStepContext +{ + /// + /// Logger scoped to this step. Same instance as + /// ; emits within an + /// that carries the step's + /// operationId, operationName, and attempt. + /// + ILogger Logger { get; } + + /// + /// The current retry attempt number (1-based). + /// + int AttemptNumber { get; } + + /// + /// The deterministic operation ID for this step. + /// + string OperationId { get; } +} + +/// +/// Metadata about the current execution. +/// +public interface IExecutionContext +{ + /// + /// The ARN of the current durable execution. + /// + string DurableExecutionArn { get; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IRetryStrategy.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IRetryStrategy.cs new file mode 100644 index 000000000..d871ebb98 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IRetryStrategy.cs @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Determines whether a failed step should be retried and with what delay. +/// +public interface IRetryStrategy +{ + /// + /// Evaluates whether the given exception warrants a retry. + /// + /// The exception that caused the step to fail. + /// The 1-based attempt number that just failed. + /// A decision indicating whether to retry and the delay before the next attempt. + RetryDecision ShouldRetry(Exception exception, int attemptNumber); +} + +/// +/// The outcome of a retry evaluation. +/// +public readonly struct RetryDecision +{ + /// Whether the step should be retried. + public bool ShouldRetry { get; } + + /// The delay before the next retry attempt. 
+ public TimeSpan Delay { get; } + + private RetryDecision(bool shouldRetry, TimeSpan delay) + { + ShouldRetry = shouldRetry; + Delay = delay; + } + + /// Indicates the step should not be retried. + public static RetryDecision DoNotRetry() => new(false, TimeSpan.Zero); + + /// Indicates the step should be retried after the specified delay. + public static RetryDecision RetryAfter(TimeSpan delay) => new(true, delay); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IWaitForCallbackContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IWaitForCallbackContext.cs new file mode 100644 index 000000000..866fb3bab --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IWaitForCallbackContext.cs @@ -0,0 +1,23 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Microsoft.Extensions.Logging; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Context passed to the submitter delegate of +/// . +/// Provides a replay-safe logger scoped to the submitter step. +/// +/// +/// Distinct from so the submitter API can evolve +/// independently. Logger-only surface. +/// +public interface IWaitForCallbackContext +{ + /// + /// Logger scoped to the submitter step. + /// + ILogger Logger { get; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IWaitStrategy.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IWaitStrategy.cs new file mode 100644 index 000000000..7ca26964a --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IWaitStrategy.cs @@ -0,0 +1,27 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Decides, per polling iteration, whether a WaitForConditionAsync +/// operation should keep polling and how long to wait before the next attempt. +/// +/// +/// Distinct from : that interface decides +/// retry-on-exception (input is the thrown ); this one +/// decides poll-until-condition (input is the latest +/// observed by the check function). 
Implementations are typically obtained +/// via the factory; users who need richer logic +/// (e.g. wall-clock-time budgets, conditional jitter) can implement this +/// interface directly. +/// +/// The state type produced by the check function. +public interface IWaitStrategy +{ + /// + /// Evaluates the latest from the check function + /// and the 1-based just executed, and + /// returns either (terminate) or + /// (poll again after + /// the given delay). + /// + WaitDecision Decide(TState state, int attemptNumber); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs new file mode 100644 index 000000000..5c9dda77c --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs @@ -0,0 +1,15 @@ +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Default implementation produced by +/// when assembling the +/// . +/// +internal sealed class BatchItem : IBatchItem +{ + public required int Index { get; init; } + public required string? Name { get; init; } + public required BatchItemStatus Status { get; init; } + public T? Result { get; init; } + public DurableExecutionException? Error { get; init; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs new file mode 100644 index 000000000..db97f02c1 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs @@ -0,0 +1,16 @@ +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// AOT-friendly for the internal +/// payload stored on a concurrent operation's parent +/// CONTEXT checkpoint (parallel or map). Only this internal type — never user T — +/// flows through here, so the source-generated metadata is sufficient. 
+/// +[JsonSerializable(typeof(BatchSummary))] +[JsonSerializable(typeof(BatchUnitSummary))] +[JsonSerializable(typeof(ErrorObject))] +internal sealed partial class BatchJsonContext : JsonSerializerContext +{ +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs new file mode 100644 index 000000000..362303a0e --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs @@ -0,0 +1,80 @@ +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Default implementation. Computes derived views +/// ( / / ) +/// eagerly so consumers don't pay for re-filtering on every access. +/// +internal sealed class BatchResult : IBatchResult +{ + public BatchResult(IReadOnlyList> all, CompletionReason completionReason) + { + All = all; + CompletionReason = completionReason; + + var succeeded = new List>(); + var failed = new List>(); + var started = new List>(); + + foreach (var item in all) + { + switch (item.Status) + { + case BatchItemStatus.Succeeded: succeeded.Add(item); break; + case BatchItemStatus.Failed: failed.Add(item); break; + case BatchItemStatus.Started: started.Add(item); break; + } + } + + Succeeded = succeeded; + Failed = failed; + Started = started; + } + + public IReadOnlyList> All { get; } + public IReadOnlyList> Succeeded { get; } + public IReadOnlyList> Failed { get; } + public IReadOnlyList> Started { get; } + public CompletionReason CompletionReason { get; } + + public bool HasFailure => Failed.Count > 0; + + public int SuccessCount => Succeeded.Count; + public int FailureCount => Failed.Count; + public int StartedCount => Started.Count; + public int TotalCount => All.Count; + + public IReadOnlyList GetResults() + { + var list = new List(Succeeded.Count); + foreach (var item in Succeeded) + { + // Result is non-null on success items by construction; the BCL-typed + // index is preserved by walking Succeeded (already in 
original order). + list.Add(item.Result!); + } + return list; + } + + public IReadOnlyList GetErrors() + { + var list = new List(Failed.Count); + foreach (var item in Failed) + { + // Error is non-null on failure items by construction. + list.Add(item.Error!); + } + return list; + } + + public void ThrowIfError() + { + foreach (var item in All) + { + if (item.Status == BatchItemStatus.Failed && item.Error != null) + { + throw item.Error; + } + } + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs new file mode 100644 index 000000000..b118ce558 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs @@ -0,0 +1,56 @@ +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Internal payload shape stored on a concurrent operation's parent CONTEXT +/// checkpoint (as ContextDetails.Result) and reconstructed on replay. +/// Shared by both and +/// : carries the completion reason and +/// the per-unit index → status map so the can be +/// rebuilt without depending on user T shape. +/// +/// +/// Under per-unit results live on the children's +/// own CONTEXT checkpoints and only (plus +/// index/name) is recorded here. Under the +/// children emit no checkpoint, so each unit's serialized result +/// () or error +/// () is recorded inline here and read back +/// on replay. +/// +internal sealed class BatchSummary +{ + [JsonPropertyName("CompletionReason")] + public string? CompletionReason { get; set; } + + [JsonPropertyName("Units")] + public IList Units { get; set; } = new List(); +} + +internal sealed class BatchUnitSummary +{ + [JsonPropertyName("Index")] + public int Index { get; set; } + + [JsonPropertyName("Name")] + public string? Name { get; set; } + + [JsonPropertyName("Status")] + public string? 
Status { get; set; } + + /// + /// Serialized per-unit result, recorded inline only for + /// succeeded units (where no child checkpoint + /// exists to read it from). null under . + /// + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// + /// Per-unit error, recorded inline only for + /// failed units. null under . + /// + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CallbackOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CallbackOperation.cs new file mode 100644 index 000000000..16b06480c --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CallbackOperation.cs @@ -0,0 +1,272 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using SdkCallbackOptions = Amazon.Lambda.Model.CallbackOptions; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable callback operation. Sync-flushes a CALLBACK START checkpoint +/// (the service stamps a freshly-allocated CallbackId onto the response, +/// which the batcher merges back into ), then hands +/// the user an they can later +/// +/// to suspend on. +/// +/// +/// Replay branches — example: +/// +/// var cb = await ctx.CreateCallbackAsync<ApprovalResult>(name: "approval"); +/// // ... external system told to use cb.CallbackId ... +/// var result = await cb.GetResultAsync(); +/// +/// +/// Fresh: no prior state → sync-flush CALLBACK START; +/// the service responds with a CallbackId (merged into state by the +/// batcher); construct the and return it. +/// then suspends. +/// STARTED: a CallbackId is already on the checkpoint; reuse it. +/// suspends (the external system hasn't +/// responded yet) — service re-invokes once it does. 
+/// SUCCEEDED / FAILED / TIMED_OUT: terminal — construct the +/// with the cached state and return. +/// immediately deserializes / throws. +/// +/// CRITICAL: CreateCallbackAsync always succeeds — it returns the +/// handle regardless of terminal state. Errors are +/// deferred to +/// so user code between CreateCallbackAsync and the result-await runs +/// deterministically across replays. +/// +/// LIFETIME: the handle returned to user code IS the operation object, so it +/// transitively roots , , +/// and . This is invocation-scoped by design — +/// do not store an across invocations (e.g. in a +/// static field on a warm Lambda container). The batcher is disposed when the +/// workflow returns and the captured state belongs to that invocation only; +/// re-using the handle later will read disposed/stale machinery. +/// +/// Serialization is delegated to the registered on +/// . AOT-safe and reflection-based callers +/// share the same code path: the AOT story is determined entirely by the serializer +/// the user registered with the runtime (e.g., +/// SourceGeneratorLambdaJsonSerializer<TContext>). +/// +internal sealed class CallbackOperation : DurableOperation>, ICallback +{ + private readonly CallbackConfig? _config; + private readonly ILambdaSerializer _serializer; + + private string? _callbackId; + + public CallbackOperation( + string operationId, + string? name, + string? parentId, + CallbackConfig? config, + ILambdaSerializer serializer, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _config = config; + _serializer = serializer; + } + + protected override string OperationType => OperationTypes.Callback; + + /// + /// Set when an existing terminal-state checkpoint was observed during + /// dispatch. 
reads this directly to short- + /// circuit deserialization (or throw the recorded error) without suspending. + /// + private Operation? _terminalReplay; + + /// + public string CallbackId => _callbackId + ?? throw new InvalidOperationException( + "CallbackId is unavailable. Ensure CreateCallbackAsync has completed before reading CallbackId."); + + protected override async Task> StartAsync(CancellationToken cancellationToken) + { + // Sync-flush the START so the service can allocate a CallbackId for us. + // The batcher's onNewOperations hook merges the service's response into + // ExecutionState, so reading state.GetOperation(OperationId) right after + // the await sees the populated CallbackDetails. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + Type = OperationTypes.Callback, + Action = OperationAction.START, + SubType = OperationSubTypes.Callback, + Name = Name, + CallbackOptions = BuildCallbackOptions() + }, cancellationToken); + + var stamped = State.GetOperation(OperationId); + var callbackId = stamped?.CallbackDetails?.CallbackId; + if (string.IsNullOrEmpty(callbackId)) + { + // Service didn't return a CallbackId — this is a service-contract + // violation, not user error. Surface as a non-deterministic error + // so the workflow fails fast rather than silently NRE-ing later. + throw new NonDeterministicExecutionException( + $"Callback operation '{Name ?? OperationId}' was started but the service did not return a CallbackId."); + } + + _callbackId = callbackId; + + // If the service already reported a terminal state on the START response + // (the external system replied synchronously, or timeout was instant), + // record it for GetResultAsync to short-circuit on. 
+ if (IsTerminalStatus(stamped?.Status)) + { + _terminalReplay = stamped; + } + + return this; + } + + protected override Task> ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + var callbackId = existing.CallbackDetails?.CallbackId; + if (string.IsNullOrEmpty(callbackId)) + { + throw new NonDeterministicExecutionException( + $"Callback operation '{Name ?? OperationId}' has no CallbackId on its checkpoint."); + } + + _callbackId = callbackId; + + // CRITICAL: we must NOT raise on terminal state here. + // CreateCallbackAsync always returns the ICallback handle so any user + // code between create and GetResult runs deterministically across + // replays. Defer status inspection to GetResultAsync below. + switch (existing.Status) + { + case OperationStatuses.Succeeded: + case OperationStatuses.Failed: + case OperationStatuses.TimedOut: + _terminalReplay = existing; + break; + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // External system hasn't responded yet — GetResultAsync will + // suspend so the service can re-invoke once it does. + break; + + default: + throw new NonDeterministicExecutionException( + $"Callback operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + + return Task.FromResult>(this); + } + + /// + public async Task GetResultAsync(CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + + // Terminal-state checkpoint already observed by Start/Replay — return + // (or throw) immediately without suspending. + if (_terminalReplay != null) + { + return ResolveTerminal(_terminalReplay); + } + + // A later checkpoint in this same invocation (e.g. WaitForCallback's + // submitter step flush) may have merged a terminal status into + // ExecutionState via NewExecutionState. Re-read once before suspending + // so we avoid a wasted reinvocation when the answer is already here. 
+ var current = State.GetOperation(OperationId); + if (IsTerminalStatus(current?.Status)) + { + return ResolveTerminal(current!); + } + + // No terminal state yet. Suspend the workflow; the service re-invokes + // when the external system delivers a result. + return await Termination.SuspendAndAwait( + TerminationReason.CallbackPending, + $"callback:{Name ?? OperationId}"); + } + + private T ResolveTerminal(Operation op) + { + switch (op.Status) + { + case OperationStatuses.Succeeded: + var serialized = op.CallbackDetails?.Result; + if (serialized == null) return default!; + { + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + + case OperationStatuses.Failed: + throw BuildFailedException(op); + + case OperationStatuses.TimedOut: + throw BuildTimeoutException(op); + + default: + // Should be unreachable — _terminalReplay is only set for terminal statuses. + throw new NonDeterministicExecutionException( + $"Callback operation '{Name ?? OperationId}' has unexpected status '{op.Status}' on result resolution."); + } + } + + private CallbackFailedException BuildFailedException(Operation op) + { + var err = op.CallbackDetails?.Error; + var message = err?.ErrorMessage ?? "Callback failed"; + return new CallbackFailedException(message) + { + CallbackId = op.CallbackDetails?.CallbackId, + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace, + }; + } + + private CallbackTimeoutException BuildTimeoutException(Operation op) + { + var err = op.CallbackDetails?.Error; + var message = err?.ErrorMessage ?? "Callback timed out"; + return new CallbackTimeoutException(message) + { + CallbackId = op.CallbackDetails?.CallbackId, + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace, + }; + } + + private SdkCallbackOptions? 
BuildCallbackOptions() + { + if (_config == null) return null; + if (_config.Timeout == TimeSpan.Zero && _config.HeartbeatTimeout == TimeSpan.Zero) return null; + + var options = new SdkCallbackOptions(); + if (_config.Timeout > TimeSpan.Zero) + options.TimeoutSeconds = (int)Math.Max(1, Math.Ceiling(_config.Timeout.TotalSeconds)); + if (_config.HeartbeatTimeout > TimeSpan.Zero) + options.HeartbeatTimeoutSeconds = (int)Math.Max(1, Math.Ceiling(_config.HeartbeatTimeout.TotalSeconds)); + return options; + } + + private static bool IsTerminalStatus(string? status) => + status == OperationStatuses.Succeeded + || status == OperationStatuses.Failed + || status == OperationStatuses.TimedOut; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs new file mode 100644 index 000000000..800d55bcf --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs @@ -0,0 +1,218 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Runtime.ExceptionServices; +using System.Threading.Channels; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Background batcher for outbound checkpoint updates. Operations are enqueued +/// via ; a single worker drains the queue and flushes +/// each batch via the supplied flushAsync delegate. Each EnqueueAsync +/// call awaits the flush of its containing batch (sync semantics). +/// +/// +/// Fire-and-forget semantics are achieved by simply not awaiting the returned +/// Task. Errors still surface deterministically via _terminalError: the +/// next sync or rethrows. +/// Callers using fire-and-forget should observe the discarded Task's exception +/// (see StepOperation.FireAndForget) so it doesn't trip the runtime's +/// UnobservedTaskException event. 
+/// +internal sealed class CheckpointBatcher : IAsyncDisposable +{ + private readonly Func, CancellationToken, Task> _flushAsync; + private readonly CheckpointBatcherConfig _config; + private readonly Channel _channel; + private readonly Task _worker; + private readonly CancellationTokenSource _shutdownCts = new(); + + private string? _checkpointToken; + private Exception? _terminalError; + private int _disposed; + + public CheckpointBatcher( + string? initialCheckpointToken, + Func, CancellationToken, Task> flushAsync, + CheckpointBatcherConfig? config = null) + { + _checkpointToken = initialCheckpointToken; + _flushAsync = flushAsync; + _config = config ?? new CheckpointBatcherConfig(); + _channel = Channel.CreateUnbounded(new UnboundedChannelOptions + { + SingleReader = true, + SingleWriter = false + }); + _worker = Task.Run(() => RunWorkerAsync(_shutdownCts.Token)); + } + + /// + /// The most recent checkpoint token returned by the service. Updated after + /// every successful batch flush. + /// + public string? CheckpointToken => Volatile.Read(ref _checkpointToken); + + /// + /// Queues for flushing. The returned Task completes + /// when the batch containing this update has been successfully flushed to the + /// service. If the worker has already encountered a terminal error, the + /// exception is rethrown immediately. + /// + public async Task EnqueueAsync(SdkOperationUpdate update, CancellationToken cancellationToken = default) + { + var terminal = Volatile.Read(ref _terminalError); + if (terminal != null) ExceptionDispatchInfo.Throw(terminal); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var item = new BatchItem(update, tcs); + + if (!_channel.Writer.TryWrite(item)) + { + // Writer is completed (terminal error or disposed) — surface the cause. 
+ terminal = Volatile.Read(ref _terminalError); + if (terminal != null) ExceptionDispatchInfo.Throw(terminal); + throw new ObjectDisposedException(nameof(CheckpointBatcher)); + } + + await tcs.Task.WaitAsync(cancellationToken).ConfigureAwait(false); + } + + /// + /// Closes the channel and awaits the worker. Any items already enqueued are + /// flushed; any subsequent call throws. + /// + public async Task DrainAsync() + { + _channel.Writer.TryComplete(); + try + { + await _worker.ConfigureAwait(false); + } + catch + { + // Surfaced via _terminalError below. + } + + var terminal = Volatile.Read(ref _terminalError); + if (terminal != null) ExceptionDispatchInfo.Throw(terminal); + } + + public async ValueTask DisposeAsync() + { + if (Interlocked.Exchange(ref _disposed, 1) != 0) return; + + _channel.Writer.TryComplete(); + _shutdownCts.Cancel(); + try { await _worker.ConfigureAwait(false); } + catch { /* swallow on dispose */ } + _shutdownCts.Dispose(); + } + + private async Task RunWorkerAsync(CancellationToken shutdownToken) + { + // TODO: also enforce _config.MaxBatchBytes here. Today we only cap by + // operation count; an item whose serialized size pushes the batch over + // ~750 KB will be sent and rejected service-side. See CheckpointBatcherConfig. + var batch = new List(_config.MaxBatchOperations); + + try + { + while (await _channel.Reader.WaitToReadAsync(shutdownToken).ConfigureAwait(false)) + { + // Drain everything currently queued. + while (_channel.Reader.TryRead(out var item)) + { + batch.Add(item); + if (batch.Count >= _config.MaxBatchOperations) + { + await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); + batch.Clear(); + } + } + + // Optionally wait for late arrivals to coalesce into one batch. 
+ if (_config.FlushInterval > TimeSpan.Zero && batch.Count > 0) + { + using var windowCts = CancellationTokenSource.CreateLinkedTokenSource(shutdownToken); + windowCts.CancelAfter(_config.FlushInterval); + try + { + while (await _channel.Reader.WaitToReadAsync(windowCts.Token).ConfigureAwait(false)) + { + while (_channel.Reader.TryRead(out var item)) + { + batch.Add(item); + if (batch.Count >= _config.MaxBatchOperations) + { + await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); + batch.Clear(); + } + } + } + } + catch (OperationCanceledException) when (!shutdownToken.IsCancellationRequested) + { + // Window elapsed; fall through to flush. + } + } + + if (batch.Count > 0) + { + await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); + batch.Clear(); + } + } + } + catch (OperationCanceledException) when (shutdownToken.IsCancellationRequested) + { + // Disposed mid-wait; fall through to drain. + } + catch (Exception ex) + { + // FlushBatchAsync's exception path already records _terminalError and + // signals batch members. This catch covers anything else (channel, + // logic). Make sure we still propagate. + Volatile.Write(ref _terminalError, ex); + } + finally + { + // Anything left in the channel after the worker exits — fail it. + var failure = Volatile.Read(ref _terminalError) ?? 
new ObjectDisposedException(nameof(CheckpointBatcher));
+            foreach (var leftover in batch)
+                leftover.Completion.TrySetException(failure);
+            while (_channel.Reader.TryRead(out var item))
+                item.Completion.TrySetException(failure);
+
+            _channel.Writer.TryComplete();
+        }
+    }
+
+    /// <summary>
+    /// Sends one batch through the flush delegate and completes every member's
+    /// Completion source with the outcome.
+    /// </summary>
+    /// <remarks>
+    /// On success the token returned by the delegate replaces _checkpointToken.
+    /// On failure the exception is stored in _terminalError, every member is
+    /// faulted with it, and the channel writer is completed. Completing the
+    /// writer does not discard items that were queued before the failure: the
+    /// reader still hands them to the worker loop. A batch that arrives after a
+    /// terminal error is therefore faulted here with that error and the flush
+    /// delegate is not called again.
+    /// </remarks>
+    private async Task FlushBatchAsync(IReadOnlyList batch, CancellationToken cancellationToken)
+    {
+        // A previous flush already failed: do not send anything else with the
+        // stale checkpoint token, and keep the first error as the reported cause.
+        var terminal = Volatile.Read(ref _terminalError);
+        if (terminal != null)
+        {
+            foreach (var item in batch)
+                item.Completion.TrySetException(terminal);
+            return;
+        }
+
+        var updates = new SdkOperationUpdate[batch.Count];
+        for (int i = 0; i < batch.Count; i++)
+            updates[i] = batch[i].Update;
+
+        try
+        {
+            var newToken = await _flushAsync(_checkpointToken, updates, cancellationToken).ConfigureAwait(false);
+            Volatile.Write(ref _checkpointToken, newToken);
+            foreach (var item in batch)
+                item.Completion.TrySetResult(true);
+        }
+        catch (Exception ex)
+        {
+            Volatile.Write(ref _terminalError, ex);
+            foreach (var item in batch)
+                item.Completion.TrySetException(ex);
+            _channel.Writer.TryComplete();
+            // No rethrow: the writer is now completed, so the worker loop ends
+            // once the channel is empty. Items it still reads on the way out are
+            // faulted by the terminal-error guard at the top of this method.
+        }
+    }
+
+    private readonly record struct BatchItem(SdkOperationUpdate Update, TaskCompletionSource Completion);
+}
diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs
new file mode 100644
index 000000000..88913e868
--- /dev/null
+++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs
@@ -0,0 +1,36 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+namespace Amazon.Lambda.DurableExecution.Internal;
+
+///
+/// Tunables for .
+///
+internal sealed class CheckpointBatcherConfig
+{
+    ///
+    /// How long the worker waits for additional items to coalesce into a single
+    /// batch before flushing. Default = flush as soon
+    /// as the queue drains. Increase to reduce API calls when many checkpoints
+    /// are emitted concurrently (e.g.
parallel branches, future Map operation). + /// + public TimeSpan FlushInterval { get; init; } = TimeSpan.Zero; + + /// + /// Maximum operations per batch. Service-side limit is 200. + /// + public int MaxBatchOperations { get; init; } = 200; + + /// + /// Maximum batch size in bytes. Service-side limit is ~750 KB. + /// + /// + /// TODO: not enforced today. The worker only checks ; + /// a single oversized item (or a batch whose serialized size exceeds 750 KB) + /// will be sent to the service and rejected there. Wire this in alongside + /// the async-flush operations (Map / Parallel / child-context) since those + /// are the scenarios that can actually fill a batch — today every batch is + /// 1 item with = Zero, so the gap is latent. + /// + internal int MaxBatchBytes { get; init; } = 750 * 1024; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs new file mode 100644 index 000000000..7c2427053 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs @@ -0,0 +1,248 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable child context operation. Runs a user-supplied function inside a +/// nested with its own deterministic operation-ID +/// space, persisting the function's result so subsequent invocations replay +/// the cached value without re-executing. 
+/// +/// +/// Replay branches — example: await ctx.RunInChildContextAsync(child => ..., name: "phase") +/// +/// Fresh: no prior state → sync-flush CONTEXT START → run user +/// func → on success emit CONTEXT SUCCEED → on failure emit CONTEXT FAIL +/// and throw . +/// SUCCEEDED: return cached deserialized result; user func is +/// NOT re-executed. +/// FAILED: throw with the +/// recorded error; if is +/// set, the mapped exception is thrown instead. +/// STARTED / PENDING: re-run the user func without +/// re-checkpointing START. The child's own operations recover from their +/// own checkpoints, so this is replay propagation; if a wait/callback +/// inside the child is still pending, the user func re-suspends. +/// +/// Unlike , child contexts have no retry strategy: +/// failure is terminal and surfaces immediately via +/// . +/// +internal sealed class ChildContextOperation : DurableOperation +{ + private readonly Func> _func; + private readonly ChildContextConfig? _config; + private readonly ILambdaSerializer _serializer; + private readonly Func _childContextFactory; + private readonly bool _isVirtual; + + public ChildContextOperation( + string operationId, + string? name, + string? parentId, + Func> func, + ChildContextConfig? config, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? 
batcher = null, + bool isVirtual = false) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _func = func; + _config = config; + _serializer = serializer; + _childContextFactory = childContextFactory; + _isVirtual = isVirtual; + } + + protected override string OperationType => OperationTypes.Context; + + protected override async Task StartAsync(CancellationToken cancellationToken) + { + // Virtual (NestingType.Flat) branches emit no CONTEXT checkpoint of their + // own — the parallel/map orchestrator records their outcome inline on the + // parent payload. Inner operations still checkpoint (re-parented to the + // non-virtual ancestor via the virtual child generator's reported + // ParentId), so a suspend inside a virtual branch is still recoverable. + if (!_isVirtual) + { + // Sync-flush CONTEXT START before user code so the service has a record + // of the parent context if the inner func suspends (e.g. a Wait inside + // the child terminates the workflow before SUCCEED is reached). + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.START, + SubType = _config?.SubType, + Name = Name + }, cancellationToken); + } + + return await ExecuteFunc(cancellationToken); + } + + protected override Task ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + // Side-effecting code runs at most once: replay returns the + // cached result without invoking the user func. + return Task.FromResult(DeserializeResult(existing.ContextDetails?.Result)); + + case OperationStatuses.Failed: + throw MapFailureException(BuildChildContextException(existing)); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Re-run the user func: the child's own operations replay from + // their own checkpoints. 
Do NOT re-checkpoint START — the + // original is still authoritative. If something inside the + // child is still pending (Wait, callback, retry) the user func + // will re-suspend on its own. + return ExecuteFunc(cancellationToken); + + default: + throw new NonDeterministicExecutionException( + $"Child context operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + private async Task ExecuteFunc(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + // For a virtual (Flat) branch, inner operations report this branch's own + // ParentId — the non-virtual parallel/map ancestor — since the branch + // itself emits no CONTEXT checkpoint to reference. For a normal child + // context the reported parent is ignored (it roots at OperationId). + var childContext = _childContextFactory(OperationId, ParentId, _isVirtual); + + T result; + try + { + result = await _func(childContext); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (NonDeterministicExecutionException) + { + // Replay-mismatch from an inner operation means the entire execution + // is corrupt — checkpointing this as CONTEXT FAIL would freeze the + // mismatch into history and prevent future invocations from + // re-detecting it. Bubble up untouched. + throw; + } + catch (StepInterruptedException) + { + // AtMostOncePerRetry crash recovery: a step inside the child saw a + // STARTED checkpoint with no terminal record and routed through its + // retry strategy. The step has already checkpointed its own outcome; + // wrapping this as CONTEXT FAIL would mask that. Bubble up so the + // step's strategy / replay flow stays authoritative. + throw; + } + catch (Exception ex) + { + // Virtual branches suppress the FAIL checkpoint but still propagate + // the exception — the orchestrator records the failure inline on the + // parent payload. 
+ if (!_isVirtual) + { + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.FAIL, + SubType = _config?.SubType, + Name = Name, + Error = ToSdkError(ex) + }, cancellationToken); + } + + throw MapFailureException(new ChildContextException(ex.Message, ex) + { + SubType = _config?.SubType, + ErrorType = ex.GetType().FullName, + OriginalStackTrace = ex.StackTrace?.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList() + }); + } + + // Virtual branches suppress the SUCCEED checkpoint; the orchestrator + // serializes the result inline on the parent payload instead. + if (!_isVirtual) + { + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.SUCCEED, + SubType = _config?.SubType, + Name = Name, + Payload = SerializeResult(result) + }, cancellationToken); + } + + return result; + } + + private Exception MapFailureException(ChildContextException ex) + { + var mapper = _config?.ErrorMapping; + if (mapper == null) return ex; + + var mapped = mapper(ex); + return mapped ?? ex; + } + + private ChildContextException BuildChildContextException(Operation failedOp) + { + var err = failedOp.ContextDetails?.Error; + return new ChildContextException(err?.ErrorMessage ?? "Child context failed") + { + SubType = failedOp.SubType ?? _config?.SubType, + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace + }; + } + + private T DeserializeResult(string? 
serialized) + { + if (serialized == null) return default!; + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + + private string SerializeResult(T value) + { + using var ms = new MemoryStream(); + _serializer.Serialize(value, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + + private static SdkErrorObject ToSdkError(Exception ex) => new() + { + ErrorType = ex.GetType().FullName, + ErrorMessage = ex.Message, + StackTrace = ex.StackTrace?.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList() + }; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs new file mode 100644 index 000000000..2a9c9bf37 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs @@ -0,0 +1,773 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using System.Text.Json; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Shared orchestration base for the concurrent durable operations +/// ( and ). +/// Runs N user-supplied units concurrently (each as a +/// ) under a shared +/// and concurrency limit, persisting the +/// aggregate result so subsequent invocations replay it without re-executing. +/// +/// +/// Subclasses supply only what differs between Parallel and Map — the unit count, +/// how to obtain a unit's (name, func), the parent/child sub-type labels, +/// and the failure-exception factory. All concurrency, completion, checkpoint, and +/// replay logic lives here. 
+/// +/// Fresh: no prior state → sync-flush parent CONTEXT START → +/// dispatch units respecting MaxConcurrency → wait for in-flight to +/// complete after CompletionConfig short-circuit → emit parent CONTEXT +/// SUCCEED with summary payload (). +/// SUCCEEDED: parent payload supplies the snapshot of per-unit +/// statuses + completion reason; per-unit results are deserialised from the +/// children's own CONTEXT checkpoints. +/// FAILED: same reconstruction; throws the subclass exception +/// carrying the rebuilt . +/// STARTED / PENDING: re-execute (children replay from their +/// own checkpoints). +/// +/// Per-unit errors do NOT abort the operation directly — the orchestrator catches +/// each unit's , records it as a failed +/// , and consults the +/// after every completion. Only when the completion config marks the run as +/// does it throw. +/// +internal abstract class ConcurrentOperation : DurableOperation> +{ + private readonly CompletionConfig _completionConfig; + private readonly int? _maxConcurrency; + + /// + /// True for : per-unit child contexts emit no + /// CONTEXT checkpoint, so their results/errors are recorded inline on this + /// parent operation's payload and read back from + /// there on replay. + /// + private readonly bool _isVirtual; + + /// Serializer used to deserialize per-unit child results on replay. + protected readonly ILambdaSerializer Serializer; + + /// Factory used to build each unit's inner child context. Takes + /// (operationId, reportedParentId, isVirtual). + protected readonly Func ChildContextFactory; + + protected ConcurrentOperation( + string operationId, + string? name, + string? parentId, + CompletionConfig completionConfig, + int? maxConcurrency, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? 
batcher = null, + bool isVirtual = false) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _completionConfig = completionConfig; + _maxConcurrency = maxConcurrency; + Serializer = serializer; + ChildContextFactory = childContextFactory; + _isVirtual = isVirtual; + } + + protected override string OperationType => OperationTypes.Context; + + // ── Subclass hooks ────────────────────────────────────────────────── + + /// The number of units (branches or items) to execute. + protected abstract int UnitCount { get; } + + /// Parent CONTEXT sub-type label (e.g. Parallel / Map). + protected abstract string ParentSubType { get; } + + /// Per-unit child-context sub-type label (e.g. ParallelBranch / MapItem). + protected abstract string ChildSubType { get; } + + /// Singular operation noun used in messages (e.g. "Parallel" / "Map"). + protected abstract string OperationNoun { get; } + + /// Plural unit noun used in messages (e.g. "branches" / "items"). + protected abstract string UnitNounPlural { get; } + + /// + /// Resolves the unit at into its display name and the + /// function to run inside the unit's child context. + /// + protected abstract (string? Name, Func> Func) GetUnit(int index); + + /// + /// Builds the subclass-specific exception thrown when the operation resolves + /// with . + /// + protected abstract DurableExecutionException CreateException(string message, IBatchResult result); + + // ── Orchestration ─────────────────────────────────────────────────── + + protected override async Task> StartAsync(CancellationToken cancellationToken) + { + // Sync-flush parent CONTEXT START. Mirrors ChildContextOperation: if a + // unit suspends (e.g., a Wait inside it), the service needs to know the + // parent existed. 
+ await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, ParentId = ParentId, // same parent link ChildContextOperation sends, so a nested Parallel/Map stays attached to its enclosing context + Type = OperationTypes.Context, + Action = OperationAction.START, + SubType = ParentSubType, + Name = Name + }, cancellationToken); + + return await ExecuteUnitsAsync(cancellationToken); + } + + protected override Task> ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + return Task.FromResult(ReconstructFromCheckpoints(existing, throwOnFailure: false)); + + case OperationStatuses.Failed: + // Reconstruct so the caller (and the exception's Result) sees the + // per-unit outcomes; then throw. + var failed = ReconstructFromCheckpoints(existing, throwOnFailure: false); + throw BuildException(failed); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Re-run: units replay from their own checkpoints. + return ExecuteUnitsAsync(cancellationToken); + + default: + throw new NonDeterministicExecutionException( + $"{OperationNoun} operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + private async Task> ExecuteUnitsAsync(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + var unitCount = UnitCount; + var slots = new UnitOutcome[unitCount]; + var dispatched = new bool[unitCount]; + + var maxConcurrency = _maxConcurrency ?? unitCount; + // Optimisation: when MaxConcurrency >= unitCount, skip the semaphore + // entirely. Behaviour is identical, allocations are lower. (Also covers + // the empty-collection case, where unitCount == 0 and no unit runs.) + var semaphore = (maxConcurrency >= unitCount || unitCount == 0) + ?
null + : new SemaphoreSlim(maxConcurrency, maxConcurrency); + + var minSuccessful = _completionConfig.MinSuccessful; + var toleratedFailureCount = _completionConfig.ToleratedFailureCount; + var toleratedFailurePercentage = _completionConfig.ToleratedFailurePercentage; + + var succeeded = 0; + var failed = 0; + + var inFlight = new List(unitCount); + + // Units run with the parent's token so cooperative cancellation still + // propagates into user code, but we must NOT abandon already-dispatched + // units while they're still writing checkpoints — that would diverge + // between the original run and replay. The dispatch loop and + // Task.WhenAll below therefore await every in-flight task even when + // cancellation fires; the semaphore is disposed only after those units + // have settled (success, failure, or cooperative OCE). + try + { + try + { + for (var i = 0; i < unitCount; i++) + { + // Volatile reads pair with the Interlocked.Increment writes + // in the onComplete callback. Reads are non-atomic across + // the two counters: at worst we observe slightly stale + // values and dispatch one extra unit before the next + // completion forces a re-check. That's acceptable — the + // post-loop ComputeCompletionReason is the source of truth. + var succSnap = Volatile.Read(ref succeeded); + var failSnap = Volatile.Read(ref failed); + if (ShouldStopDispatching(succSnap, failSnap, unitCount, + minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) + { + break; + } + + if (semaphore != null) + { + await semaphore.WaitAsync(cancellationToken).ConfigureAwait(false); + // Re-check after acquiring: the wait may have unblocked + // because earlier units finished and short-circuited the + // operation. 
+ succSnap = Volatile.Read(ref succeeded); + failSnap = Volatile.Read(ref failed); + if (ShouldStopDispatching(succSnap, failSnap, unitCount, + minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) + { + semaphore.Release(); + break; + } + } + + var index = i; + dispatched[index] = true; + inFlight.Add(RunUnitAsync(index, slots, semaphore, cancellationToken, + onComplete: outcome => + { + if (outcome.Status == BatchItemStatus.Succeeded) + Interlocked.Increment(ref succeeded); + else if (outcome.Status == BatchItemStatus.Failed) + Interlocked.Increment(ref failed); + })); + } + } + finally + { + // CRITICAL: wait for every dispatched unit — even on the + // exceptional path (parent-token cancellation mid-dispatch, or a + // synchronous throw out of the loop) — before the semaphore is + // disposed. Otherwise surviving units' Release() calls hit + // ObjectDisposedException, the tasks become unobserved, and they + // keep writing checkpoints out from under us. + // + // We deliberately DO NOT cancel already-running units when a + // short-circuit fires — orphan units that continue writing + // checkpoints would diverge between the original run and replay. + // Letting them finish guarantees determinism: all dispatched units + // end up Succeeded or Failed. Only un-dispatched units surface as + // Started. + if (inFlight.Count > 0) + { + try + { + await Task.WhenAll(inFlight).ConfigureAwait(false); + } + catch + { + // Swallow here — Task.WhenAll only surfaces the first + // exception, but every unit task is now in a terminal + // state and we want to inspect each one individually below + // to decide whether to surface a workflow-level error. The + // Task objects themselves still carry their exceptions, so + // this swallow does not orphan them. + } + } + } + } + finally + { + semaphore?.Dispose(); + } + + // Surface any workflow-level exception (e.g. NonDeterministicExecutionException) + // raised inside a unit. 
RunUnitAsync re-throws DurableExecutionException + // (other than ChildContextException which is captured into the slot) so the + // task faults with that exception. Take the first such failure: these are + // structural errors, not "unit failed gracefully" outcomes. + foreach (var t in inFlight) + { + if (t.IsFaulted && t.Exception is { } agg) + { + foreach (var inner in agg.InnerExceptions) + { + if (inner is DurableExecutionException dex && inner is not ChildContextException) + { + throw dex; + } + } + } + } + + // Re-throw any pending parent-token cancellation now that units have + // settled and the semaphore has been disposed cleanly. + cancellationToken.ThrowIfCancellationRequested(); + + // Build BatchItems for every unit in original order. + var items = new List>(unitCount); + for (var i = 0; i < unitCount; i++) + { + var (unitName, _) = GetUnit(i); + if (dispatched[i]) + { + var outcome = slots[i]; + items.Add(new BatchItem + { + Index = i, + Name = unitName, + Status = outcome.Status, + Result = outcome.Status == BatchItemStatus.Succeeded ? outcome.Result : default, + Error = outcome.Status == BatchItemStatus.Failed ? outcome.Error : null + }); + } + else + { + items.Add(new BatchItem + { + Index = i, + Name = unitName, + Status = BatchItemStatus.Started, + Result = default, + Error = null + }); + } + } + + var completionReason = ComputeCompletionReason(items, unitCount); + var result = new BatchResult(items, completionReason); + + var failureException = completionReason == CompletionReason.FailureToleranceExceeded + ? BuildException(result) + : null; + + await CheckpointParentResultAsync(result, completionReason, failureException, cancellationToken); + + if (failureException != null) + { + throw failureException; + } + + return result; + } + + private async Task RunUnitAsync( + int index, + UnitOutcome[] slots, + SemaphoreSlim? 
semaphore, + CancellationToken cancellationToken, + Action onComplete) + { + try + { + var (unitName, unitFunc) = GetUnit(index); + var childOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{index + 1}"); + + var childOp = new ChildContextOperation( + childOpId, + unitName, + OperationId, + unitFunc, + new ChildContextConfig { SubType = ChildSubType }, + Serializer, + ChildContextFactory, + State, + Termination, + DurableExecutionArn, + Batcher, + isVirtual: _isVirtual); + + try + { + var result = await childOp.ExecuteAsync(cancellationToken).ConfigureAwait(false); + slots[index] = new UnitOutcome { Status = BatchItemStatus.Succeeded, Result = result }; + } + catch (ChildContextException ex) + { + slots[index] = new UnitOutcome { Status = BatchItemStatus.Failed, Error = ex }; + } + catch (DurableExecutionException) + { + // E.g. NonDeterministicExecutionException — these are not "unit + // failed gracefully" but workflow-level problems. Surface them: + // re-throw out of the operation without writing a slot (the + // orchestrator's outer flow handles it). + throw; + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // Parent-token cancellation: per cross-cutting decision Q10, OCE + // escapes unwrapped. Don't write a slot — Task.WhenAll observes + // this and the orchestrator re-throws after settling. + throw; + } + catch (OperationCanceledException ex) + { + // Unit-internal cancellation that is NOT tied to the parent token + // (e.g. the unit's own CancellationTokenSource fired). Treat it as + // a normal per-unit failure rather than killing the operation as + // cancelled. 
+ var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = ChildSubType, + ErrorType = ex.GetType().FullName + }; + slots[index] = new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + catch (Exception ex) + { + // Wrap unexpected exceptions as ChildContextException — they're + // per-unit failures from the user's POV. + var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = ChildSubType, + ErrorType = ex.GetType().FullName + }; + slots[index] = new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + + onComplete(slots[index]); + } + finally + { + // Defensive: with this structure the semaphore is only disposed after + // Task.WhenAll(inFlight) has settled, so this Release should always + // succeed. ObjectDisposedException would indicate a bug elsewhere, but + // we tolerate it here so the task doesn't fault with a noise exception + // that masks the real one. + try + { + semaphore?.Release(); + } + catch (ObjectDisposedException) + { + } + } + } + + private static bool ShouldStopDispatching( + int succeeded, + int failed, + int totalUnits, + int? minSuccessful, + int? toleratedFailureCount, + double? toleratedFailurePercentage) + { + // Min-successful: short-circuit the moment we have enough wins. + if (minSuccessful is { } min && succeeded >= min) + return true; + + // Failure thresholds short-circuit on too many losses. 
+ if (toleratedFailureCount is { } tfc && failed > tfc) + return true; + + if (toleratedFailurePercentage is { } tfp && totalUnits > 0) + { + var ratio = (double)failed / totalUnits; + if (ratio > tfp) return true; + } + + return false; + } + + private CompletionReason ComputeCompletionReason(IReadOnlyList> items, int totalCount) + { + var failed = 0; + var succeeded = 0; + var started = 0; + + foreach (var item in items) + { + switch (item.Status) + { + case BatchItemStatus.Succeeded: succeeded++; break; + case BatchItemStatus.Failed: failed++; break; + case BatchItemStatus.Started: started++; break; + } + } + + // Failure tolerance: only short-circuit-by-failure when at least one + // failure threshold is explicitly set. The factory CompletionConfig.AllSuccessful() + // sets ToleratedFailureCount = 0 to opt into fail-fast; an "empty" + // CompletionConfig (all properties null) is permissive. + if (_completionConfig.ToleratedFailureCount is { } tfc && failed > tfc) + return CompletionReason.FailureToleranceExceeded; + + if (_completionConfig.ToleratedFailurePercentage is { } tfp && totalCount > 0) + { + var ratio = (double)failed / totalCount; + if (ratio > tfp) return CompletionReason.FailureToleranceExceeded; + } + + // Min-successful satisfied (and we didn't run all units): MinSuccessfulReached. + if (_completionConfig.MinSuccessful is { } min && succeeded >= min && started > 0) + { + return CompletionReason.MinSuccessfulReached; + } + + // Every dispatched unit finished one way or the other (or all-completed + // without any failure criteria). 
+ return CompletionReason.AllCompleted; + } + + private DurableExecutionException BuildException(IBatchResult result) + { + var message = + $"{OperationNoun} operation failed: failure tolerance exceeded " + + $"({result.FailureCount} of {result.TotalCount} {UnitNounPlural} failed)."; + return CreateException(message, result); + } + + private async Task CheckpointParentResultAsync( + BatchResult result, + CompletionReason completionReason, + DurableExecutionException? failureException, + CancellationToken cancellationToken) + { + var summary = new BatchSummary + { + CompletionReason = SerializeCompletionReason(completionReason), + Units = new List(result.All.Count) + }; + for (var i = 0; i < result.All.Count; i++) + { + var item = result.All[i]; + var unit = new BatchUnitSummary + { + Index = item.Index, + Name = item.Name, + Status = SerializeStatus(item.Status) + }; + + // Flat (virtual) units emit no child checkpoint, so their per-unit + // result/error has nowhere to live except inline on this summary. + // Nested units leave these null — they're read from each child's own + // CONTEXT checkpoint on replay. + if (_isVirtual) + { + if (item.Status == BatchItemStatus.Succeeded) + { + unit.Result = SerializeResult(item.Result); + } + else if (item.Status == BatchItemStatus.Failed && item.Error != null) + { + unit.Error = ErrorObject.FromException(item.Error); + } + } + + summary.Units.Add(unit); + } + + var payload = JsonSerializer.Serialize(summary, BatchJsonContext.Default.BatchSummary); + var failed = failureException != null; + + // On FAIL, Nested operations omit the payload because replay rebuilds + // per-unit outcomes from the children's own checkpoints. Flat operations + // have no child checkpoints, so the summary (carrying inline results and + // errors) must be persisted even on FAIL for replay to reconstruct it. 
+ var payloadOnFail = _isVirtual; + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, ParentId = ParentId, // same parent link ChildContextOperation sends on its terminal updates + Type = OperationTypes.Context, + Action = failed ? OperationAction.FAIL : OperationAction.SUCCEED, + SubType = ParentSubType, + Name = Name, + Payload = failed && !payloadOnFail ? null : payload, + Error = failed ? BuildAggregateError(result, failureException!) : null + }, cancellationToken); + } + + private IBatchResult ReconstructFromCheckpoints(Operation parent, bool throwOnFailure) + { + var summary = ParseSummary(parent.ContextDetails?.Result); + + var items = new List>(UnitCount); + for (var i = 0; i < UnitCount; i++) + { + var (unitName, _) = GetUnit(i); + var childOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{i + 1}"); + var childOp = State.GetOperation(childOpId); + var summaryEntry = summary?.Units?.FirstOrDefault(b => b.Index == i); // Units may be missing in a tolerated payload; then fall back to the child checkpoint + + BatchItemStatus status = summaryEntry != null + ? DeserializeStatus(summaryEntry.Status) + : InferStatusFromChildOp(childOp); + + // Prefer the name that was checkpointed at the moment the batch + // resolved. This is the only authoritative source for units reported + // as Started (no per-unit checkpoint exists to consult), and it lets + // us detect unit-name drift between deployments. + var checkpointedName = summaryEntry?.Name; + if (checkpointedName != null && unitName != null && checkpointedName != unitName) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for {OperationNoun.ToLowerInvariant()} unit {i} of operation " + + $"'{Name ?? OperationId}': expected name '{unitName}' but found '{checkpointedName}' " + + $"from a previous invocation. Code must not change the order or name of concurrent " + + $"units between deployments."); + } + var resolvedName = checkpointedName ?? unitName; + + T? unitResult = default; + DurableExecutionException?
unitError = null; + + // Flat (virtual) units have no child checkpoint — their result/error + // was recorded inline on this summary. Nested units read from the + // child's own CONTEXT checkpoint. A unit is "inline" when the summary + // entry carries a Result/Error, which only Flat writes. + if (_isVirtual && summaryEntry != null) + { + if (status == BatchItemStatus.Succeeded && summaryEntry.Result != null) + { + unitResult = DeserializeResult(summaryEntry.Result); + } + else if (status == BatchItemStatus.Failed && summaryEntry.Error != null) + { + var err = summaryEntry.Error; + unitError = new ChildContextException(err.ErrorMessage ?? "Unit failed") + { + SubType = ChildSubType, + ErrorType = err.ErrorType, + ErrorData = err.ErrorData, + OriginalStackTrace = err.StackTrace + }; + } + } + else if (status == BatchItemStatus.Succeeded && childOp?.ContextDetails?.Result != null) + { + unitResult = DeserializeResult(childOp.ContextDetails.Result); + } + else if (status == BatchItemStatus.Failed && childOp?.ContextDetails?.Error != null) + { + var err = childOp.ContextDetails.Error; + unitError = new ChildContextException(err.ErrorMessage ?? "Unit failed") + { + SubType = childOp.SubType ?? ChildSubType, + ErrorType = err.ErrorType, + ErrorData = err.ErrorData, + OriginalStackTrace = err.StackTrace + }; + } + + items.Add(new BatchItem + { + Index = i, + Name = resolvedName, + Status = status, + Result = unitResult, + Error = unitError + }); + } + + var completionReason = summary != null + ? DeserializeCompletionReason(summary.CompletionReason) + : ComputeCompletionReason(items, UnitCount); + + var result = new BatchResult(items, completionReason); + + if (throwOnFailure && completionReason == CompletionReason.FailureToleranceExceeded) + { + throw BuildException(result); + } + + return result; + } + + private static BatchItemStatus InferStatusFromChildOp(Operation? 
childOp) + { + if (childOp == null) return BatchItemStatus.Started; + return childOp.Status switch + { + OperationStatuses.Succeeded => BatchItemStatus.Succeeded, + OperationStatuses.Failed => BatchItemStatus.Failed, + _ => BatchItemStatus.Started + }; + } + + private SdkErrorObject BuildAggregateError(IBatchResult result, DurableExecutionException failureException) + { + return new SdkErrorObject + { + ErrorType = failureException.GetType().FullName, + ErrorMessage = + $"{OperationNoun} operation failed: {result.FailureCount} of {result.TotalCount} {UnitNounPlural} failed." + }; + } + + private static BatchSummary? ParseSummary(string? payload) + { + if (string.IsNullOrEmpty(payload)) return null; + try + { + return JsonSerializer.Deserialize(payload, BatchJsonContext.Default.BatchSummary); + } + catch (JsonException) + { + // Tolerate older / corrupted payloads — fall back to inferring status + // from per-unit checkpoints. + return null; + } + } + + private static string SerializeStatus(BatchItemStatus status) => status switch + { + BatchItemStatus.Succeeded => "SUCCEEDED", + BatchItemStatus.Failed => "FAILED", + BatchItemStatus.Started => "STARTED", + _ => throw new ArgumentOutOfRangeException(nameof(status)) + }; + + private static BatchItemStatus DeserializeStatus(string? wire) => wire switch + { + "SUCCEEDED" => BatchItemStatus.Succeeded, + "FAILED" => BatchItemStatus.Failed, + "STARTED" => BatchItemStatus.Started, + _ => BatchItemStatus.Started + }; + + private static string SerializeCompletionReason(CompletionReason reason) => reason switch + { + CompletionReason.AllCompleted => "ALL_COMPLETED", + CompletionReason.MinSuccessfulReached => "MIN_SUCCESSFUL_REACHED", + CompletionReason.FailureToleranceExceeded => "FAILURE_TOLERANCE_EXCEEDED", + _ => throw new ArgumentOutOfRangeException(nameof(reason)) + }; + + private static CompletionReason DeserializeCompletionReason(string? 
wire) => wire switch + { + "ALL_COMPLETED" => CompletionReason.AllCompleted, + "MIN_SUCCESSFUL_REACHED" => CompletionReason.MinSuccessfulReached, + "FAILURE_TOLERANCE_EXCEEDED" => CompletionReason.FailureToleranceExceeded, + _ => CompletionReason.AllCompleted + }; + + private T DeserializeResult(string serialized) + { + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return Serializer.Deserialize(ms); + } + + /// + /// Serializes a per-unit result for inline storage in the + /// BatchSummary (Flat units only). Mirrors the SUCCEED-payload + /// serialization a Nested unit's ChildContextOperation would + /// have written to its own checkpoint. + /// + private string SerializeResult(T? value) + { + using var ms = new MemoryStream(); + Serializer.Serialize(value!, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + + /// + /// Internal scratch space tracking each unit's outcome as it lands in the + /// executor; copied into the user-facing BatchItem once every + /// dispatched unit has settled. + /// + private struct UnitOutcome + { + public BatchItemStatus Status; + public T? Result; + public DurableExecutionException? Error; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableOperation.cs new file mode 100644 index 000000000..9f3570fac --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableOperation.cs @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Abstract base for durable operations (Step, Wait, ...). Subclasses implement +/// StartAsync (no prior checkpoint) and ReplayAsync +/// (some checkpoint exists); the base handles lookup and dispatch. +/// +/// The operation's result type.
+internal abstract class DurableOperation +{ + protected readonly ExecutionState State; + protected readonly TerminationManager Termination; + protected readonly string OperationId; + protected readonly string? Name; + protected readonly string? ParentId; + protected readonly string DurableExecutionArn; + protected readonly CheckpointBatcher? Batcher; + + protected DurableOperation( + string operationId, + string? name, + string? parentId, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + { + OperationId = operationId; + Name = name; + ParentId = parentId; + State = state; + Termination = termination; + DurableExecutionArn = durableExecutionArn; + Batcher = batcher; + } + + /// The wire-format operation type (e.g. "STEP", "WAIT"). + protected abstract string OperationType { get; } + + /// + /// Looks up any prior checkpoint for this op and dispatches to + /// StartAsync (none) or ReplayAsync (some). + /// + public Task ExecuteAsync(CancellationToken cancellationToken) + { + State.ValidateReplayConsistency(OperationId, OperationType, Name); + + // Record that the workflow has reached this op. If every completed + // checkpointed op has now been visited, the state flips out of replay. + State.TrackReplay(OperationId); + + var existing = State.GetOperation(OperationId); + return existing == null + ? StartAsync(cancellationToken) + : ReplayAsync(existing, cancellationToken); + } + + /// First-time execution path: no prior checkpoint exists. + protected abstract Task StartAsync(CancellationToken cancellationToken); + + /// + /// Replay path: a checkpoint from a prior invocation exists. Subclasses + /// switch on the Status of existing + /// against OperationStatuses constants. + /// + protected abstract Task ReplayAsync(Operation existing, CancellationToken cancellationToken); + + /// + /// Enqueues an outbound checkpoint and awaits its batch flush. No-op when + /// no batcher is wired (e.g. unit tests that don't exercise flushing).
+ /// + protected Task EnqueueAsync(SdkOperationUpdate update, CancellationToken cancellationToken = default) + => Batcher?.EnqueueAsync(update, cancellationToken) ?? Task.CompletedTask; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs new file mode 100644 index 000000000..7ff404675 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs @@ -0,0 +1,196 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// In-memory store of the operations replayed from +/// InitialExecutionState plus replay-mode tracking. Outbound checkpoints are owned by +/// CheckpointBatcher; this type is the inbound side only. +/// +/// +/// +/// After LoadFromCheckpoint the workflow is "replaying" if and only if at least one +/// completed (terminal-status) op is present. The service always sends one EXECUTION-type op +/// carrying the input payload — that's bookkeeping, not user history, +/// so it doesn't count. +/// TrackReplay is called by every DurableOperation.ExecuteAsync +/// at the top of the call. Once every checkpointed completed +/// non-EXECUTION op has been visited, the workflow has caught up +/// to the replay frontier and IsReplaying flips to false +/// for the rest of the invocation. +/// +/// +/// Thread safety: two paths reach this type concurrently. (1) The +/// background worker invokes AddOperations +/// (via the onNewOperations hook) while the +/// workflow thread reads via GetOperation / HasOperation — +/// e.g. the fire-and-forget StepOperation path. (2) +/// ConcurrentOperation dispatches N branches concurrently, each +/// running its own durable operations, so +/// ValidateReplayConsistency, TrackReplay, +/// GetOperation, and the IsReplaying +/// getter are reachable from multiple threads at once. +/// All read/write access to _operations, _visitedOperations, +/// _isReplaying and _remainingReplayOps is therefore guarded by a +/// single private lock.
Every guarded path is an O(1) dictionary lookup, set +/// insert, or short iteration, so contention stays brief; we use a plain +/// lock rather than an async-compatible primitive because +/// none of the guarded code paths are async, and rather than +/// ConcurrentDictionary because LoadFromCheckpoint performs +/// a compound add-then-scan. +/// +/// +internal sealed class ExecutionState +{ + private readonly object _lock = new(); + private readonly Dictionary _operations = new(); + private readonly HashSet _visitedOperations = new(); + private bool _isReplaying; + private int _remainingReplayOps; + + public int CheckpointedOperationCount + { + get { lock (_lock) return _operations.Count; } + } + + /// + /// True when the workflow is re-deriving prior operations from checkpointed + /// state. False when running fresh (not-yet-checkpointed) code. + /// + public bool IsReplaying + { + get { lock (_lock) return _isReplaying; } + } + + public void LoadFromCheckpoint(InitialExecutionState? initialState) + { + lock (_lock) + { + if (initialState?.Operations != null) + { + AddOperationsLocked(initialState.Operations); + } + + // We're "replaying" when there are completed ops (SUCCEEDED, FAILED, + // CANCELLED, STOPPED, TIMED_OUT) we need to re-derive before resuming live work. + // The service-side EXECUTION op (input payload bookkeeping) is always + // present and doesn't count. If the only ops are in-progress + // (READY/PENDING/STARTED), there's nothing to re-derive — the next + // user call IS the next thing to run — so IsReplaying starts false. + var (_, terminalCount) = ScanReplayableLocked(); + _remainingReplayOps = terminalCount; + _isReplaying = terminalCount > 0; + } + } + + public void AddOperations(IEnumerable operations) + { + lock (_lock) + { + AddOperationsLocked(operations); + } + } + + /// + /// Returns the checkpointed record for operationId, or null + /// if none. Callers should switch on its Status against + /// OperationStatuses constants to decide replay behavior. + /// + public Operation?
GetOperation(string operationId) + { + lock (_lock) + { + _operations.TryGetValue(operationId, out var op); + return op; + } + } + + public bool HasOperation(string operationId) + { + lock (_lock) + { + return _operations.ContainsKey(operationId); + } + } + + /// + /// Records that the workflow has reached . + /// Once every checkpointed completed non-EXECUTION op has been + /// visited the workflow has caught up to the replay frontier and + /// flips to false. Idempotent: calling more than + /// once with the same id has no additional effect. + /// + public void TrackReplay(string operationId) + { + lock (_lock) + { + if (!_isReplaying) return; + if (!_visitedOperations.Add(operationId)) return; + if (!_operations.TryGetValue(operationId, out var op)) return; + if (op.Type == OperationTypes.Execution) return; + if (!IsTerminalStatus(op.Status)) return; + + if (--_remainingReplayOps <= 0) + _isReplaying = false; + } + } + + public void ValidateReplayConsistency(string operationId, string expectedType, string? expectedName) + { + lock (_lock) + { + // Independent of IsReplaying: as long as a checkpoint record exists + // for this id, its type/name must match what user code is asking for. + // If the only checkpointed ops are in-progress (PENDING/READY/STARTED), + // IsReplaying is false but the records still exist and code drift can + // still produce a mismatch. + if (!_operations.TryGetValue(operationId, out var op)) return; + + if (op.Type != null && op.Type != expectedType) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for operation '{operationId}': " + + $"expected type '{expectedType}' but found '{op.Type}' from a previous invocation. 
" + + $"Code must not change the order or type of durable operations between deployments."); + } + + if (expectedName != null && op.Name != null && op.Name != expectedName) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for operation '{operationId}': " + + $"expected name '{expectedName}' but found '{op.Name}' from a previous invocation. " + + $"Code must not change the order or type of durable operations between deployments."); + } + } + } + + private void AddOperationsLocked(IEnumerable operations) + { + foreach (var op in operations) + { + if (op.Id == null) continue; + _operations[op.Id] = op; + } + } + + private (bool HasReplayable, int TerminalCount) ScanReplayableLocked() + { + var has = false; + var count = 0; + foreach (var op in _operations.Values) + { + if (op.Type == OperationTypes.Execution) continue; + has = true; + if (IsTerminalStatus(op.Status)) count++; + } + return (has, count); + } + + private static bool IsTerminalStatus(string? status) => + status == OperationStatuses.Succeeded + || status == OperationStatuses.Failed + || status == OperationStatuses.Cancelled + || status == OperationStatuses.Stopped + || status == OperationStatuses.TimedOut; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExponentialBackoff.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExponentialBackoff.cs new file mode 100644 index 000000000..47da147c9 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExponentialBackoff.cs @@ -0,0 +1,41 @@ +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Shared exponential-backoff math for both +/// (retry-on-exception) and +/// ExponentialWaitStrategy<TState> (wait-for-condition polling). +/// Computes min(initialDelay * backoff^(attempt-1), maxDelay), applies +/// the requested jitter, then ceilings to whole seconds with a 1-second floor +/// (the service timer's smallest unit). 
+/// +internal static class ExponentialBackoff +{ + [ThreadStatic] + private static Random? t_random; + private static Random Random => t_random ??= new Random(); + + /// + /// Computes the delay for attemptNumber (1-based) using exponential backoff + /// with the requested jitter strategy. The result is whole seconds, never below + /// 1 second (service timer floor) and never above what a TimeSpan can represent. + /// + public static TimeSpan CalculateDelay( + int attemptNumber, + TimeSpan initialDelay, + TimeSpan maxDelay, + double backoffRate, + JitterStrategy jitter) + { + var baseDelay = initialDelay.TotalSeconds * Math.Pow(backoffRate, attemptNumber - 1); + var cappedDelay = Math.Min(baseDelay, maxDelay.TotalSeconds); + + var finalDelay = jitter switch + { + JitterStrategy.Full => Random.NextDouble() * cappedDelay, + JitterStrategy.Half => cappedDelay * (0.5 + 0.5 * Random.NextDouble()), + _ => cappedDelay + }; + // Rounding up a cap near TimeSpan.MaxValue would overflow FromSeconds; clamp to the largest whole second. + return TimeSpan.FromSeconds(Math.Clamp(Math.Ceiling(finalDelay), 1, Math.Floor(TimeSpan.MaxValue.TotalSeconds))); + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/InvokeOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/InvokeOperation.cs new file mode 100644 index 000000000..bc27ff7a3 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/InvokeOperation.cs @@ -0,0 +1,185 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using Amazon.Lambda.Core; +using SdkChainedInvokeOptions = Amazon.Lambda.Model.ChainedInvokeOptions; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable chained-invoke operation. Schedules an asynchronous invocation of +/// another durable Lambda function via the durable execution service and +/// suspends the parent workflow until the chained execution reaches a terminal +/// state.
The service drives the chained function and re-invokes the parent +/// with an updated operation status. +/// +/// +/// Replay branches — example: +/// await ctx.InvokeAsync<Req, Resp>("arn:...:fn:prod", req, "process_payment") +/// +/// Fresh: serialize payload → sync-flush CHAINED_INVOKE START +/// (carrying ) → suspend with +/// . +/// SUCCEEDED: deserialize and return cached result from +/// ChainedInvokeDetails.Result; the chained function is NOT +/// re-invoked. +/// FAILED: throw populated +/// from the recorded error. +/// TIMED_OUT: throw . +/// STOPPED: throw . +/// STARTED / PENDING: chained execution is still in +/// flight; re-suspend without re-checkpointing — the original +/// START remains authoritative. +/// +/// Mirrors 's "sync-flush START → suspend" idiom; +/// the chained function executes out-of-process so there is nothing to run +/// locally on either fresh or replay paths besides the suspend wiring. +/// Serialization is delegated to the registered +/// on ; AOT-safe and reflection-based +/// callers share the same code path (the AOT story is determined by the +/// registered serializer). +/// +internal sealed class InvokeOperation : DurableOperation +{ + private readonly string _functionName; + private readonly TPayload _payload; + private readonly InvokeConfig? _config; + private readonly ILambdaSerializer _serializer; + + public InvokeOperation( + string operationId, + string? name, + string? parentId, + string functionName, + TPayload payload, + InvokeConfig? config, + ILambdaSerializer serializer, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? 
batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _functionName = functionName; + _payload = payload; + _config = config; + _serializer = serializer; + } + + protected override string OperationType => OperationTypes.ChainedInvoke; + + protected override async Task StartAsync(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + var serializedPayload = SerializeValue(_payload); + + // The service is what actually invokes the chained function, so it + // must receive this START before we suspend. If we only batched it + // locally and the parent process were recycled at suspend, the START + // would be lost and the chained function would never run. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.ChainedInvoke, + Action = OperationAction.START, + SubType = OperationSubTypes.ChainedInvoke, + Name = Name, + Payload = serializedPayload, + ChainedInvokeOptions = new SdkChainedInvokeOptions + { + FunctionName = _functionName, + TenantId = _config?.TenantId + } + }, cancellationToken); + + return await Termination.SuspendAndAwait( + TerminationReason.InvokePending, $"invoke:{Name ?? _functionName}"); + } + // Declared async (like StartAsync) so a recorded failure faults the returned Task instead of throwing synchronously at the call site. + protected override async Task ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + return DeserializeResult(existing.ChainedInvokeDetails?.Result); + + case OperationStatuses.Failed: + throw BuildFailed(existing); + + case OperationStatuses.TimedOut: + throw BuildTimedOut(existing); + + case OperationStatuses.Stopped: + throw BuildStopped(existing); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Chained function is still running. Just suspend again — + // the original START is already on the service, so don't + // re-checkpoint it.
Whenever the service re-invokes us next, + // it will include the updated status. + return await Termination.SuspendAndAwait( + TerminationReason.InvokePending, $"invoke:{Name ?? _functionName}"); + + default: + throw new NonDeterministicExecutionException( + $"Chained invoke operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + private string SerializeValue(TPayload value) + { + using var ms = new MemoryStream(); + _serializer.Serialize(value, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + + private TResult DeserializeResult(string? serialized) + { + if (serialized == null) return default!; + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + + private InvokeFailedException BuildFailed(Operation failedOp) + { + var err = failedOp.ChainedInvokeDetails?.Error; + return new InvokeFailedException(err?.ErrorMessage ?? "Chained invoke failed.") + { + FunctionName = _functionName, + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace + }; + } + + private InvokeTimedOutException BuildTimedOut(Operation failedOp) + { + var err = failedOp.ChainedInvokeDetails?.Error; + return new InvokeTimedOutException(err?.ErrorMessage ?? "Chained invoke timed out.") + { + FunctionName = _functionName, + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace + }; + } + + private InvokeStoppedException BuildStopped(Operation failedOp) + { + var err = failedOp.ChainedInvokeDetails?.Error; + return new InvokeStoppedException(err?.ErrorMessage ??
"Chained invoke was stopped.") + { + FunctionName = _functionName, + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/LambdaCoreLogger.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/LambdaCoreLogger.cs new file mode 100644 index 000000000..1ffef5ec0 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/LambdaCoreLogger.cs @@ -0,0 +1,161 @@ +using System.Collections.Generic; +using System.Text; +using Microsoft.Extensions.Logging; +using CoreLambdaLogger = Amazon.Lambda.Core.LambdaLogger; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Default for . Routes log +/// records through so they flow into the same +/// pipeline used by the rest of the AWS Lambda for .NET runtime — the runtime +/// host installs a redirector that produces structured JSON when +/// AWS_LAMBDA_LOG_FORMAT=JSON and honors AWS_LAMBDA_LOG_LEVEL. +/// +/// +/// In-package adapter to avoid forcing a dependency on +/// Amazon.Lambda.Logging.AspNetCore; users who want a richer experience +/// (Serilog, Powertools, etc.) can swap their own logger via +/// . +/// +/// When state is the standard FormattedLogValues produced by +/// , the original template and named arguments +/// are forwarded so the runtime's JSON formatter surfaces named placeholders +/// ({OrderId}) as top-level structured attributes. Mirrors the pattern +/// in Amazon.Lambda.Logging.AspNetCore.LambdaILogger. +/// +/// maintains an chain of +/// scope state. Scopes whose state is a key/value collection have each entry +/// appended to the outgoing template/args, so structured scope metadata +/// (durableExecutionArn, operationId, etc.) shows up as +/// top-level JSON fields without callers having to swap in a third-party +/// logger. Inner scopes win on key collision; explicit message arguments +/// always win over scope keys. 
+/// +internal sealed class LambdaCoreLogger : ILogger +{ + private const string OriginalFormatKey = "{OriginalFormat}"; + + private static readonly AsyncLocal CurrentScope = new(); + + public IDisposable BeginScope(TState state) where TState : notnull + { + var scope = new Scope(state, CurrentScope.Value); + CurrentScope.Value = scope; + return scope; + } + + // Level filtering is performed by the runtime layer (AWS_LAMBDA_LOG_LEVEL). + public bool IsEnabled(LogLevel logLevel) => logLevel != LogLevel.None; + + public void Log( + LogLevel logLevel, + EventId eventId, + TState state, + Exception? exception, + Func formatter) + { + if (!IsEnabled(logLevel)) return; + + string? messageTemplate = null; + var parameters = new List(); + HashSet? claimedKeys = null; + + if (state is IEnumerable> structure) + { + foreach (var property in structure) + { + if (property is { Key: OriginalFormatKey, Value: string value }) + { + messageTemplate = value; + } + else + { + parameters.Add(property.Value!); + claimedKeys ??= new HashSet(StringComparer.Ordinal); + claimedKeys.Add(property.Key); + } + } + + // No {OriginalFormat} → not a real FormattedLogValues; ignore the args + // we collected and fall back to the formatter below. + if (messageTemplate == null) + { + parameters.Clear(); + claimedKeys = null; + } + } + + messageTemplate ??= formatter(state, exception); + + AppendScopeAttributes(ref messageTemplate, parameters, ref claimedKeys); + + var levelName = logLevel.ToString(); + var args = parameters.Count == 0 ? Array.Empty() : parameters.ToArray(); + if (exception != null) + { + CoreLambdaLogger.Log(levelName, exception, messageTemplate, args); + } + else + { + CoreLambdaLogger.Log(levelName, messageTemplate, args); + } + } + + private static void AppendScopeAttributes( + ref string messageTemplate, + List parameters, + ref HashSet? claimedKeys) + { + var current = CurrentScope.Value; + if (current == null) return; + + StringBuilder? 
sb = null; + + // Walk innermost → outermost so the first key seen for a given name wins + // (mirrors how Microsoft.Extensions.Logging structured providers resolve + // overlapping scope keys: the closest scope dominates). + for (var s = current; s != null; s = s.Parent) + { + if (s.State is not IEnumerable> kvps) continue; + foreach (var kvp in kvps) + { + // Skip {OriginalFormat} (some scope-state factories emit one). + if (kvp.Key == OriginalFormatKey) continue; + + claimedKeys ??= new HashSet(StringComparer.Ordinal); + if (!claimedKeys.Add(kvp.Key)) continue; + + sb ??= new StringBuilder(messageTemplate); + sb.Append(' ').Append('{').Append(kvp.Key).Append('}'); + parameters.Add(kvp.Value!); + } + } + + if (sb != null) messageTemplate = sb.ToString(); + } + + private sealed class Scope : IDisposable + { + public object State { get; } + public Scope? Parent { get; } + private bool _disposed; + + public Scope(object state, Scope? parent) + { + State = state; + Parent = parent; + } + + public void Dispose() + { + if (_disposed) return; + _disposed = true; + + // Restore the parent. Out-of-order disposal would desync the chain, + // but that violates the using-statement contract that callers rely + // on; we don't try to defend against it. + CurrentScope.Value = Parent; + } + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/LambdaSerializerHelper.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/LambdaSerializerHelper.cs new file mode 100644 index 000000000..dfebe820e --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/LambdaSerializerHelper.cs @@ -0,0 +1,19 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; + +namespace Amazon.Lambda.DurableExecution.Internal; + +internal static class LambdaSerializerHelper +{ + private const string MissingSerializerMessage = + "No ILambdaSerializer is registered on ILambdaContext.Serializer. " + + "In the class library programming model, register one with " + + "[assembly: LambdaSerializer(typeof(...))]. In an executable / custom " + + "runtime, pass it to LambdaBootstrapBuilder.Create(handler, serializer). " + + "In tests, set TestLambdaContext.Serializer."; + + public static ILambdaSerializer GetRequired(ILambdaContext lambdaContext) => + lambdaContext.Serializer ?? throw new InvalidOperationException(MissingSerializerMessage); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs new file mode 100644 index 000000000..ed23ba950 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs @@ -0,0 +1,76 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Globalization; +using Amazon.Lambda; +using Amazon.Lambda.Core; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable map operation. Processes a collection in parallel, running the +/// user-supplied function once per item — each as a +/// . All orchestration, completion, +/// checkpoint, and replay logic lives in ; +/// this subclass supplies only the map-specific bits: how to turn an item index +/// into a (name, func) pair (the per-item callback receives the item, its +/// index, and the full source list), the Map sub-type labels, and the +/// factory. +/// +internal sealed class MapOperation : ConcurrentOperation +{ + private readonly IReadOnlyList _items; + private readonly Func, Task> _func; + private readonly Func? _itemNamer; + + public MapOperation( + string operationId, + string? 
name, + string? parentId, + IReadOnlyList items, + Func, Task> func, + MapConfig config, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, config.CompletionConfig, config.MaxConcurrency, + serializer, childContextFactory, state, termination, durableExecutionArn, batcher, + isVirtual: config.NestingType == NestingType.Flat) + { + _items = items; + _func = func; + _itemNamer = config.ItemNamer; + } + + protected override int UnitCount => _items.Count; + protected override string ParentSubType => OperationSubTypes.Map; + protected override string ChildSubType => OperationSubTypes.MapItem; + protected override string OperationNoun => "Map"; + protected override string UnitNounPlural => "items"; + + protected override (string? Name, Func> Func) GetUnit(int index) + { + var item = _items[index]; + // Default name is the index — matches the unnamed-branch convention in + // ParallelAsync. A custom ItemNamer can derive a readable name from the + // item's content. Naming affects observability only, never replay + // correlation (child operation IDs are derived from the index). + var name = _itemNamer is not null + ? 
_itemNamer(item!, index) + : index.ToString(CultureInfo.InvariantCulture); + + return (name, ctx => _func(ctx, item, index, _items)); + } + + protected override DurableExecutionException CreateException(string message, IBatchResult result) + { + return new MapException(message) + { + Result = result, + CompletionReason = result.CompletionReason + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs new file mode 100644 index 000000000..bd74e6da5 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs @@ -0,0 +1,132 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Security.Cryptography; +using System.Text; +using System.Threading; +using Amazon.Util; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Generates deterministic operation IDs for durable operations. Each call +/// increments an internal counter and SHA-256 hashes "<parentId>-<counter>" +/// (or just "<counter>" at the root). The same workflow position +/// produces a stable, opaque ID across replays — and the human-readable step +/// name is carried separately on OperationUpdate.Name, so renaming a +/// step does not break replay correlation. +/// +internal sealed class OperationIdGenerator +{ + private int _counter; + private readonly string _prefix; + + /// + /// Creates a root-level generator. + /// + public OperationIdGenerator() + : this(parentId: null) + { + } + + /// + /// Creates a child generator scoped under a parent operation. The parent + /// ID (already hashed) becomes part of the prefix, so child IDs are + /// hash("<parentHash>-1"), hash("<parentHash>-2"), etc. + /// + public OperationIdGenerator(string? 
parentId) + : this(idPrefix: parentId, reportedParentId: parentId) + { + } + + /// + /// Creates a child generator that decouples the hash prefix used to derive + /// inner-operation IDs from the parent ID reported on those + /// operations' wire OperationUpdate.ParentId. + /// + /// + /// Prefix hashed into inner-operation IDs (hash("<idPrefix>-1"), ...). + /// Always the owning context's own operation ID, so two sibling branches + /// never collide on inner IDs. + /// + /// + /// The parent operation ID stamped on inner operations. For a normal + /// (non-virtual) context this equals idPrefix. For a + /// NestingType.Flat branch — a "virtual" context that emits no + /// CONTEXT checkpoint of its own — this is the nearest non-virtual ancestor + /// (the parallel/map operation), so inner operations re-parent past the + /// branch to an operation that actually exists in the checkpoint store. + /// + private OperationIdGenerator(string? idPrefix, string? reportedParentId) + { + _counter = 0; + ParentId = reportedParentId; + _prefix = idPrefix != null ? idPrefix + "-" : string.Empty; + } + + /// + /// Gets the parent operation ID, if any. + /// + public string? ParentId { get; } + + /// + /// Generates the next operation ID. The counter is pre-incremented so the + /// first ID is hash("1"). + /// + /// + /// Uses Interlocked.Increment so concurrent callers + /// (e.g. user code that wraps multiple StepAsync calls in + /// Task.WhenAll with Task.Run, or future ParallelAsync/ + /// MapAsync branches that fan out before awaiting) cannot collide + /// on the same ID. Determinism still requires that calls happen in a + /// deterministic order — atomicity prevents duplicate IDs but not + /// reordering between replays. + /// + public string NextId() + { + var counter = Interlocked.Increment(ref _counter); + return HashOperationId(_prefix + counter.ToString(System.Globalization.CultureInfo.InvariantCulture)); + } + + /// + /// SHA-256 hashes rawId and returns a 64-char lowercase + /// hex digest.
Public so tests and child-context construction can reproduce + /// the same hashing logic. + /// + public static string HashOperationId(string rawId) + { + var bytes = Encoding.UTF8.GetBytes(rawId); + var hash = SHA256.HashData(bytes); + return AWSSDKUtils.ToHex(hash, lowercase: true); + } + + /// + /// Creates a child generator scoped under an operation ID from this generator. + /// + public OperationIdGenerator CreateChild(string operationId) + { + return new OperationIdGenerator(operationId); + } + + /// + /// Creates a child generator for a branch — a + /// "virtual" context. Inner-operation IDs are still derived from + /// (so sibling branches don't collide), but + /// the IDs are reported under (the + /// nearest non-virtual ancestor) because the virtual branch emits no CONTEXT + /// checkpoint that inner operations could reference as their parent. + /// + public OperationIdGenerator CreateVirtualChild(string operationId, string? reportedParentId) + { + return new OperationIdGenerator(idPrefix: operationId, reportedParentId: reportedParentId); + } + + /// + /// Resets the counter (used for testing only). Not safe to call concurrently + /// with ; tests must quiesce before resetting. + /// + internal void Reset() + { + Interlocked.Exchange(ref _counter, 0); + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs new file mode 100644 index 000000000..08b7d1781 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs @@ -0,0 +1,60 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda; +using Amazon.Lambda.Core; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable parallel operation. Runs N user-supplied branches concurrently, +/// each as a . 
All orchestration, +/// completion, checkpoint, and replay logic lives in +/// ; this subclass supplies only the +/// branch-specific bits (unit count, per-branch (name, func), sub-type +/// labels, and the failure-exception factory). +/// +internal sealed class ParallelOperation : ConcurrentOperation +{ + private readonly IReadOnlyList> _branches; + + public ParallelOperation( + string operationId, + string? name, + string? parentId, + IReadOnlyList> branches, + ParallelConfig config, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, config.CompletionConfig, config.MaxConcurrency, + serializer, childContextFactory, state, termination, durableExecutionArn, batcher, + isVirtual: config.NestingType == NestingType.Flat) + { + _branches = branches; + } + + protected override int UnitCount => _branches.Count; + protected override string ParentSubType => OperationSubTypes.Parallel; + protected override string ChildSubType => OperationSubTypes.ParallelBranch; + protected override string OperationNoun => "Parallel"; + protected override string UnitNounPlural => "branches"; + + protected override (string? 
Name, Func> Func) GetUnit(int index) + { + var branch = _branches[index]; + return (branch.Name, branch.Func); + } + + protected override DurableExecutionException CreateException(string message, IBatchResult result) + { + return new ParallelException(message) + { + Result = result, + CompletionReason = result.CompletionReason + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ReplayAwareLogger.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ReplayAwareLogger.cs new file mode 100644 index 000000000..5e24f53fe --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ReplayAwareLogger.cs @@ -0,0 +1,57 @@ +using Microsoft.Extensions.Logging; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// ILogger decorator that suppresses messages while the workflow +/// is replaying prior operations. Reads ExecutionState.IsReplaying +/// on every call so it correctly transitions to passthrough the moment the +/// state's per-operation tracker decides we've caught up to fresh execution. +/// +/// +/// Mirrors the suppression behavior of the Python and Java durable execution +/// SDKs: replay Log calls return without invoking the +/// inner logger. BeginScope always delegates so scopes +/// stay balanced — suppression only applies at log emission. +/// +internal sealed class ReplayAwareLogger : ILogger +{ + private readonly ILogger _inner; + private readonly ExecutionState _state; + private readonly bool _modeAware; + + public ReplayAwareLogger(ILogger inner, ExecutionState state, bool modeAware) + { + _inner = inner; + _state = state; + _modeAware = modeAware; + } + + /// The wrapped logger; exposed so ConfigureLogger can rewrap without losing it. + public ILogger Inner => _inner; + + /// Whether replay suppression is active. + public bool ModeAware => _modeAware; + + public IDisposable? 
BeginScope(TState state) where TState : notnull + => _inner.BeginScope(state); + + public bool IsEnabled(LogLevel logLevel) + { + if (ShouldSuppress()) return false; + return _inner.IsEnabled(logLevel); + } + + public void Log( + LogLevel logLevel, + EventId eventId, + TState state, + Exception? exception, + Func formatter) + { + if (ShouldSuppress()) return; + _inner.Log(logLevel, eventId, state, exception, formatter); + } + + private bool ShouldSuppress() => _modeAware && _state.IsReplaying; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/StepOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/StepOperation.cs new file mode 100644 index 000000000..4d04d8a72 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/StepOperation.cs @@ -0,0 +1,355 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Microsoft.Extensions.Logging; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; +using SdkStepOptions = Amazon.Lambda.Model.StepOptions; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable step operation. Runs the user's function (with retry support), +/// persisting its result so subsequent invocations replay the cached value +/// without re-executing. +/// +/// +/// Replay branches — example: await ctx.StepAsync(ChargeCard, "charge") +/// +/// Fresh: no prior state → run func → emit SUCCEED → return. +/// SUCCEEDED: return cached result; func is NOT re-executed. +/// FAILED: re-throw the recorded exception. +/// PENDING (retry timer not yet fired): re-suspend without +/// running func; service re-invokes once NextAttemptTimestamp elapses. +/// STARTED + AtMostOncePerRetry: crash recovery — treat as a +/// failed attempt, route through retry strategy. 
+/// READY: service has post-PENDING re-invoked us; the retry +/// timer fired and the next attempt is up. Run it. +/// +/// Serialization is delegated to the registered on +/// . AOT-safe and reflection-based callers +/// share the same code path: the AOT story is determined entirely by the serializer +/// the user registered with the runtime (e.g., +/// SourceGeneratorLambdaJsonSerializer<TContext>). +/// +internal sealed class StepOperation : DurableOperation +{ + private readonly Func> _func; + private readonly StepConfig? _config; + private readonly ILambdaSerializer _serializer; + private readonly ILogger _logger; + + public StepOperation( + string operationId, + string? name, + string? parentId, + Func> func, + StepConfig? config, + ILambdaSerializer serializer, + ILogger logger, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _func = func; + _config = config; + _serializer = serializer; + _logger = logger; + } + + protected override string OperationType => OperationTypes.Step; + + protected override Task StartAsync(CancellationToken cancellationToken) + => ExecuteFunc(attemptNumber: 1, cancellationToken); + + protected override Task ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + // Side-effecting code runs at most once: replay returns the + // cached result without invoking func. + return Task.FromResult(DeserializeResult(existing.StepDetails?.Result)); + + case OperationStatuses.Failed: + // Retries were exhausted or never configured — re-throw so the + // user's catch-block flow matches the original execution. 
+ throw CreateStepException(existing); + + case OperationStatuses.Pending: + return ReplayPending(existing, cancellationToken); + + case OperationStatuses.Started: + return ReplayStarted(existing, cancellationToken); + + case OperationStatuses.Ready: + return ReplayReady(existing, cancellationToken); + + default: + // CANCELLED / STOPPED / unrecognized status. Re-running the + // step would re-execute side effects and silently mask a + // service-state we don't know how to interpret. Fail loud. + throw new NonDeterministicExecutionException( + $"Step operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + /// + /// READY means the service has post-PENDING re-invoked us — the retry + /// timer fired and the step is eligible to run its next attempt. No + /// timer check is needed (the service has already decided we're up); + /// just advance the attempt counter and execute. + /// + private Task ReplayReady(Operation ready, CancellationToken cancellationToken) + { + var attemptNumber = (ready.StepDetails?.Attempt ?? 0) + 1; + return ExecuteFunc(attemptNumber, cancellationToken); + } + + /// + /// PENDING means a retry was scheduled (RETRY checkpoint). The service's + /// transition to READY when the timer fires is the authoritative "timer + /// fired" signal; we still get re-invoked in PENDING only if the service + /// re-invokes slightly early. The wall-clock check below is a safety net + /// for that case — clock skew can't cause a missed retry because if our + /// clock is fast we just run early, and if it's slow we re-suspend and + /// the service's READY transition takes over. + /// + private Task ReplayPending(Operation pending, CancellationToken cancellationToken) + { + var nextAttemptTs = pending.StepDetails?.NextAttemptTimestamp; + var attemptNumber = (pending.StepDetails?.Attempt ?? 
0) + 1; + + if (nextAttemptTs is { } scheduledMs && + DateTimeOffset.UtcNow.ToUnixTimeMilliseconds() < scheduledMs) + { + // Retry timer hasn't fired yet — re-suspend so we don't bill compute + // while the timer ticks. Service re-invokes once the timer elapses. + return Termination.SuspendAndAwait( + TerminationReason.RetryScheduled, $"retry:{Name ?? OperationId}"); + } + + return ExecuteFunc(attemptNumber, cancellationToken); + } + + /// + /// STARTED means a START checkpoint was written but no SUCCEED/FAIL exists. + /// For AtMostOncePerRetry this signals a crash mid-step — treat as failure + /// and route through retry. For AtLeastOncePerRetry just re-execute. + /// + private Task ReplayStarted(Operation started, CancellationToken cancellationToken) + { + var attemptNumber = (started.StepDetails?.Attempt ?? 0) + 1; + + if (_config?.Semantics == StepSemantics.AtMostOncePerRetry) + { + // Re-running func would risk a duplicate side effect (e.g. double + // charge). Treat the lost result as a failure; let the retry + // strategy decide whether to try again or give up. + // + // Surface as StepInterruptedException so user strategies can + // distinguish "my code threw" from "a prior attempt crashed before + // recording a terminal record". + var error = started.StepDetails?.Error; + var ex = error != null + ? new StepInterruptedException(error.ErrorMessage ?? "Step failed on previous attempt") { ErrorType = error.ErrorType } + : new StepInterruptedException("Step result lost during AtMostOncePerRetry replay"); + return HandleStepFailureAsync(ex, attemptNumber, cancellationToken); + } + + return ExecuteFunc(attemptNumber, cancellationToken); + } + + private async Task ExecuteFunc(int attemptNumber, CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + // Emit a START checkpoint before running user code, unless we're already + // resuming a STARTED record (which means an earlier attempt wrote it). 
+ // + // AtMostOncePerRetry: SYNC flush. If Lambda crashes before SUCCEED is + // flushed, ReplayStarted routes through retry instead of re-executing. + // A queued-but-unflushed START is indistinguishable from "never ran" if + // we die, so the sync flush is correctness-load-bearing here. + // + // AtLeastOncePerRetry (default): FIRE-AND-FORGET. Replay correctness + // doesn't depend on the START — SUCCEED alone is sufficient — so this + // is purely telemetry (attempt timing, retry count visible in history). + if (State.GetOperation(OperationId)?.Status != OperationStatuses.Started) + { + var startUpdate = new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.START, + SubType = OperationSubTypes.Step, + Name = Name + }; + + if (_config?.Semantics == StepSemantics.AtMostOncePerRetry) + { + await EnqueueAsync(startUpdate, cancellationToken); + } + else + { + FireAndForget(EnqueueAsync(startUpdate, cancellationToken)); + } + } + + + try + { + var stepContext = new StepContext(OperationId, attemptNumber, _logger); + + // Step-scoped metadata so structured log providers tag user code + // lines with the operation id, name, and current attempt. Wrap + // only the user-func call — checkpoint emission shouldn't carry + // step metadata into any side-channel logging. + T result; + using (_logger.BeginScope(new Dictionary + { + ["operationId"] = OperationId, + ["operationName"] = Name ?? 
string.Empty, + ["attempt"] = attemptNumber, + })) + { + result = await _func(stepContext); + } + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.SUCCEED, + SubType = OperationSubTypes.Step, + Name = Name, + Payload = SerializeResult(result) + }, cancellationToken); + + return result; + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (Exception ex) + { + // Funnel into the retry/fail decision tree. May checkpoint RETRY and + // suspend (Pending), or checkpoint FAIL and rethrow to user. + return await HandleStepFailureAsync(ex, attemptNumber, cancellationToken); + } + } + + /// + /// Funnels a step failure into the retry/fail decision. May checkpoint + /// RETRY and suspend (Pending), or checkpoint FAIL and rethrow. + /// + private async Task HandleStepFailureAsync(Exception ex, int attemptNumber, CancellationToken cancellationToken) + { + var retryStrategy = _config?.RetryStrategy; + if (retryStrategy != null) + { + var decision = retryStrategy.ShouldRetry(ex, attemptNumber); + if (decision.ShouldRetry) + { + // Service requires NextAttemptDelaySeconds >= 1. Built-in + // strategies already produce >=1s delays; this guard only + // matters for user-supplied IRetryStrategy / FromDelegate. + var requestedSeconds = decision.Delay.TotalSeconds; + var delaySeconds = (int)Math.Max(1, Math.Ceiling(requestedSeconds)); + if (requestedSeconds < 1) + { + _logger.LogWarning( + "Retry delay for step '{StepName}' attempt {Attempt} was {Requested:F3}s (< 1s); coerced to {Coerced}s.", + Name ?? 
OperationId, attemptNumber, requestedSeconds, delaySeconds); + } + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.RETRY, + SubType = OperationSubTypes.Step, + Name = Name, + Error = ToSdkError(ex), + StepOptions = new SdkStepOptions { NextAttemptDelaySeconds = delaySeconds } + }, cancellationToken); + return await Termination.SuspendAndAwait( + TerminationReason.RetryScheduled, $"retry:{Name ?? OperationId}"); + } + } + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.FAIL, + SubType = OperationSubTypes.Step, + Name = Name, + Error = ToSdkError(ex) + }, cancellationToken); + + throw new StepException(ex.Message, ex) + { + ErrorType = ex.GetType().FullName + }; + } + + private T DeserializeResult(string? serialized) + { + if (serialized == null) return default!; + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + + private string SerializeResult(T value) + { + using var ms = new MemoryStream(); + _serializer.Serialize(value, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + + private static StepException CreateStepException(Operation failedOp) + { + var err = failedOp.StepDetails?.Error; + return new StepException(err?.ErrorMessage ?? "Step failed") + { + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace + }; + } + + private static SdkErrorObject ToSdkError(Exception ex) => new() + { + ErrorType = ex.GetType().FullName, + ErrorMessage = ex.Message, + StackTrace = ex.StackTrace?.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList() + }; + + /// + /// Discards a Task but observes any exception so it doesn't surface as an + /// UnobservedTaskException. 
Used for fire-and-forget START checkpoints + /// under AtLeastOncePerRetry semantics. The actual error still propagates + /// via CheckpointBatcher._terminalError: the next sync EnqueueAsync + /// or DrainAsync will rethrow with the original cause. + /// + private static void FireAndForget(Task task) + { + _ = task.ContinueWith( + static t => _ = t.Exception, + CancellationToken.None, + TaskContinuationOptions.OnlyOnFaulted | TaskContinuationOptions.ExecuteSynchronously, + TaskScheduler.Default); + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/TerminationManager.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/TerminationManager.cs new file mode 100644 index 000000000..1218f6ceb --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/TerminationManager.cs @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// The reason the execution was terminated. +/// +internal enum TerminationReason +{ + WaitScheduled, + RetryScheduled, + CallbackPending, + InvokePending, + CheckpointFailed +} + +/// +/// The result of a termination signal. +/// +internal sealed class TerminationResult +{ + public required TerminationReason Reason { get; init; } + public string? Message { get; init; } + public Exception? Exception { get; init; } +} + +/// +/// Manages the suspension signal for durable execution. +/// Uses a TaskCompletionSource that resolves when the function should suspend. +/// Only the first Terminate() call wins; subsequent calls are ignored. +/// +internal sealed class TerminationManager +{ + private readonly TaskCompletionSource _tcs = new(TaskCreationOptions.RunContinuationsAsynchronously); + private int _terminated; + + /// + /// A Task that resolves when Terminate() is called. Used in Task.WhenAny + /// to race against user code. 
+ /// + public Task TerminationTask => _tcs.Task; + + /// + /// Whether Terminate() has been called. + /// + public bool IsTerminated => Volatile.Read(ref _terminated) == 1; + + /// + /// Signals that the execution should suspend. Thread-safe; only the first + /// call has effect. + /// + /// true if this call triggered termination, false if already terminated. + public bool Terminate(TerminationReason reason, string? message = null, Exception? exception = null) + { + if (Interlocked.CompareExchange(ref _terminated, 1, 0) != 0) + return false; + + _tcs.TrySetResult(new TerminationResult + { + Reason = reason, + Message = message, + Exception = exception + }); + + return true; + } + + /// + /// Trips the termination signal and returns a Task that never completes. + /// This is the standard suspension idiom: the caller awaits the returned + /// Task, and the Task.WhenAny race against user code + /// picks up TerminationTask instead, returning Pending + /// to the service. The returned Task is abandoned and GC'd. + /// + public Task SuspendAndAwait(TerminationReason reason, string? message = null, Exception? exception = null) + { + Terminate(reason, message, exception); + return new TaskCompletionSource().Task; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/WaitForConditionOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/WaitForConditionOperation.cs new file mode 100644 index 000000000..742265782 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/WaitForConditionOperation.cs @@ -0,0 +1,387 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Text; +using Amazon.Lambda.Core; +using Microsoft.Extensions.Logging; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; +using SdkStepOptions = Amazon.Lambda.Model.StepOptions; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable wait-for-condition (polling) operation. Repeatedly invokes a +/// user-supplied check function until the configured wait strategy +/// decides to stop. Between iterations the workflow is suspended so the +/// Lambda is not billing compute while waiting. +/// +/// +/// Wire format reuses STEP+RETRY exactly: +/// +/// Type=STEP, SubType="WaitForCondition" +/// Each polling iteration emits Action=RETRY with the latest +/// state in Payload and the strategy's +/// chosen delay in StepOptions.NextAttemptDelaySeconds. +/// Termination emits Action=SUCCEED with the final state in +/// Payload; check-function exceptions emit Action=FAIL. +/// +/// Replay branches — example: await ctx.WaitForConditionAsync(check, config, "poll") +/// +/// Fresh: sync-flush START → run check with +/// InitialState → strategy decides Stop/Continue. +/// SUCCEEDED: return the deserialized cached state; check is NOT re-run. +/// FAILED: re-throw a WaitForConditionException +/// (or fall back to StepException if the FAIL was caused by +/// the check function throwing — the latter carries the original +/// error type/message). +/// PENDING (RETRY scheduled): if the next-attempt timer hasn't +/// fired yet, re-suspend; otherwise read the prior state from +/// StepDetails.Result, advance the attempt counter, and run the +/// check again. +/// READY: timer fired and the service re-invoked us. Read the +/// prior state, advance the attempt counter, run the check. +/// STARTED: the START checkpoint was written but the very first +/// check attempt didn't complete (Lambda crash / timeout). Re-execute +/// with InitialState as +/// the seed. 
+/// +/// State checkpointing in each RETRY's payload is what makes the polling loop +/// survive Lambda re-invocations deterministically. +/// +internal sealed class WaitForConditionOperation : DurableOperation +{ + private readonly Func> _check; + private readonly WaitForConditionConfig _config; + private readonly ILambdaSerializer _serializer; + private readonly ILogger _logger; + + public WaitForConditionOperation( + string operationId, + string? name, + string? parentId, + Func> check, + WaitForConditionConfig config, + ILambdaSerializer serializer, + ILogger logger, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _check = check; + _config = config; + _serializer = serializer; + _logger = logger; + } + + protected override string OperationType => OperationTypes.Step; + + protected override Task StartAsync(CancellationToken cancellationToken) + => ExecuteIteration(_config.InitialState, attemptNumber: 1, cancellationToken); + + protected override Task ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + // Polling concluded on a previous invocation; return the + // cached final state without re-running the check. + return Task.FromResult(DeserializeState(existing.StepDetails?.Result)); + + case OperationStatuses.Failed: + throw BuildFailureException(existing); + + case OperationStatuses.Pending: + return ReplayPending(existing, cancellationToken); + + case OperationStatuses.Ready: + return ReplayReady(existing, cancellationToken); + + case OperationStatuses.Started: + // START emitted but no RETRY/SUCCEED yet — the very first + // check attempt was lost. Re-execute with InitialState. Do + // NOT re-emit START (the original is authoritative). 
+ return ExecuteIteration(_config.InitialState, attemptNumber: 1, cancellationToken); + + default: + throw new NonDeterministicExecutionException( + $"WaitForCondition operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + /// + /// PENDING means the prior iteration emitted RETRY and the service + /// scheduled a timer. If the timer hasn't fired we re-suspend; once it + /// fires, the next iteration runs against the previously checkpointed + /// state, NOT . + /// + private Task ReplayPending(Operation pending, CancellationToken cancellationToken) + { + var nextAttemptTs = pending.StepDetails?.NextAttemptTimestamp; + if (nextAttemptTs is { } scheduledMs && + DateTimeOffset.UtcNow.ToUnixTimeMilliseconds() < scheduledMs) + { + // Timer still ticking — re-suspend without re-checkpointing. + return Termination.SuspendAndAwait( + TerminationReason.RetryScheduled, $"wait_for_condition:{Name ?? OperationId}"); + } + + var priorState = DeserializeStateOrInitial(pending.StepDetails?.Result); + var attemptNumber = (pending.StepDetails?.Attempt ?? 0) + 1; + return ExecuteIteration(priorState, attemptNumber, cancellationToken); + } + + /// + /// READY means the service has re-invoked us post-PENDING — the next + /// poll is up. Read the latest state from the prior RETRY's payload + /// and advance the attempt counter. + /// + private Task ReplayReady(Operation ready, CancellationToken cancellationToken) + { + var priorState = DeserializeStateOrInitial(ready.StepDetails?.Result); + var attemptNumber = (ready.StepDetails?.Attempt ?? 
0) + 1; + return ExecuteIteration(priorState, attemptNumber, cancellationToken); + } + + private async Task ExecuteIteration(TState currentState, int attemptNumber, CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + // Emit START on the very first attempt only — and sync-flush so the + // service has a record of the polling op even if the check function + // drives termination via, e.g., a wait inside it. Subsequent + // iterations resume from a RETRY/READY/PENDING checkpoint and skip + // START. + if (State.GetOperation(OperationId) == null) + { + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.START, + SubType = OperationSubTypes.WaitForCondition, + Name = Name + }, cancellationToken); + } + + TState newState; + try + { + var checkContext = new ConditionCheckContext(attemptNumber, _logger); + newState = await _check(currentState, checkContext); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (Exception ex) + { + // The check threw. WaitForCondition has no per-exception retry + // strategy (Python/JS/Java SDKs all treat check failure as terminal), + // so checkpoint FAIL and surface the original exception via + // StepException — same shape as StepOperation's terminal failure. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.FAIL, + SubType = OperationSubTypes.WaitForCondition, + Name = Name, + Error = ToSdkError(ex) + }, cancellationToken); + + throw new StepException(ex.Message, ex) + { + ErrorType = ex.GetType().FullName + }; + } + + WaitDecision decision; + try + { + decision = _config.WaitStrategy.Decide(newState, attemptNumber); + } + catch (WaitForConditionException maxEx) + { + // Strategy is signaling max-attempts reached. 
The strategy + // didn't have access to LastState; we do — populate it now, + // checkpoint FAIL, and rethrow. + var enriched = new WaitForConditionException( + $"WaitForCondition '{Name ?? OperationId}' exhausted {attemptNumber} attempts without the condition being met.", + maxEx) + { + AttemptsExhausted = attemptNumber, + LastState = newState + }; + + // Persist the last observed state in Error.ErrorData so a replay + // that hits this cached FAIL can reconstruct LastState identically + // to the live throw. The wire protocol forbids a Payload on FAIL + // updates, so ErrorData is the only field that survives replay. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.FAIL, + SubType = OperationSubTypes.WaitForCondition, + Name = Name, + Error = new SdkErrorObject + { + ErrorType = typeof(WaitForConditionException).FullName, + ErrorMessage = enriched.Message, + ErrorData = SerializeState(newState) + } + }, cancellationToken); + + throw enriched; + } + + if (!decision.ShouldContinue) + { + // Stop() means the condition has been met. Persist the final + // state and return it to the caller. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.SUCCEED, + SubType = OperationSubTypes.WaitForCondition, + Name = Name, + Payload = SerializeState(newState) + }, cancellationToken); + + return newState; + } + + // Continue polling — emit RETRY with the new state in the payload + // and the next-attempt delay in StepOptions. Sync-flush so the + // service definitely has the new state and timer scheduled before + // we suspend. 
+ var delaySeconds = (int)Math.Max(1, Math.Ceiling(decision.Delay.TotalSeconds)); + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.RETRY, + SubType = OperationSubTypes.WaitForCondition, + Name = Name, + Payload = SerializeState(newState), + StepOptions = new SdkStepOptions { NextAttemptDelaySeconds = delaySeconds } + }, cancellationToken); + + return await Termination.SuspendAndAwait( + TerminationReason.RetryScheduled, $"wait_for_condition:{Name ?? OperationId}"); + } + + private TState DeserializeState(string? serialized) + { + if (serialized == null) return default!; + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + + private TState DeserializeStateOrInitial(string? serialized) + { + if (serialized == null) return _config.InitialState; + try + { + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + catch (Exception ex) + { + // If the serializer can't read the prior state, fall back to + // InitialState — matches Python's behavior. Log a warning so + // corrupted payloads / schema migrations are observable instead + // of silently restarting the polling loop. 
+ _logger.LogWarning( + "WaitForCondition operation '{OperationId}' failed to deserialize prior state ({ExceptionType}: {Message}); falling back to InitialState.", + OperationId, ex.GetType().FullName, ex.Message); + return _config.InitialState; + } + } + + private string SerializeState(TState value) + { + using var ms = new MemoryStream(); + _serializer.Serialize(value, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + + private Exception BuildFailureException(Operation failedOp) + { + var err = failedOp.StepDetails?.Error; + // Distinguish "max attempts exhausted" (we recorded the type as + // WaitForConditionException above) from "check function threw" + // (recorded as the original exception type via StepException). + if (err?.ErrorType == typeof(WaitForConditionException).FullName) + { + // Recover LastState from the FAIL checkpoint's Error.ErrorData. + // Live execution serializes the most recent state alongside the + // error so replay surfaces an identically-populated exception. + // Falls back to null when ErrorData is absent (legacy data + // pre-dating this serialization) or unreadable. + object? lastState = null; + var lastStatePayload = err?.ErrorData; + if (lastStatePayload != null) + { + try + { + var bytes = Encoding.UTF8.GetBytes(lastStatePayload); + using var ms = new MemoryStream(bytes); + lastState = _serializer.Deserialize(ms); + } + catch (Exception deserEx) + { + _logger.LogWarning( + "WaitForCondition operation '{OperationId}' failed to deserialize LastState from FAIL checkpoint ErrorData ({ExceptionType}: {Message}); LastState will be null on the rethrown exception.", + OperationId, deserEx.GetType().FullName, deserEx.Message); + } + } + + return new WaitForConditionException(err?.ErrorMessage ?? $"WaitForCondition '{Name ?? OperationId}' exhausted attempts.") + { + AttemptsExhausted = failedOp.StepDetails?.Attempt ?? 0, + LastState = lastState + }; + } + + return new StepException(err?.ErrorMessage ?? 
"WaitForCondition check function failed") + { + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace + }; + } + + private static SdkErrorObject ToSdkError(Exception ex) => new() + { + ErrorType = ex.GetType().FullName, + ErrorMessage = ex.Message, + StackTrace = ex.StackTrace?.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList() + }; +} + +/// +/// Internal implementation of . +/// +internal sealed class ConditionCheckContext : IConditionCheckContext +{ + public ConditionCheckContext(int attemptNumber, ILogger logger) + { + AttemptNumber = attemptNumber; + Logger = logger; + } + + public ILogger Logger { get; } + public int AttemptNumber { get; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/WaitOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/WaitOperation.cs new file mode 100644 index 000000000..948503f26 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/WaitOperation.cs @@ -0,0 +1,97 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; +using SdkWaitOptions = Amazon.Lambda.Model.WaitOptions; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable wait operation. Suspends the workflow for a given duration without +/// consuming compute time; the service schedules a timer and re-invokes Lambda +/// when it fires. +/// +/// +/// Replay semantics — example: await ctx.WaitAsync(TimeSpan.FromHours(1)) +/// +/// Fresh: emit WAIT START → flush → suspend → service schedules timer. +/// Replay (SUCCEEDED): timer fired, return CompletedTask. +/// Replay (STARTED/PENDING): timer still ticking → re-suspend (or +/// short-circuit if the deadline already elapsed but SUCCEEDED hasn't +/// been stamped yet). 
+/// +/// See for the +/// suspension mechanics (Task.WhenAny race against TerminationManager). +/// +internal sealed class WaitOperation : DurableOperation +{ + private readonly int _waitSeconds; + + public WaitOperation( + string operationId, + string? name, + string? parentId, + int waitSeconds, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _waitSeconds = waitSeconds; + } + + protected override string OperationType => OperationTypes.Wait; + + protected override async Task StartAsync(CancellationToken cancellationToken) + { + // Sync-flush WAIT START before suspending — the service can't schedule + // a timer for a checkpoint it hasn't received. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Wait, + Action = OperationAction.START, + SubType = OperationSubTypes.Wait, + Name = Name, + WaitOptions = new SdkWaitOptions { WaitSeconds = _waitSeconds } + }, cancellationToken); + + return await Termination.SuspendAndAwait( + TerminationReason.WaitScheduled, $"wait:{Name ?? OperationId}"); + } + + protected override Task ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + // Common post-timer case: service stamped the wait as SUCCEEDED + // and re-invoked Lambda. Workflow proceeds to the next step. + return Task.FromResult(null); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Service hasn't marked the wait complete yet. Either the timer + // is still ticking, or the deadline elapsed but SUCCEEDED hasn't + // been stamped yet — treat elapsed deadlines as "done" to avoid + // a pointless extra round-trip. 
+ var expiresAtMs = existing.WaitDetails?.ScheduledEndTimestamp; + if (expiresAtMs is { } ts && DateTimeOffset.UtcNow.ToUnixTimeMilliseconds() >= ts) + { + return Task.FromResult(null); + } + + // Timer still ticking — re-suspend without re-checkpointing. + // The original WAIT START is still authoritative. + return Termination.SuspendAndAwait( + TerminationReason.WaitScheduled, $"wait:{Name ?? OperationId}"); + + default: + throw new NonDeterministicExecutionException( + $"Wait operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/InvokeConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/InvokeConfig.cs new file mode 100644 index 000000000..b58f810a1 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/InvokeConfig.cs @@ -0,0 +1,27 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for chained invoke operations. +/// +/// +/// Use with +/// to configure a single chained invocation. Payload/result serialization is +/// performed by the registered on +/// (typically configured via +/// LambdaBootstrapBuilder.Create(handler, serializer)); there are +/// intentionally no serializer fields here, matching the pattern established +/// by . +/// +public sealed class InvokeConfig +{ + /// + /// Optional tenant identifier propagated to the chained invocation via + /// ChainedInvokeOptions.TenantId. Used to route the invocation to a + /// tenant-isolated function. Matches the tenantId field on the + /// Python, JavaScript, and Java SDKs. + /// + public string? 
TenantId { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/InvokeException.cs b/Libraries/src/Amazon.Lambda.DurableExecution/InvokeException.cs new file mode 100644 index 000000000..da631b687 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/InvokeException.cs @@ -0,0 +1,87 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Thrown when a chained invoke operation reaches a non-success terminal state. +/// +/// +/// Base class for the invoke exception tree. Catch +/// to handle every chained-invoke failure mode uniformly, or pattern-match the +/// concrete subclasses to react differently to specific outcomes: +/// +/// — the chained function threw. +/// — the chained invocation +/// reached the service-side TIMED_OUT terminal state. +/// — the chained execution was +/// stopped by the service or an operator. +/// +/// Mirrors the Java SDK's InvokeException / InvokeFailedException +/// / InvokeTimedOutException / InvokeStoppedException tree; the +/// .NET SDK keeps non-abstract so callers can also +/// rethrow it directly when wrapping fallback logic. +/// +public class InvokeException : DurableExecutionException +{ + /// The fully-qualified name of the invoked function (ARN, alias, or version). + public string? FunctionName { get; init; } + + /// The fully-qualified type name of the original exception, when known. + public string? ErrorType { get; init; } + + /// Optional structured error data attached by the invoked function. + public string? ErrorData { get; init; } + + /// Stack trace of the original exception, captured before serialization. + public IReadOnlyList? OriginalStackTrace { get; init; } + + /// Creates an empty . + public InvokeException() { } + /// Creates an with the given message. + public InvokeException(string message) : base(message) { } + /// Creates an wrapping an inner exception. 
+ public InvokeException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when a chained invoke operation completes with status FAILED — +/// the invoked function ran and threw. +/// +public class InvokeFailedException : InvokeException +{ + /// Creates an empty . + public InvokeFailedException() { } + /// Creates an with the given message. + public InvokeFailedException(string message) : base(message) { } + /// Creates an wrapping an inner exception. + public InvokeFailedException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when a chained invoke operation completes with status TIMED_OUT. +/// +public class InvokeTimedOutException : InvokeException +{ + /// Creates an empty . + public InvokeTimedOutException() { } + /// Creates an with the given message. + public InvokeTimedOutException(string message) : base(message) { } + /// Creates an wrapping an inner exception. + public InvokeTimedOutException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when a chained invoke operation completes with status STOPPED +/// — the invocation was stopped administratively by the durable execution +/// service before reaching a normal terminal state. +/// +public class InvokeStoppedException : InvokeException +{ + /// Creates an empty . + public InvokeStoppedException() { } + /// Creates an with the given message. + public InvokeStoppedException(string message) : base(message) { } + /// Creates an wrapping an inner exception. 
+ public InvokeStoppedException(string message, Exception innerException) : base(message, innerException) { } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/LoggerConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/LoggerConfig.cs new file mode 100644 index 000000000..801d0a7c9 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/LoggerConfig.cs @@ -0,0 +1,24 @@ +using Microsoft.Extensions.Logging; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for . Lets users +/// swap the underlying (e.g. Serilog, AWS Lambda Powertools) +/// or disable replay-aware filtering for debugging. +/// +public sealed class LoggerConfig +{ + /// + /// Optional to use instead of the SDK default. When + /// null, the durable context keeps its existing inner logger. + /// + public ILogger? CustomLogger { get; init; } + + /// + /// When true (default), messages are suppressed while the workflow is + /// re-deriving prior operations from checkpointed state. Set to false to + /// see every log line on every replay (useful for local debugging). + /// + public bool ModeAware { get; init; } = true; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs new file mode 100644 index 000000000..5b7c76e5f --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs @@ -0,0 +1,75 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for +/// . +/// +/// +/// Per-item checkpoint payloads are serialized via the +/// registered on +/// (typically +/// configured via LambdaBootstrapBuilder.Create(handler, serializer)); +/// this config does not expose a serializer slot. +/// +public sealed class MapConfig +{ + private int? _maxConcurrency; + + /// + /// Maximum number of items processed concurrently. null (default) = + /// unlimited. Must be at least 1 when set. + /// + /// + /// Thrown by the setter if the value is less than or equal to 0. 
+ /// + public int? MaxConcurrency + { + get => _maxConcurrency; + set + { + if (value is { } v && v <= 0) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "MaxConcurrency must be at least 1, or null for unlimited."); + } + _maxConcurrency = value; + } + } + + /// + /// When the map operation is considered complete. Defaults to + /// — every item runs regardless + /// of per-item failures, which are surfaced via + /// rather than thrown. + /// + /// + /// This permissive default matches the Python and Java SDKs' map operation. + /// It differs intentionally from , + /// which defaults to (fail-fast). + /// For fail-fast map behavior — any item failure surfaces a + /// when the result is awaited — set this to + /// , or call + /// on the result. + /// + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted(); + + /// + /// How item branches are represented in the checkpoint graph. Defaults to + /// . + /// + /// + /// Under each item runs in a virtual context + /// that emits no per-item CONTEXT checkpoint; per-item results and + /// errors are recorded inline on the map operation's payload instead. + /// + public NestingType NestingType { get; set; } = NestingType.Nested; + + /// + /// Optional function to generate a custom name for each item's branch. + /// Receives the item and its zero-based index, and returns the branch name + /// surfaced in execution traces and on . + /// When null (default), branches are named by index ("0", + /// "1", ...), matching . + /// + public Func? 
ItemNamer { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs b/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs new file mode 100644 index 000000000..a36c793e7 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs @@ -0,0 +1,36 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Controls how branches in a parallel/map operation are represented in the +/// checkpoint graph. +/// +/// +/// +/// is the default — each branch produces a full CONTEXT +/// operation visible in execution traces. +/// +/// +/// uses virtual contexts to reduce checkpoint volume (no +/// per-branch CONTEXT operation): each branch's result or error is +/// recorded inline on the parent parallel/map operation's payload instead. +/// +/// +public enum NestingType +{ + /// + /// Each branch creates a full isolated CONTEXT operation. Higher + /// observability in execution traces but more checkpoint operations + /// (default). + /// + Nested, + + /// + /// Branches run in virtual contexts that emit no CONTEXT checkpoint + /// of their own — per-branch results/errors are recorded inline on the + /// parent operation's payload. Reduces checkpoint cost at the expense of + /// less granular execution traces. Branch operations inside a flat branch + /// (steps, waits) still checkpoint, re-parented to the parallel/map + /// operation. + /// + Flat +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs new file mode 100644 index 000000000..ca358a46b --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs @@ -0,0 +1,244 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// One operation in the durable execution service's invocation envelope. 
+/// Property names mirror the wire format exactly so System.Text.Json can +/// populate this type declaratively. +/// +public sealed class Operation +{ + /// The operation's unique identifier. + [JsonPropertyName("Id")] + public string? Id { get; set; } + + /// Operation type — see . + [JsonPropertyName("Type")] + public string? Type { get; set; } + + /// Operation status — see . + [JsonPropertyName("Status")] + public string? Status { get; set; } + + /// User-supplied operation name (e.g., the step name). + [JsonPropertyName("Name")] + public string? Name { get; set; } + + /// Identifier of the parent operation, if any (used for nested contexts). + [JsonPropertyName("ParentId")] + public string? ParentId { get; set; } + + /// Operation sub-type, if any (e.g., for child contexts). + [JsonPropertyName("SubType")] + public string? SubType { get; set; } + + /// Unix-epoch milliseconds at which the operation started. + [JsonPropertyName("StartTimestamp")] + public long? StartTimestamp { get; set; } + + /// Unix-epoch milliseconds at which the operation ended. + [JsonPropertyName("EndTimestamp")] + public long? EndTimestamp { get; set; } + + /// Step-specific details (present when is STEP). + [JsonPropertyName("StepDetails")] + public StepDetails? StepDetails { get; set; } + + /// Wait-specific details (present when is WAIT). + [JsonPropertyName("WaitDetails")] + public WaitDetails? WaitDetails { get; set; } + + /// Execution-specific details (present when is EXECUTION). + [JsonPropertyName("ExecutionDetails")] + public ExecutionDetails? ExecutionDetails { get; set; } + + /// Callback-specific details (present when is CALLBACK). + [JsonPropertyName("CallbackDetails")] + public CallbackDetails? CallbackDetails { get; set; } + + /// Chained-invoke details (present when is CHAINED_INVOKE). + [JsonPropertyName("ChainedInvokeDetails")] + public ChainedInvokeDetails? ChainedInvokeDetails { get; set; } + + /// Child-context details (present when is CONTEXT). 
+ [JsonPropertyName("ContextDetails")] + public ContextDetails? ContextDetails { get; set; } +} + +/// Details for a STEP operation. +public sealed class StepDetails +{ + /// Serialized step result. + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// Error from the most recent attempt, if it failed. + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } + + /// The attempt number (1-based). + [JsonPropertyName("Attempt")] + public int? Attempt { get; set; } + + /// Unix-epoch milliseconds at which the next retry attempt is scheduled. + [JsonPropertyName("NextAttemptTimestamp")] + public long? NextAttemptTimestamp { get; set; } +} + +/// Details for a WAIT operation. +public sealed class WaitDetails +{ + /// Unix-epoch milliseconds at which the wait is scheduled to end. + [JsonPropertyName("ScheduledEndTimestamp")] + public long? ScheduledEndTimestamp { get; set; } +} + +/// Details for an EXECUTION operation. +public sealed class ExecutionDetails +{ + /// The serialized user input payload for this invocation. + [JsonPropertyName("InputPayload")] + public string? InputPayload { get; set; } +} + +/// Details for a CALLBACK operation. +public sealed class CallbackDetails +{ + /// The callback identifier returned to the external system. + [JsonPropertyName("CallbackId")] + public string? CallbackId { get; set; } + + /// Serialized callback result. + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// Error returned by the external system, if any. + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } +} + +/// Details for a CHAINED_INVOKE operation. +public sealed class ChainedInvokeDetails +{ + /// Serialized result from the invoked function. + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// Error returned by the invoked function, if any. + [JsonPropertyName("Error")] + public ErrorObject? 
Error { get; set; } +} + +/// Details for a CONTEXT operation (child contexts). +public sealed class ContextDetails +{ + /// Serialized result of the child context. + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// Error from the child context, if any. + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } +} + +/// +/// Wire-format string constants. +/// Plural name avoids collision with Amazon.Lambda.OperationType. +/// +public static class OperationTypes +{ + /// Step operation. + public const string Step = "STEP"; + + /// Wait/timer operation. + public const string Wait = "WAIT"; + + /// Callback (external-system signal) operation. + public const string Callback = "CALLBACK"; + + /// Chained-invoke (durable-to-durable call) operation. + public const string ChainedInvoke = "CHAINED_INVOKE"; + + /// Child-context operation. + public const string Context = "CONTEXT"; + + /// Top-level execution operation carrying the user input payload. + public const string Execution = "EXECUTION"; +} + +/// +/// Wire-format string constants. SubType is a +/// finer-grained classifier sent alongside for +/// observability — the values are PascalCase ("Step", "Wait") and distinct +/// from the uppercase values. +/// +public static class OperationSubTypes +{ + /// Step sub-type. + public const string Step = "Step"; + + /// Wait sub-type. + public const string Wait = "Wait"; + + /// Callback sub-type. + public const string Callback = "Callback"; + + /// Wait-for-callback sub-type. + public const string WaitForCallback = "WaitForCallback"; + + /// Chained-invoke sub-type. + public const string ChainedInvoke = "ChainedInvoke"; + + /// Child-context sub-type. + public const string Context = "Context"; + + /// Wait-for-condition (polling) sub-type. + public const string WaitForCondition = "WaitForCondition"; + + /// Parallel parent sub-type. 
+ public const string Parallel = "Parallel"; + + /// Parallel branch (per-branch child-context) sub-type. + public const string ParallelBranch = "ParallelBranch"; + + /// Map parent sub-type. + public const string Map = "Map"; + + /// Map item (per-item child-context) sub-type. + public const string MapItem = "MapItem"; +} + +/// +/// Wire-format string constants. +/// Plural name avoids collision with Amazon.Lambda.OperationStatus. +/// +public static class OperationStatuses +{ + /// The operation has started. + public const string Started = "STARTED"; + + /// The operation completed successfully. + public const string Succeeded = "SUCCEEDED"; + + /// The operation failed. + public const string Failed = "FAILED"; + + /// The operation is pending (waiting for time, callback, or invocation). + public const string Pending = "PENDING"; + + /// The operation was cancelled. + public const string Cancelled = "CANCELLED"; + + /// The operation is ready to resume. + public const string Ready = "READY"; + + /// The operation was stopped. + public const string Stopped = "STOPPED"; + + /// The operation timed out (e.g. callback or chained invoke timeout). + public const string TimedOut = "TIMED_OUT"; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs new file mode 100644 index 000000000..bcc17f181 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs @@ -0,0 +1,58 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for +/// . +/// +/// +/// Per-branch checkpoint payloads are serialized via the +/// registered on +/// (typically +/// configured via LambdaBootstrapBuilder.Create(handler, serializer)); +/// this config does not expose a serializer slot. +/// +public sealed class ParallelConfig +{ + private int? _maxConcurrency; + + /// + /// Maximum number of branches running concurrently. null (default) = + /// unlimited. 
Must be at least 1 when set. + /// + /// + /// Thrown by the setter if the value is less than or equal to 0. + /// + public int? MaxConcurrency + { + get => _maxConcurrency; + set + { + if (value is { } v && v <= 0) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "MaxConcurrency must be at least 1, or null for unlimited."); + } + _maxConcurrency = value; + } + } + + /// + /// When the parallel operation is considered complete. Defaults to + /// — any single branch failure + /// surfaces as a when the parallel result + /// is awaited. + /// + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful(); + + /// + /// How branches are represented in the checkpoint graph. Defaults to + /// . + /// + /// + /// Under each branch runs in a virtual + /// context that emits no per-branch CONTEXT checkpoint; per-branch + /// results and errors are recorded inline on the parallel operation's + /// payload instead. + /// + public NestingType NestingType { get; set; } = NestingType.Nested; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/README.md b/Libraries/src/Amazon.Lambda.DurableExecution/README.md new file mode 100644 index 000000000..5df3f6b23 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/README.md @@ -0,0 +1,110 @@ +# AWS Lambda Durable Execution SDK for .NET + +> **Preview.** `Amazon.Lambda.DurableExecution` is in active development (0.x). Public APIs may change before 1.0. + +`Amazon.Lambda.DurableExecution` is the .NET SDK for building resilient, long-running AWS Lambda functions that automatically checkpoint progress and resume after failures. Workflows can run for up to one year, with charges only for active compute time. + +## Key Features + +- **Automatic checkpointing** — progress is saved after each step; failures resume from the last checkpoint. +- **Cost-effective waits** — suspend execution for minutes, hours, or days without compute charges. 
+- **Configurable retries** — built-in retry strategies with exponential backoff and jitter. +- **Replay safety** — functions deterministically resume from checkpoints after interruptions. +- **Type safety** — full generic type support for step results. +- **AOT-friendly** — pluggable `ILambdaSerializer` so you can register `SourceGeneratorLambdaJsonSerializer` for trimmed / Native AOT functions. + +## How It Works + +Your handler delegates to `DurableFunction.WrapAsync`, which gives your workflow function an `IDurableContext`. The context is your interface to durable operations: + +- `ctx.StepAsync` — run code and checkpoint the result. ([docs](docs/core/steps.md)) +- `ctx.WaitAsync` — suspend execution without compute charges. ([docs](docs/core/wait.md)) +- `ctx.WaitForConditionAsync` — poll a check function until a condition is met, suspending between polls. ([docs](docs/core/wait-for-condition.md)) +- `ctx.CreateCallbackAsync` / `ctx.WaitForCallbackAsync` — wait for external events (approvals, webhooks). ([docs](docs/core/callbacks.md)) +- `ctx.RunInChildContextAsync` — run an isolated child context with its own checkpoint log. ([docs](docs/core/child-contexts.md)) +- `ctx.ParallelAsync` — run independent branches concurrently and aggregate their results. ([docs](docs/core/parallel.md)) + +## Quick Start + +### Installation + +```bash +dotnet add package Amazon.Lambda.DurableExecution +``` + +### Your first durable function + +> **Programming model:** the preview only supports the **executable programming model** — your function is an executable assembly that hosts its own bootstrap loop and passes the serializer to the runtime in code. Class-library handlers on the managed runtime will be supported once the changes made to Amazon.Lambda.RuntimeSupport to support durable functions have been deployed to the managed runtime. This README will be updated then. 
+ +A complete order-processing workflow with three steps and a wait, deployed as an executable assembly on the `dotnet10` runtime. `Main` builds a `LambdaBootstrap` with your handler and an `ILambdaSerializer`, and `DurableFunction.WrapAsync` uses that serializer to checkpoint step inputs and outputs. + +```csharp +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace OrderProcessor; + +public class OrderProcessor +{ + public static async Task Main() + { + var handler = new OrderProcessor(); + var serializer = new DefaultLambdaJsonSerializer(); + using var wrapper = HandlerWrapper.GetHandlerWrapper( + handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(wrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(Order order, IDurableContext ctx) + { + var reservation = await ctx.StepAsync( + async _ => await InventoryService.ReserveAsync(order.Items), + name: "reserve-inventory"); + + var payment = await ctx.StepAsync( + async _ => await PaymentService.ChargeAsync(order.PaymentMethod, order.Total), + name: "process-payment"); + + await ctx.WaitAsync(TimeSpan.FromHours(2), name: "warehouse-processing"); + + var shipment = await ctx.StepAsync( + async _ => await ShippingService.ShipAsync(reservation, order.Address), + name: "confirm-shipment"); + + return new OrderResult(order.Id, shipment.TrackingNumber); + } +} + +public record Order(string Id, IReadOnlyList Items, PaymentMethod PaymentMethod, decimal Total, Address Address); +public record OrderResult(string OrderId, string TrackingNumber); +``` + +For AOT or trim-friendly serialization, swap `DefaultLambdaJsonSerializer` for `SourceGeneratorLambdaJsonSerializer` and register your `JsonSerializerContext`. 
+ +## Documentation + +**Core operations** + +- [Steps](docs/core/steps.md) — execute code with automatic checkpointing, retry strategies, and at-least/at-most-once semantics. +- [Wait](docs/core/wait.md) — pause execution without compute charges. +- [Wait For Condition](docs/core/wait-for-condition.md) — poll until a condition is met, suspending between polls with a configurable wait strategy. +- [Callbacks](docs/core/callbacks.md) — wait for external systems to respond. +- [Child Contexts](docs/core/child-contexts.md) — group related operations into isolated, checkpointed units. +- [Parallel](docs/core/parallel.md) — fan out independent branches concurrently with configurable concurrency and completion policies. + +**Examples** + +End-to-end test functions (each paired with an integration test) live under `Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/`. + +## Related SDKs + +- [aws-durable-execution-sdk-java](https://github.com/aws/aws-durable-execution-sdk-java) — Java SDK +- [aws-durable-execution-sdk-js](https://github.com/aws/aws-durable-execution-sdk-js) — JavaScript / TypeScript SDK +- [aws-durable-execution-sdk-python](https://github.com/aws/aws-durable-execution-sdk-python) — Python SDK diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/RetryStrategy.cs b/Libraries/src/Amazon.Lambda.DurableExecution/RetryStrategy.cs new file mode 100644 index 000000000..c5acf2fb6 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/RetryStrategy.cs @@ -0,0 +1,193 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.RegularExpressions; +using Amazon.Lambda.DurableExecution.Internal; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Jitter strategy for exponential backoff to prevent thundering-herd scenarios. +/// +public enum JitterStrategy +{ + /// No randomization — delay is exactly the calculated backoff value. 
+ None, + /// Random delay between 0 and the calculated backoff value (recommended). + Full, + /// Random delay between 50% and 100% of the calculated backoff value. + Half +} + +/// +/// Controls whether a step re-executes if the Lambda is re-invoked mid-attempt. +/// +public enum StepSemantics +{ + /// + /// Default. The step may re-execute if the Lambda is re-invoked during execution. + /// Use for idempotent operations. + /// + AtLeastOncePerRetry, + + /// + /// The step executes at most once per retry attempt. A START checkpoint is written + /// before execution; on replay with an existing START, the SDK skips re-execution + /// and proceeds to the retry handler. + /// + AtMostOncePerRetry +} + +/// +/// Factory methods for common retry strategies. +/// +public static class RetryStrategy +{ + /// 6 attempts, 2x backoff, 5s initial delay, 60s max, Full jitter. + public static IRetryStrategy Default { get; } = Exponential( + maxAttempts: 6, + initialDelay: TimeSpan.FromSeconds(5), + maxDelay: TimeSpan.FromSeconds(60), + backoffRate: 2.0, + jitter: JitterStrategy.Full); + + /// 3 attempts, 2x backoff, 1s initial delay, 5s max, Half jitter. + public static IRetryStrategy Transient { get; } = Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(1), + maxDelay: TimeSpan.FromSeconds(5), + backoffRate: 2.0, + jitter: JitterStrategy.Half); + + /// No retry — 1 attempt only. + public static IRetryStrategy None { get; } = Exponential(maxAttempts: 1); + + /// + /// Creates an exponential backoff retry strategy. + /// + /// + /// Thrown if < 1, < 1, + /// is non-positive, is non-positive, + /// or > . + /// + public static IRetryStrategy Exponential( + int maxAttempts = 3, + TimeSpan? initialDelay = null, + TimeSpan? maxDelay = null, + double backoffRate = 2.0, + JitterStrategy jitter = JitterStrategy.Full, + Type[]? retryableExceptions = null, + string[]? 
retryableMessagePatterns = null) + { + return new ExponentialRetryStrategy( + maxAttempts, + initialDelay ?? TimeSpan.FromSeconds(5), + maxDelay ?? TimeSpan.FromSeconds(300), + backoffRate, + jitter, + retryableExceptions, + retryableMessagePatterns); + } + + /// + /// Creates a retry strategy from a delegate. + /// + /// Thrown if is null. + public static IRetryStrategy FromDelegate(Func strategy) + { + if (strategy == null) throw new ArgumentNullException(nameof(strategy)); + return new DelegateRetryStrategy(strategy); + } +} + +internal sealed class ExponentialRetryStrategy : IRetryStrategy +{ + private readonly int _maxAttempts; + private readonly TimeSpan _initialDelay; + private readonly TimeSpan _maxDelay; + private readonly double _backoffRate; + private readonly JitterStrategy _jitter; + private readonly Type[]? _retryableExceptions; + private readonly Regex[]? _retryableMessagePatterns; + + public ExponentialRetryStrategy( + int maxAttempts, + TimeSpan initialDelay, + TimeSpan maxDelay, + double backoffRate, + JitterStrategy jitter, + Type[]? retryableExceptions, + string[]? 
retryableMessagePatterns) + { + if (maxAttempts < 1) + throw new ArgumentOutOfRangeException(nameof(maxAttempts), maxAttempts, "must be >= 1"); + if (initialDelay <= TimeSpan.Zero) + throw new ArgumentOutOfRangeException(nameof(initialDelay), initialDelay, "must be > 0"); + if (maxDelay <= TimeSpan.Zero) + throw new ArgumentOutOfRangeException(nameof(maxDelay), maxDelay, "must be > 0"); + if (initialDelay > maxDelay) + throw new ArgumentOutOfRangeException(nameof(initialDelay), initialDelay, $"must be <= maxDelay ({maxDelay})"); + if (backoffRate < 1.0 || double.IsNaN(backoffRate) || double.IsInfinity(backoffRate)) + throw new ArgumentOutOfRangeException(nameof(backoffRate), backoffRate, "must be a finite value >= 1.0"); + + _maxAttempts = maxAttempts; + _initialDelay = initialDelay; + _maxDelay = maxDelay; + _backoffRate = backoffRate; + _jitter = jitter; + _retryableExceptions = retryableExceptions; + _retryableMessagePatterns = retryableMessagePatterns? + .Select(p => new Regex(p)) + .ToArray(); + } + + public RetryDecision ShouldRetry(Exception exception, int attemptNumber) + { + if (attemptNumber >= _maxAttempts) + return RetryDecision.DoNotRetry(); + + if (!IsRetryable(exception)) + return RetryDecision.DoNotRetry(); + + var delay = CalculateDelay(attemptNumber); + return RetryDecision.RetryAfter(delay); + } + + private bool IsRetryable(Exception exception) + { + if (_retryableExceptions == null && _retryableMessagePatterns == null) + return true; + + if (_retryableExceptions != null) + { + var exType = exception.GetType(); + if (_retryableExceptions.Any(t => t.IsAssignableFrom(exType))) + return true; + } + + if (_retryableMessagePatterns != null) + { + var message = exception.Message; + if (_retryableMessagePatterns.Any(p => p.IsMatch(message))) + return true; + } + + return false; + } + + internal TimeSpan CalculateDelay(int attemptNumber) + => ExponentialBackoff.CalculateDelay(attemptNumber, _initialDelay, _maxDelay, _backoffRate, _jitter); +} + +internal 
sealed class DelegateRetryStrategy : IRetryStrategy +{ + private readonly Func _strategy; + + public DelegateRetryStrategy(Func strategy) + { + _strategy = strategy ?? throw new ArgumentNullException(nameof(strategy)); + } + + public RetryDecision ShouldRetry(Exception exception, int attemptNumber) + => _strategy(exception, attemptNumber); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs new file mode 100644 index 000000000..a38dda31b --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs @@ -0,0 +1,203 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Model; +using Amazon.Runtime; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; +using SdkOperation = Amazon.Lambda.Model.Operation; +using Operation = Amazon.Lambda.DurableExecution.Operation; +using StepDetails = Amazon.Lambda.DurableExecution.StepDetails; +using WaitDetails = Amazon.Lambda.DurableExecution.WaitDetails; +using ExecutionDetails = Amazon.Lambda.DurableExecution.ExecutionDetails; +using ContextDetails = Amazon.Lambda.DurableExecution.ContextDetails; +using CallbackDetails = Amazon.Lambda.DurableExecution.CallbackDetails; +using ChainedInvokeDetails = Amazon.Lambda.DurableExecution.ChainedInvokeDetails; + +namespace Amazon.Lambda.DurableExecution.Services; + +/// +/// Calls the real AWS Lambda Durable Execution APIs via the AWSSDK.Lambda client. +/// +internal sealed class LambdaDurableServiceClient +{ + private readonly IAmazonLambda _lambdaClient; + + public LambdaDurableServiceClient(IAmazonLambda lambdaClient) + { + _lambdaClient = lambdaClient; + } + + /// + /// Flushes pending checkpoint operations to the durable execution service. 
+ /// SDK errors are wrapped in so user logs + /// show the durable-execution context (which API call, which ARN) alongside the + /// underlying SDK message — instead of a bare AWSSDK stack trace with no clue + /// about what was being called. + /// When is supplied, any + /// NewExecutionState.Operations the service returns (e.g. a freshly + /// allocated CallbackId after a callback START checkpoint, or a + /// timer-fired SUCCEEDED) are forwarded to the callback so the caller can + /// merge them into its in-memory . + /// + public async Task CheckpointAsync( + string durableExecutionArn, + string? checkpointToken, + IReadOnlyList pendingOperations, + Action>? onNewOperations = null, + CancellationToken cancellationToken = default) + { + if (pendingOperations.Count == 0) + return checkpointToken; + + var request = new CheckpointDurableExecutionRequest + { + DurableExecutionArn = durableExecutionArn, + CheckpointToken = checkpointToken ?? "", + Updates = pendingOperations is List list ? list : pendingOperations.ToList() + }; + + CheckpointDurableExecutionResponse response; + try + { + response = await _lambdaClient.CheckpointDurableExecutionAsync(request, cancellationToken); + } + catch (AmazonServiceException ex) + { + throw new DurableExecutionException( + $"Failed to checkpoint operations for durable execution '{durableExecutionArn}': {ex.Message}", + ex); + } + + // The service returns NewExecutionState carrying any operations updated + // since the last checkpoint — most importantly, the callback ID stamped + // onto a freshly-started CALLBACK op, plus any externally-completed + // callbacks/timers. Hand them to the caller (DurableFunction wires this + // back into ExecutionState) so subsequent replay-style lookups see the + // updated state immediately. 
+ var updated = response.NewExecutionState?.Operations; + if (onNewOperations != null && updated != null && updated.Count > 0) + { + var mapped = new List(updated.Count); + foreach (var sdkOp in updated) + mapped.Add(MapFromSdkOperation(sdkOp)); + onNewOperations(mapped); + } + + return response.CheckpointToken; + } + + /// + /// Fetches additional pages of execution state when the initial state is paginated. + /// SDK errors are wrapped in for the same + /// reason as . + /// + public async Task<(List Operations, string? NextMarker)> GetExecutionStateAsync( + string durableExecutionArn, + string? checkpointToken, + string marker, + CancellationToken cancellationToken = default) + { + var request = new GetDurableExecutionStateRequest + { + DurableExecutionArn = durableExecutionArn, + CheckpointToken = checkpointToken ?? "", + Marker = marker + }; + + GetDurableExecutionStateResponse response; + try + { + response = await _lambdaClient.GetDurableExecutionStateAsync(request, cancellationToken); + } + catch (AmazonServiceException ex) + { + throw new DurableExecutionException( + $"Failed to fetch execution state for durable execution '{durableExecutionArn}' (marker '{marker}'): {ex.Message}", + ex); + } + + var operations = new List(); + if (response.Operations != null) + { + foreach (var sdkOp in response.Operations) + { + operations.Add(MapFromSdkOperation(sdkOp)); + } + } + + return (operations, response.NextMarker); + } + + private static Operation MapFromSdkOperation(SdkOperation sdkOp) + { + return new Operation + { + Id = sdkOp.Id, + Type = sdkOp.Type, + Status = sdkOp.Status, + Name = sdkOp.Name, + ParentId = sdkOp.ParentId, + SubType = sdkOp.SubType, + StepDetails = sdkOp.StepDetails != null ? new StepDetails + { + Result = sdkOp.StepDetails.Result, + Error = MapError(sdkOp.StepDetails.Error), + Attempt = sdkOp.StepDetails.Attempt, + NextAttemptTimestamp = sdkOp.StepDetails.NextAttemptTimestamp.HasValue + ? 
new DateTimeOffset(sdkOp.StepDetails.NextAttemptTimestamp.Value, TimeSpan.Zero).ToUnixTimeMilliseconds() + : null + } : null, + WaitDetails = sdkOp.WaitDetails != null ? new WaitDetails + { + ScheduledEndTimestamp = sdkOp.WaitDetails.ScheduledEndTimestamp.HasValue + ? new DateTimeOffset(sdkOp.WaitDetails.ScheduledEndTimestamp.Value, TimeSpan.Zero).ToUnixTimeMilliseconds() + : null + } : null, + ExecutionDetails = sdkOp.ExecutionDetails != null ? new ExecutionDetails + { + InputPayload = sdkOp.ExecutionDetails.InputPayload + } : null, + ContextDetails = sdkOp.ContextDetails != null ? new ContextDetails + { + Result = sdkOp.ContextDetails.Result, + Error = MapError(sdkOp.ContextDetails.Error) + } : null, + CallbackDetails = sdkOp.CallbackDetails != null ? new CallbackDetails + { + CallbackId = sdkOp.CallbackDetails.CallbackId, + Result = sdkOp.CallbackDetails.Result, + Error = MapError(sdkOp.CallbackDetails.Error) + } : null, + ChainedInvokeDetails = sdkOp.ChainedInvokeDetails != null ? new ChainedInvokeDetails + { + Result = sdkOp.ChainedInvokeDetails.Result, + Error = MapError(sdkOp.ChainedInvokeDetails.Error) + } : null + }; + } + + /// + /// Maps an SDK into the + /// internal . Carries every field the wire object + /// exposes — ErrorType, ErrorMessage, ErrorData, and + /// StackTrace — so the durable execution exception builders + /// (, , and + /// the tree) can rehydrate the original + /// failure faithfully on real-service replay. + /// + private static ErrorObject? MapError(Amazon.Lambda.Model.ErrorObject? sdkError) + { + if (sdkError == null) return null; + return new ErrorObject + { + ErrorType = sdkError.ErrorType, + ErrorMessage = sdkError.ErrorMessage, + ErrorData = sdkError.ErrorData, + // SDK exposes List; assigning into IReadOnlyList? + // is reference-identical. A null list (SDK 4.x default when the + // field isn't set on the wire) propagates as null on our side. 
+ StackTrace = sdkError.StackTrace + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/StepConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/StepConfig.cs new file mode 100644 index 000000000..eea3dc791 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/StepConfig.cs @@ -0,0 +1,21 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for step execution. +/// +public sealed class StepConfig +{ + /// + /// Retry strategy for failed steps. When null (default), failures are not retried. + /// + public IRetryStrategy? RetryStrategy { get; set; } + + /// + /// Controls whether a step may re-execute if the Lambda is re-invoked mid-attempt. + /// Default is . + /// + public StepSemantics Semantics { get; set; } = StepSemantics.AtLeastOncePerRetry; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/UpperSnakeCaseEnumConverter.cs b/Libraries/src/Amazon.Lambda.DurableExecution/UpperSnakeCaseEnumConverter.cs new file mode 100644 index 000000000..1ebfe58a1 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/UpperSnakeCaseEnumConverter.cs @@ -0,0 +1,66 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Converts between UPPER_SNAKE_CASE wire format (e.g., CHAINED_INVOKE) +/// and PascalCase enum values (e.g., ChainedInvoke). 
+/// +public sealed class UpperSnakeCaseEnumConverter : JsonConverter where T : struct, Enum +{ + /// + public override T Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + { + if (reader.TokenType == JsonTokenType.Null) + return default; + + var value = reader.GetString(); + if (value == null) + return default; + + // Convert UPPER_SNAKE_CASE to PascalCase for enum lookup + var pascalCase = SnakeToPascal(value); + + if (Enum.TryParse(pascalCase, ignoreCase: true, out var result)) + return result; + + // Fallback: try direct case-insensitive parse of the raw value + if (Enum.TryParse(value, ignoreCase: true, out result)) + return result; + + throw new JsonException($"Unable to parse '{value}' as {typeof(T).Name}."); + } + + /// + public override void Write(Utf8JsonWriter writer, T value, JsonSerializerOptions options) + { + writer.WriteStringValue(PascalToSnake(value.ToString())); + } + + private static string SnakeToPascal(string snake) + { + var parts = snake.Split('_'); + for (int i = 0; i < parts.Length; i++) + { + if (parts[i].Length > 0) + parts[i] = char.ToUpperInvariant(parts[i][0]) + parts[i][1..].ToLowerInvariant(); // invariant casing: the result must not depend on CurrentCulture (tr-TR lower-cases 'I' to a dotless i, which breaks the enum lookup) + } + return string.Join("", parts); + } + + private static string PascalToSnake(string pascal) + { + var result = new System.Text.StringBuilder(); + for (int i = 0; i < pascal.Length; i++) + { + if (i > 0 && char.IsUpper(pascal[i])) + result.Append('_'); + result.Append(char.ToUpperInvariant(pascal[i])); // invariant casing keeps the emitted wire name identical in every culture (tr-TR upper-cases 'i' to a dotted capital I) + } + return result.ToString(); + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/WaitDecision.cs b/Libraries/src/Amazon.Lambda.DurableExecution/WaitDecision.cs new file mode 100644 index 000000000..f2b7dff98 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/WaitDecision.cs @@ -0,0 +1,42 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Decision returned by an on each polling +/// iteration: either stop polling (the condition has been met or attempts +/// have been exhausted) or continue after the 
given delay. +/// +public readonly record struct WaitDecision +{ + /// + /// True when the strategy wants the operation to keep polling; false when + /// the operation should terminate (condition satisfied or limit reached). + /// + public bool ShouldContinue { get; } + + /// + /// Delay before the next poll. Only meaningful when + /// is true; otherwise + /// . The wire-level timer floors this at 1 + /// second. + /// + public TimeSpan Delay { get; } + + private WaitDecision(bool shouldContinue, TimeSpan delay) + { + ShouldContinue = shouldContinue; + Delay = delay; + } + + /// + /// Stop polling. The current state is treated as the final result of the + /// wait-for-condition operation and returned to the caller. + /// + public static WaitDecision Stop() => new(false, TimeSpan.Zero); + + /// + /// Continue polling after the given delay. The Lambda is suspended until + /// the delay elapses, at which point the service re-invokes and the + /// condition is re-evaluated. + /// + public static WaitDecision ContinueAfter(TimeSpan delay) => new(true, delay); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/WaitForCallbackConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/WaitForCallbackConfig.cs new file mode 100644 index 000000000..90cf1f420 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/WaitForCallbackConfig.cs @@ -0,0 +1,21 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for the composite +/// +/// operation. Inherits the callback's and +/// ; adds a +/// for the submitter step. +/// +public class WaitForCallbackConfig : CallbackConfig +{ + /// + /// Retry strategy applied to the submitter step. When null (default), + /// submitter failures are not retried — the submitter step fails terminally + /// and surfaces as . + /// + public IRetryStrategy? 
RetryStrategy { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/WaitForConditionConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/WaitForConditionConfig.cs new file mode 100644 index 000000000..ea99a76ef --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/WaitForConditionConfig.cs @@ -0,0 +1,29 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for a WaitForConditionAsync polling operation. +/// +/// +/// Both properties are required: the strategy decides "continue or stop" +/// (per-call) and the initial state seeds the very first check invocation. +/// On replay, the latest checkpointed state is restored from the previous +/// RETRY checkpoint and used in place of ; this +/// is what makes the polling loop survive Lambda re-invocations +/// deterministically. +/// +/// The state type produced by the check function. +public sealed class WaitForConditionConfig +{ + /// + /// Initial state passed to the very first invocation of the check + /// function. Subsequent invocations receive the state returned by the + /// previous call. + /// + public required TState InitialState { get; set; } + + /// + /// Strategy that decides, after each check invocation, whether to keep + /// polling and how long to wait before the next attempt. + /// + public required IWaitStrategy WaitStrategy { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/WaitForConditionException.cs b/Libraries/src/Amazon.Lambda.DurableExecution/WaitForConditionException.cs new file mode 100644 index 000000000..411f63390 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/WaitForConditionException.cs @@ -0,0 +1,42 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Thrown when a WaitForConditionAsync operation reaches its +/// strategy's max-attempts limit without the condition being met. +/// +/// +/// Designed to be subclassable: future failure modes (e.g. 
timeout once that's +/// implemented) should be added as derived exceptions rather than discriminator +/// flags on this type, so users can catch them by static type. +/// exposes the most recently observed state so callers +/// can incorporate it into the failure path (logging, partial results, etc.). +/// +public class WaitForConditionException : DurableExecutionException +{ + /// + /// Number of attempts the strategy made before giving up. 1-based. + /// + public int AttemptsExhausted { get; init; } + + /// + /// The most recent state observed by the check function before the + /// strategy decided to stop. Boxed because the exception type is not + /// generic; callers cast to the workflow's known state type. + /// + /// + /// Populated identically on live execution and on replay: the operation + /// serializes the last observed state into the FAIL checkpoint payload, + /// so a re-invocation that hits the cached FAIL reconstructs the same + /// LastState the original execution surfaced. Will be null + /// only if the FAIL checkpoint predates this serialization (legacy data) + /// or if the serializer cannot round-trip the state. + /// + public object? LastState { get; init; } + + /// Creates an empty . + public WaitForConditionException() { } + /// Creates a with the given message. + public WaitForConditionException(string message) : base(message) { } + /// Creates a wrapping an inner exception. 
+ public WaitForConditionException(string message, Exception innerException) : base(message, innerException) { } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/WaitStrategy.cs b/Libraries/src/Amazon.Lambda.DurableExecution/WaitStrategy.cs new file mode 100644 index 000000000..a2aea1d19 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/WaitStrategy.cs @@ -0,0 +1,222 @@ +using Amazon.Lambda.DurableExecution.Internal; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Factory methods for built-in +/// implementations used with +/// IDurableContext.WaitForConditionAsync. +/// +/// +/// Each factory accepts an optional isDone predicate so users can +/// terminate polling declaratively when the latest state satisfies a +/// condition (e.g. state => state.IsReady) without implementing +/// themselves. If isDone is +/// null, the strategy polls until maxAttempts is exhausted — +/// at which point a is thrown by +/// the operation. Defaults are intentionally tuned for polling, not for +/// retry-on-exception: 60 attempts / 5s initial / 300s max / 1.5x backoff / +/// Full jitter (matches Python+JS+Java reference SDKs). +/// +public static class WaitStrategy +{ + /// + /// Exponential-backoff wait strategy. Defaults: 60 attempts, 5s initial + /// delay, 5min (300s) max delay, 1.5x backoff, Full jitter — matching + /// the Python, JS, and Java reference SDKs. + /// + /// Maximum polling attempts before the operation throws . + /// Delay before the second attempt; subsequent delays multiply by up to . + /// Cap on the per-attempt delay. + /// Multiplier applied per attempt. + /// Jitter strategy applied to each delay. + /// Optional predicate evaluated against the latest state; when it returns true, polling stops and the state is returned. + public static IWaitStrategy Exponential( + int maxAttempts = 60, + TimeSpan? initialDelay = null, + TimeSpan? 
maxDelay = null, + double backoffRate = 1.5, + JitterStrategy jitter = JitterStrategy.Full, + Func? isDone = null) + { + return new ExponentialWaitStrategy( + maxAttempts, + initialDelay ?? TimeSpan.FromSeconds(5), + maxDelay ?? TimeSpan.FromSeconds(300), + backoffRate, + jitter, + isDone); + } + + /// + /// Linear-growth wait strategy. The delay starts at + /// and grows by + /// each attempt, up to + /// . + /// + /// Maximum polling attempts before the operation throws . + /// Delay before the second attempt. + /// Amount added to the delay on each subsequent attempt. + /// Cap on the per-attempt delay; null means no cap. + /// Optional predicate evaluated against the latest state; when it returns true, polling stops and the state is returned. + public static IWaitStrategy Linear( + int maxAttempts = 60, + TimeSpan? initialDelay = null, + TimeSpan? increment = null, + TimeSpan? maxDelay = null, + Func? isDone = null) + { + return new LinearWaitStrategy( + maxAttempts, + initialDelay ?? TimeSpan.FromSeconds(5), + increment ?? TimeSpan.FromSeconds(5), + maxDelay, + isDone); + } + + /// + /// Fixed-delay wait strategy. Every poll waits the same + /// . + /// + /// Fixed delay between polls. + /// Maximum polling attempts before the operation throws . + /// Optional predicate evaluated against the latest state; when it returns true, polling stops and the state is returned. + public static IWaitStrategy Fixed( + TimeSpan delay, + int maxAttempts = 60, + Func? isDone = null) + { + return new FixedWaitStrategy(maxAttempts, delay, isDone); + } + + /// + /// Wraps an arbitrary delegate as an . 
+ /// + public static IWaitStrategy FromDelegate(Func strategy) + => new DelegateWaitStrategy(strategy); +} + +internal sealed class ExponentialWaitStrategy : IWaitStrategy +{ + private readonly int _maxAttempts; + private readonly TimeSpan _initialDelay; + private readonly TimeSpan _maxDelay; + private readonly double _backoffRate; + private readonly JitterStrategy _jitter; + private readonly Func? _isDone; + + public ExponentialWaitStrategy( + int maxAttempts, + TimeSpan initialDelay, + TimeSpan maxDelay, + double backoffRate, + JitterStrategy jitter, + Func? isDone) + { + _maxAttempts = maxAttempts; + _initialDelay = initialDelay; + _maxDelay = maxDelay; + _backoffRate = backoffRate; + _jitter = jitter; + _isDone = isDone; + } + + public WaitDecision Decide(TState state, int attemptNumber) + { + // Predicate satisfied → stop normally (operation SUCCEEDs). + if (_isDone != null && _isDone(state)) return WaitDecision.Stop(); + + // Attempts saturated → throw WaitForConditionException directly. + // Matches the JS reference SDK (wait-strategy-config.ts:54-57); lets + // the operation distinguish "condition met" (Stop) from "gave up" + // (exception) without a discriminator on WaitDecision. The operation + // catches, populates LastState (which the strategy doesn't have + // access to), checkpoints FAIL, and rethrows. + if (attemptNumber >= _maxAttempts) + throw new WaitForConditionException( + $"WaitForCondition exceeded maximum attempts ({_maxAttempts}).") + { + AttemptsExhausted = attemptNumber + }; + + var delay = ExponentialBackoff.CalculateDelay( + attemptNumber, _initialDelay, _maxDelay, _backoffRate, _jitter); + return WaitDecision.ContinueAfter(delay); + } +} + +internal sealed class LinearWaitStrategy : IWaitStrategy +{ + private readonly int _maxAttempts; + private readonly TimeSpan _initialDelay; + private readonly TimeSpan _increment; + private readonly TimeSpan? _maxDelay; + private readonly Func? 
_isDone; + + public LinearWaitStrategy( + int maxAttempts, + TimeSpan initialDelay, + TimeSpan increment, + TimeSpan? maxDelay, + Func? isDone) + { + _maxAttempts = maxAttempts; + _initialDelay = initialDelay; + _increment = increment; + _maxDelay = maxDelay; + _isDone = isDone; + } + + public WaitDecision Decide(TState state, int attemptNumber) + { + if (_isDone != null && _isDone(state)) return WaitDecision.Stop(); + if (attemptNumber >= _maxAttempts) + throw new WaitForConditionException( + $"WaitForCondition exceeded maximum attempts ({_maxAttempts}).") + { + AttemptsExhausted = attemptNumber + }; + + var rawSeconds = _initialDelay.TotalSeconds + _increment.TotalSeconds * (attemptNumber - 1); + if (_maxDelay is { } cap) rawSeconds = Math.Min(rawSeconds, cap.TotalSeconds); + + // Floor at 1 second to match the service timer granularity. + var seconds = Math.Max(1, Math.Ceiling(rawSeconds)); + return WaitDecision.ContinueAfter(TimeSpan.FromSeconds(seconds)); + } +} + +internal sealed class FixedWaitStrategy : IWaitStrategy +{ + private readonly int _maxAttempts; + private readonly TimeSpan _delay; + private readonly Func? _isDone; + + public FixedWaitStrategy(int maxAttempts, TimeSpan delay, Func? 
isDone) + { + _maxAttempts = maxAttempts; + _delay = delay; + _isDone = isDone; + } + + public WaitDecision Decide(TState state, int attemptNumber) + { + if (_isDone != null && _isDone(state)) return WaitDecision.Stop(); + if (attemptNumber >= _maxAttempts) + throw new WaitForConditionException( + $"WaitForCondition exceeded maximum attempts ({_maxAttempts}).") + { + AttemptsExhausted = attemptNumber + }; + + var seconds = Math.Max(1, Math.Ceiling(_delay.TotalSeconds)); + return WaitDecision.ContinueAfter(TimeSpan.FromSeconds(seconds)); + } +} + +internal sealed class DelegateWaitStrategy : IWaitStrategy +{ + private readonly Func _strategy; + public DelegateWaitStrategy(Func strategy) => _strategy = strategy ?? throw new ArgumentNullException(nameof(strategy)); // fail fast on a null delegate, matching DelegateRetryStrategy, instead of a NullReferenceException on the first Decide call + public WaitDecision Decide(TState state, int attemptNumber) => _strategy(state, attemptNumber); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/callbacks.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/callbacks.md new file mode 100644 index 000000000..573ad17e3 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/callbacks.md @@ -0,0 +1,185 @@ +# Callbacks + +Callbacks let a workflow suspend until an external system (a human approver, a webhook, another service) delivers a result. The external system completes the callback by calling `SendDurableExecutionCallbackSuccess`, `SendDurableExecutionCallbackFailure`, or `SendDurableExecutionCallbackHeartbeat` with the `callbackId` you handed it. + +Two APIs are available: + +- `WaitForCallbackAsync` — composite operation; create the callback, hand it to the external system inside a submitter delegate, and suspend until the result arrives. +- `CreateCallbackAsync` — lower-level; allocate the callback yourself, hand the ID out in your own steps, and `await` the result separately. + +## `WaitForCallbackAsync` + +```csharp +Task WaitForCallbackAsync( + Func submitter, + string? name = null, + WaitForCallbackConfig? 
config = null, + CancellationToken cancellationToken = default); +``` + +The submitter receives the freshly allocated `callbackId` and an `IWaitForCallbackContext` (logger-only). Submitter failures (after retries are exhausted) surface as `CallbackSubmitterException`; callback failures and timeouts surface as `CallbackFailedException` / `CallbackTimeoutException`. + +## `CreateCallbackAsync` + +```csharp +Task> CreateCallbackAsync( + string? name = null, + CallbackConfig? config = null, + CancellationToken cancellationToken = default); +``` + +The returned `ICallback` exposes: + +- `string CallbackId` — give this to the external system. +- `Task GetResultAsync(CancellationToken)` — `await` to suspend until the external system completes the callback. + +The result is deserialized using the registered `ILambdaSerializer`. Throws `CallbackFailedException` or `CallbackTimeoutException` on failure. + +## End-to-end example + +Two Lambdas: a workflow that suspends on a callback, and a separate approver Lambda that resolves it. The workflow hands its `callbackId` to the approver via `Event` invocation (fire-and-forget), then suspends. The approver runs in its own Lambda and signals completion by calling `SendDurableExecutionCallbackSuccessAsync`. + +### 1. 
Workflow Lambda — `WaitForCallbackAsync` + +```csharp +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace OrderApprovalWorkflow; + +public class Function +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main() + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var wrapper = HandlerWrapper.GetHandlerWrapper( + handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(wrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(OrderInput input, IDurableContext ctx) + { + var approverFunctionName = Environment.GetEnvironmentVariable("APPROVER_FUNCTION_NAME") + ?? throw new InvalidOperationException("APPROVER_FUNCTION_NAME env var not set"); + + // Suspend until the approver Lambda calls SendDurableExecutionCallbackSuccessAsync + // with this callback ID. The submitter is invoked once with a freshly-allocated + // ID; it hands the ID to the approver and returns immediately. + var result = await ctx.WaitForCallbackAsync( + submitter: async (callbackId, cbCtx) => + { + var payload = $$"""{"callbackId":"{{callbackId}}","orderId":"{{input.OrderId}}"}"""; + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = approverFunctionName, + InvocationType = InvocationType.Event, // fire-and-forget + Payload = payload + }); + }, + name: "approve"); + + return result; + } +} + +public record OrderInput(string OrderId); +public record ApprovalResult(string Status, string ApprovedBy); +``` + +### 2. Approver Lambda — completes the callback + +A plain Lambda — no durable execution wrapper. 
It receives the callback ID, performs whatever logic the external system needs, and calls `SendDurableExecutionCallbackSuccessAsync` to resume the workflow. + +```csharp +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace OrderApprovalWorkflow; + +public class ApproverFunction +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main() + { + var handler = new ApproverFunction(); + var serializer = new DefaultLambdaJsonSerializer(); + using var wrapper = HandlerWrapper.GetHandlerWrapper( + handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(wrapper); + await bootstrap.RunAsync(); + } + + public async Task Handler(ApproverInput input, ILambdaContext context) + { + // The result JSON must match the T in WaitForCallbackAsync — here, ApprovalResult. + var resultJson = $$"""{"Status":"approved","ApprovedBy":"{{input.OrderId}}"}"""; + await LambdaClient.SendDurableExecutionCallbackSuccessAsync( + new SendDurableExecutionCallbackSuccessRequest + { + CallbackId = input.CallbackId, + Result = new MemoryStream(Encoding.UTF8.GetBytes(resultJson)) + }); + return null; + } +} + +public record ApproverInput(string CallbackId, string OrderId); +``` + +To signal failure instead, call `SendDurableExecutionCallbackFailureAsync` — the workflow throws `CallbackFailedException`. To extend the heartbeat deadline (when `HeartbeatTimeout` is configured), call `SendDurableExecutionCallbackHeartbeatAsync`. + +### `CreateCallbackAsync` variant + +When you need to allocate the ID before deciding how to hand it out — e.g. several steps run between callback creation and submission — use `CreateCallbackAsync` and a separate `StepAsync` for the submission. Wrapping the hand-off in a step prevents replays from re-invoking the approver. 
+ +```csharp +private async Task Workflow(OrderInput input, IDurableContext ctx) +{ + var cb = await ctx.CreateCallbackAsync(name: "approve"); + + await ctx.StepAsync(async _ => + { + var payload = $$"""{"callbackId":"{{cb.CallbackId}}","orderId":"{{input.OrderId}}"}"""; + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = approverFunctionName, // read from the APPROVER_FUNCTION_NAME env var, as in the workflow Lambda example above + InvocationType = InvocationType.Event, + Payload = payload + }); + }, name: "submit"); + + return await cb.GetResultAsync(); +} +``` + +## Configuration + +```csharp +public class CallbackConfig +{ + public TimeSpan Timeout { get; set; } // overall callback timeout, ≥ 1s or Zero (default = no timeout) + public TimeSpan HeartbeatTimeout { get; set; } // heartbeat-gap timeout, ≥ 1s or Zero (default = no timeout) +} + +public class WaitForCallbackConfig : CallbackConfig +{ + public IRetryStrategy? RetryStrategy { get; set; } // applied to the submitter step only +} +``` diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/child-contexts.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/child-contexts.md new file mode 100644 index 000000000..4a664e11e --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/child-contexts.md @@ -0,0 +1,46 @@ +# Child Contexts + +`RunInChildContextAsync` runs a sub-workflow inside its own deterministic operation-ID space. The child's return value is checkpointed as a single `CONTEXT` operation, so subsequent invocations replay the cached value without re-executing the contained operations. Use it to group related steps under a shared error/observability boundary. + +## Signatures + +```csharp +Task RunInChildContextAsync( + Func> func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default); + +Task RunInChildContextAsync( + Func func, + string? name = null, + ChildContextConfig? 
config = null, + CancellationToken cancellationToken = default); +``` + +## Example + +```csharp +var phaseResult = await ctx.RunInChildContextAsync( + async childCtx => + { + var validated = await childCtx.StepAsync(async _ => Validate(input), name: "validate"); + await childCtx.WaitAsync(TimeSpan.FromSeconds(2), name: "short_wait"); + var processed = await childCtx.StepAsync(async _ => Process(validated), name: "process"); + return processed; + }, + name: "phase", + config: new ChildContextConfig { SubType = "OrderProcessing" }); +``` + +## Configuration + +```csharp +public sealed class ChildContextConfig +{ + public string? SubType { get; set; } // observability label + public Func? ErrorMapping { get; set; } // remap thrown exceptions +} +``` + +`ErrorMapping` lets you translate exceptions thrown inside the child context into a domain-specific exception type before they propagate to the parent. diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/parallel.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/parallel.md new file mode 100644 index 000000000..5666d3f14 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/parallel.md @@ -0,0 +1,136 @@ +# Parallel + +`ParallelAsync` runs N branches concurrently, each in its own child context, and returns an `IBatchResult` aggregating the per-branch outcomes. Each branch is checkpointed independently, so the fan-out survives Lambda re-invocations: branches that already completed are restored from their checkpoints on replay rather than re-run. + +Use it to fan out independent work — calling several services at once, processing a set of items, racing redundant providers — when the branches don't depend on one another. For a sequential series of checkpointed operations, use [`StepAsync`](steps.md) instead; for an isolated single child context, use [`RunInChildContextAsync`](child-contexts.md). 
+ +## Signature + +```csharp +// Unnamed branches — IBatchItem.Name is null; index is used for identity. +Task> ParallelAsync( + IReadOnlyList>> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + +// Named branches — the name surfaces on IBatchItem.Name and in execution traces. +Task> ParallelAsync( + IReadOnlyList> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); +``` + +Each branch receives its own `IDurableContext`, so a branch can itself use steps, waits, and nested durable operations. Branch results are serialized to per-branch checkpoints via the `ILambdaSerializer` registered on `ILambdaContext.Serializer`. The operation `name` is used for observability and to derive the deterministic operation ID, so keep it stable across deployments. + +## Example + +Fan out three independent lookups and collect the results: + +```csharp +var batch = await ctx.ParallelAsync( + new[] + { + new DurableBranch("primary", async branchCtx => + await branchCtx.StepAsync(_ => primaryProvider.QuoteAsync(order), name: "quote")), + new DurableBranch("secondary", async branchCtx => + await branchCtx.StepAsync(_ => secondaryProvider.QuoteAsync(order), name: "quote")), + new DurableBranch("tertiary", async branchCtx => + await branchCtx.StepAsync(_ => tertiaryProvider.QuoteAsync(order), name: "quote")), + }, + name: "fan-out-quotes"); + +var quotes = batch.GetResults(); // all three, in original branch order +``` + +With the default completion policy (`AllSuccessful`), any single branch failure surfaces as a `ParallelException` when the result is awaited. + +## Configuration + +```csharp +public sealed class ParallelConfig +{ + public int? 
MaxConcurrency { get; set; } // null = unlimited; must be >= 1 when set + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful(); + public NestingType NestingType { get; set; } = NestingType.Nested; // Flat omits the per-branch CONTEXT checkpoints +} +``` + +`MaxConcurrency` bounds how many branches run at once via a semaphore — useful to avoid overwhelming a downstream service. `NestingType.Nested` (default) gives each branch a full child context visible in traces; `NestingType.Flat` runs each branch in a virtual context that emits no per-branch `CONTEXT` checkpoint — per-branch results and errors are recorded inline on the parallel operation's payload, reducing checkpoint volume, while operations inside a branch (steps, waits) still checkpoint, re-parented to the parallel operation. + +## Completion policies + +`CompletionConfig` decides when the batch resolves and whether it resolves as success or failure. Construct it via the static factories or set the threshold properties directly; multiple criteria combine, and the batch resolves as soon as any one is met or violated. + +| Factory | Behavior | +| --- | --- | +| `CompletionConfig.AllSuccessful()` | Every branch must succeed (equivalent to `ToleratedFailureCount = 0`). The first failure resolves the batch as failed. **Default.** | +| `CompletionConfig.AllCompleted()` | Run every branch to a terminal state regardless of failures; never auto-throws. Inspect `Succeeded` / `Failed` (or call `ThrowIfError`) afterward. | +| `CompletionConfig.FirstSuccessful()` | Resolve as soon as one branch succeeds (`MinSuccessful = 1`). Branches not yet dispatched are reported as `Started`. | + +For finer control, set the properties yourself: + +```csharp +public sealed class CompletionConfig +{ + public int? MinSuccessful { get; set; } // resolve once this many branches succeed; null = no minimum + public int? ToleratedFailureCount { get; set; } // fail when failures strictly exceed this count + public double?
ToleratedFailurePercentage { get; set; } // fail when failure ratio strictly exceeds this [0.0–1.0] +} +``` + +The criterion that caused the batch to resolve is recorded on the result as a `CompletionReason`: `AllCompleted`, `MinSuccessfulReached`, or `FailureToleranceExceeded`. + +## Inspecting results + +`IBatchResult` exposes both aggregate counts and per-branch items: + +```csharp +batch.All // IReadOnlyList<IBatchItem<T>>, original index order +batch.Succeeded // items with Status == Succeeded +batch.Failed // items with Status == Failed +batch.Started // items not dispatched before a short-circuit resolved the batch + +batch.GetResults(); // IReadOnlyList<T> of successful results — never throws +batch.GetErrors(); // IReadOnlyList of failures +batch.ThrowIfError(); // throw the first failure, if any + +batch.SuccessCount; // also FailureCount, StartedCount, TotalCount, HasFailure +batch.CompletionReason; +``` + +Each `IBatchItem` carries `Index`, `Name`, `Status` (`Succeeded` / `Failed` / `Started`), `Result` (populated only when succeeded), and `Error` (populated only when failed). + +## Failure handling + +```csharp +// Drive every branch to completion, then inspect partial results. +var batch = await ctx.ParallelAsync( + branches, + name: "process-items", + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + +foreach (var item in batch.Failed) +{ + ctx.Logger.LogWarning("Branch {Name} failed: {Error}", item.Name, item.Error?.Message); +} + +var succeeded = batch.GetResults(); +``` + +With the default `AllSuccessful` policy, awaiting a batch in which a branch failed throws `ParallelException`.
The exception carries the type-erased `Result` (cast to `IBatchResult` to inspect per-branch detail) and the `CompletionReason`: + +```csharp +try +{ + var batch = await ctx.ParallelAsync(branches, name: "fan-out"); +} +catch (ParallelException ex) +{ + var result = (IBatchResult?)ex.Result; + ctx.Logger.LogWarning( + "Parallel operation failed ({Reason}); {Failed} of {Total} branches failed.", + ex.CompletionReason, result?.FailureCount, result?.TotalCount); +} +``` diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/steps.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/steps.md new file mode 100644 index 000000000..c7f9e9f22 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/steps.md @@ -0,0 +1,148 @@ +# Steps + +`StepAsync` runs a unit of work whose result is checkpointed. On replay, completed steps return their cached result without re-executing. + +## Signatures + +```csharp +Task StepAsync( + Func> func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + +Task StepAsync( + Func func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); +``` + +The `IStepContext` parameter exposes the current `AttemptNumber`, the deterministic `OperationId`, and a scoped `Logger`. Returned values are serialized via the `ILambdaSerializer` registered on `ILambdaContext.Serializer`. + +## Basic step + +```csharp +var user = await ctx.StepAsync( + async _ => await userService.GetUserAsync(userId), + name: "fetch-user"); +``` + +## Multiple steps + +```csharp +var a = await ctx.StepAsync(async _ => $"a-{input.OrderId}", name: "step_1"); +var b = await ctx.StepAsync(async _ => $"{a}-b", name: "step_2"); +var c = await ctx.StepAsync(async _ => $"{b}-c", name: "step_3"); +``` + +## Step configuration + +Configure step behavior with `StepConfig`: + +```csharp +public sealed class StepConfig +{ + public IRetryStrategy? 
RetryStrategy { get; set; } // null = no retry + public StepSemantics Semantics { get; set; } = StepSemantics.AtLeastOncePerRetry; +} +``` + +### Retry strategies + +When a step throws, the configured `IRetryStrategy` decides whether to retry and after what delay. + +```csharp +public interface IRetryStrategy +{ + RetryDecision ShouldRetry(Exception exception, int attemptNumber); +} + +public readonly struct RetryDecision +{ + public bool ShouldRetry { get; } + public TimeSpan Delay { get; } + + public static RetryDecision DoNotRetry(); + public static RetryDecision RetryAfter(TimeSpan delay); +} +``` + +Built-in strategies on the `RetryStrategy` static class: + +| Member | Behavior | +| --- | --- | +| `RetryStrategy.Default` | 6 attempts, 2× backoff, 5s initial, 60s max, Full jitter. | +| `RetryStrategy.Transient` | 3 attempts, 2× backoff, 1s initial, 5s max, Half jitter. | +| `RetryStrategy.None` | 1 attempt only — no retry. | +| `RetryStrategy.Exponential(...)` | Builder for custom exponential strategies. | +| `RetryStrategy.FromDelegate(Func)` | Wrap a custom decision function. | + +`Exponential` parameters: + +```csharp +public static IRetryStrategy Exponential( + int maxAttempts = 3, + TimeSpan? initialDelay = null, // default 5s + TimeSpan? maxDelay = null, // default 300s + double backoffRate = 2.0, + JitterStrategy jitter = JitterStrategy.Full, + Type[]? retryableExceptions = null, + string[]? retryableMessagePatterns = null); + +public enum JitterStrategy { None, Full, Half } +``` + +When `retryableExceptions` and `retryableMessagePatterns` are both null (default), every exception is retried up to `maxAttempts`. If either is set, only matching exceptions are retried. 
+ +#### Step with retries + +```csharp +var result = await ctx.StepAsync( + async stepCtx => + { + if (stepCtx.AttemptNumber < 3) + throw new InvalidOperationException($"flake on attempt {stepCtx.AttemptNumber}"); + return $"ok on attempt {stepCtx.AttemptNumber}"; + }, + name: "flaky_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(10), + backoffRate: 2.0, + jitter: JitterStrategy.None) + }); +``` + +### Step semantics + +Control how a step behaves when interrupted mid-execution: + +```csharp +public enum StepSemantics +{ + AtLeastOncePerRetry, // default — body may re-execute if Lambda is re-invoked mid-attempt + AtMostOncePerRetry // body executes at most once per retry attempt +} +``` + +| Semantic | Behavior | Use case | +| --- | --- | --- | +| `AtLeastOncePerRetry` (default) | Re-executes the step if interrupted before completion. | Idempotent operations (database upserts, API calls with idempotency keys). | +| `AtMostOncePerRetry` | Never re-executes an interrupted attempt; the attempt is recorded as failed with `StepInterruptedException` and the retry strategy decides whether a new attempt runs. | Non-idempotent operations (sending email, charging payments). | + +These semantics apply *per retry attempt*, not per overall execution.
To achieve true at-most-once across the whole workflow, combine with `RetryStrategy.None`: + +```csharp +var result = await ctx.StepAsync( + async _ => await paymentService.ChargeAsync(amount), + name: "charge-payment", + config: new StepConfig + { + Semantics = StepSemantics.AtMostOncePerRetry, + RetryStrategy = RetryStrategy.None + }); +``` diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/wait-for-condition.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/wait-for-condition.md new file mode 100644 index 000000000..93ea3f4d9 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/wait-for-condition.md @@ -0,0 +1,116 @@ +# Wait For Condition + +`WaitForConditionAsync` polls a check function until a wait strategy decides to stop. Between polls the workflow is suspended — the Lambda terminates and is re-invoked when the strategy's chosen delay elapses, so you pay for compute time only while the check actually runs. + +Use it when you're waiting on something whose readiness you can only learn by *asking* (an order settling, a file landing in S3, an external job finishing) rather than waiting a fixed duration. For a fixed-duration pause, use [`WaitAsync`](wait.md) instead. + +## Signature + +```csharp +Task WaitForConditionAsync( + Func> check, + WaitForConditionConfig config, + string? name = null, + CancellationToken cancellationToken = default); +``` + +On every iteration the `check` function receives the state returned by the previous invocation — seeded by `config.InitialState` on the very first call — and returns the next state. The configured `IWaitStrategy` then decides whether to keep polling and how long to wait. State is checkpointed each iteration, so the polling loop survives Lambda re-invocations deterministically and you can carry per-poll bookkeeping (a cursor, a counter) inside the state itself. + +The `IConditionCheckContext` parameter exposes the current `AttemptNumber` (1-based) and a scoped `Logger`. 
The returned state is serialized via the `ILambdaSerializer` registered on `ILambdaContext.Serializer`. + +When the strategy stops because its `maxAttempts` limit is reached — rather than because the condition was met — the operation throws `WaitForConditionException` carrying `AttemptsExhausted` and the last observed `LastState`. + +## Example + +Poll an order's status until it reaches a terminal value: + +```csharp +var finalStatus = await ctx.WaitForConditionAsync( + check: async (state, checkCtx) => + { + checkCtx.Logger.LogInformation("Polling order on attempt {Attempt}", checkCtx.AttemptNumber); + return await orderService.GetStatusAsync(orderId); + }, + config: new WaitForConditionConfig + { + InitialState = OrderStatus.Unknown, + WaitStrategy = WaitStrategy.Exponential( + isDone: s => s == OrderStatus.Completed || s == OrderStatus.Cancelled) + }, + name: "wait-for-order-settle"); +``` + +## Configuration + +```csharp +public sealed class WaitForConditionConfig +{ + public required TState InitialState { get; set; } // seeds the first check call + public required IWaitStrategy WaitStrategy { get; set; } // decides continue/stop + delay +} +``` + +## Wait strategies + +The check function reports state; the `IWaitStrategy` decides what to do with it. Each built-in strategy on the `WaitStrategy` static class accepts an optional `isDone` predicate, so the common case — stop when the latest state satisfies a condition — stays declarative without implementing the interface yourself. + +| Member | Behavior | +| --- | --- | +| `WaitStrategy.Exponential(...)` | Delay grows by `backoffRate` each attempt, up to `maxDelay`. Defaults: 60 attempts, 5s initial, 300s max, 1.5× backoff, Full jitter. | +| `WaitStrategy.Linear(...)` | Delay grows by a fixed `increment` each attempt, optionally capped at `maxDelay`. Defaults: 60 attempts, 5s initial, 5s increment. | +| `WaitStrategy.Fixed(delay, ...)` | Every poll waits the same `delay`. Default: 60 attempts. 
| +| `WaitStrategy.FromDelegate(Func)` | Wrap a custom decision function. | + +These defaults are tuned for *polling*, not retry-on-exception, and match the Python, JS, and Java reference SDKs. + +```csharp +public static IWaitStrategy Exponential( + int maxAttempts = 60, + TimeSpan? initialDelay = null, // default 5s + TimeSpan? maxDelay = null, // default 300s + double backoffRate = 1.5, + JitterStrategy jitter = JitterStrategy.Full, + Func? isDone = null); +``` + +### Custom strategies + +For richer logic (wall-clock budgets, conditional jitter), use `FromDelegate` or implement `IWaitStrategy` directly. `Decide` returns a `WaitDecision`: + +```csharp +public interface IWaitStrategy +{ + WaitDecision Decide(TState state, int attemptNumber); +} + +public readonly record struct WaitDecision +{ + public bool ShouldContinue { get; } + public TimeSpan Delay { get; } // floored at 1s by the service timer + + public static WaitDecision Stop(); // condition met → return current state + public static WaitDecision ContinueAfter(TimeSpan delay); // suspend, re-evaluate after delay +} +``` + +`Stop()` ends the operation successfully and returns the latest state. `ContinueAfter(delay)` suspends the Lambda until the delay elapses. To signal "gave up," a strategy throws `WaitForConditionException` (the built-in strategies do this when `attemptNumber` reaches `maxAttempts`). + +## `WaitForCondition` vs. retries + +`IWaitStrategy` is distinct from [`IRetryStrategy`](steps.md#retry-strategies): a retry strategy decides whether to retry *after an exception* (its input is the thrown `Exception`), while a wait strategy decides whether to keep polling *based on observed state* (its input is the latest `TState`). If the check function itself throws, that error surfaces as a `StepException` — the wait strategy is not consulted. 
+ +## Failure handling + +```csharp +try +{ + var status = await ctx.WaitForConditionAsync(check, config, name: "poll-job"); +} +catch (WaitForConditionException ex) +{ + // Strategy exhausted its attempts without the condition being met. + var attempts = ex.AttemptsExhausted; + var last = (JobStatus?)ex.LastState; // boxed — cast to your state type + ctx.Logger.LogWarning("Gave up after {Attempts} polls; last status was {Status}", attempts, last); +} +``` diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/wait.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/wait.md new file mode 100644 index 000000000..d7d2679f4 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/wait.md @@ -0,0 +1,28 @@ +# Wait + +`WaitAsync` suspends the workflow for a duration. The Lambda terminates and is re-invoked when the timer fires — you pay for compute time only on the resume side. + +## Signature + +```csharp +Task WaitAsync( + TimeSpan duration, + string? name = null, + CancellationToken cancellationToken = default); +``` + +`duration` must be at least 1 second and at most 31,622,400 seconds (~1 year). 
+ +## Example + +```csharp +await ctx.WaitAsync(TimeSpan.FromHours(2), name: "warehouse-processing"); +``` + +## Step + Wait + Step + +```csharp +var validated = await ctx.StepAsync(async _ => Validate(input), name: "validate"); +await ctx.WaitAsync(TimeSpan.FromSeconds(3), name: "short_wait"); +var processed = await ctx.StepAsync(async _ => Process(validated), name: "process"); +``` diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Amazon.Lambda.DurableExecution.AotPublishTest.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Amazon.Lambda.DurableExecution.AotPublishTest.csproj new file mode 100644 index 000000000..ec4d0ffd0 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Amazon.Lambda.DurableExecution.AotPublishTest.csproj @@ -0,0 +1,24 @@ + + + + Exe + net8.0 + enable + enable + true + true + full + false + true + IL2026,IL2067,IL2075,IL3050 + false + + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Program.cs b/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Program.cs new file mode 100644 index 000000000..41404ca96 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Program.cs @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace Amazon.Lambda.DurableExecution.AotPublishTest; + +/// +/// AOT publish smoke check. This program must publish under NativeAOT with +/// zero IL2026/IL3050 warnings (promoted to errors by the csproj). The serializer +/// registered with is the same one DurableExecution +/// reads via , so AOT-safety is fully determined +/// by the user's choice of serializer (here, ). 
+/// +public class Program +{ + public static async Task Main() + { + var serializer = new SourceGeneratorLambdaJsonSerializer(); + Func> handler = HandlerAsync; + await LambdaBootstrapBuilder + .Create(handler, serializer) + .Build() + .RunAsync(); + } + + public static Task HandlerAsync( + DurableExecutionInvocationInput input, ILambdaContext context) => + DurableFunction.WrapAsync(WorkflowAsync, input, context); + + private static async Task WorkflowAsync(OrderEvent input, IDurableContext context) + { + var validation = await context.StepAsync( + async (_) => + { + await Task.CompletedTask; + return new ValidationResult { IsValid = true }; + }, + name: "validate"); + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "delay"); + + return new OrderResult { Status = validation.IsValid ? "approved" : "rejected", OrderId = input.OrderId }; + } + + public class OrderEvent + { + public string? OrderId { get; set; } + } + + public class OrderResult + { + public string? Status { get; set; } + public string? 
OrderId { get; set; } + } + + public class ValidationResult + { + public bool IsValid { get; set; } + } +} + +[JsonSerializable(typeof(DurableExecutionInvocationInput))] +[JsonSerializable(typeof(DurableExecutionInvocationOutput))] +[JsonSerializable(typeof(Program.OrderEvent))] +[JsonSerializable(typeof(Program.OrderResult))] +[JsonSerializable(typeof(Program.ValidationResult))] +public partial class AotJsonContext : JsonSerializerContext +{ +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj new file mode 100644 index 000000000..8dda2b047 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj @@ -0,0 +1,44 @@ + + + + + + + $(DefaultPackageTargets) + enable + enable + false + true + $(NoWarn);NU1903;CS1591 + + + + + + + + + + + + + PreserveNewest + + + + + + + + + + + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/AtMostOnceCrashReplayTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/AtMostOnceCrashReplayTest.cs new file mode 100644 index 000000000..ae36c22dd --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/AtMostOnceCrashReplayTest.cs @@ -0,0 +1,84 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class AtMostOnceCrashReplayTest +{ + private readonly ITestOutputHelper _output; + public AtMostOnceCrashReplayTest(ITestOutputHelper output) => _output = output; + + /// + /// Validates the AtMostOncePerRetry crash-recovery wire path: the Lambda + /// process is killed mid-step on attempt 1 (after START flush, before + /// SUCCEED). On re-invocation the SDK sees a STARTED checkpoint with no + /// terminal record and routes through the retry strategy rather than + /// re-executing the step. Attempt 2 succeeds. + /// + /// This is the only path that exercises the StepInterruptedException + /// synthesis — the unit-test analogue + /// (StepAsync_AtMostOnce_StartedReplay_TriggersRetryHandler) fakes the + /// STARTED state in-memory and never proves the service actually delivers + /// it on a real crash. + /// + [Fact] + public async Task AtMostOnce_StepCrashesMidExecution_RecoversViaRetry() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("AtMostOnceCrashFunction"), + "amocrash", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // 2s retry delay + initial-attempt cold-start + recovery invoke. Generous headroom. 
+ var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.StepSucceededDetails != null && e.Name == "crash_then_recover") ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // Attempt 1 was crashed (no SUCCEED), attempt 2 recovered. + // We expect exactly one StepSucceeded carrying "recovered on attempt 2". + var succeeded = events.SingleOrDefault(e => e.StepSucceededDetails != null && e.Name == "crash_then_recover"); + Assert.NotNull(succeeded); + Assert.Equal("\"recovered on attempt 2\"", succeeded!.StepSucceededDetails.Result?.Payload); + + // Two StepStarted events: one per invocation. + Assert.True( + events.Count(e => e.EventType == EventType.StepStarted) >= 2, + "Expected at least 2 StepStarted events (attempt 1 crashed, attempt 2 recovered)."); + + // The crash-recovery branch records the synthesized StepInterruptedException + // as a StepFailed event for attempt 1, with a message identifying the lost + // attempt rather than a user exception type. + var failures = events + .Where(e => e.StepFailedDetails != null && e.Name == "crash_then_recover") + .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty) + .ToList(); + Assert.NotEmpty(failures); + Assert.Contains(failures, m => m.Contains("Step result lost", StringComparison.OrdinalIgnoreCase) + || m.Contains("interrupted", StringComparison.OrdinalIgnoreCase) + || m.Contains("previous attempt", StringComparison.OrdinalIgnoreCase)); + + // The execution actually crossed at least one invocation boundary + // (otherwise replay wasn't exercised at all). 
+ var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected at least 2 InvocationCompleted events (proves crash + replay), got {invocations.Count}"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackFailedTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackFailedTest.cs new file mode 100644 index 000000000..3a1e6c2c9 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackFailedTest.cs @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class CallbackFailedTest +{ + private readonly ITestOutputHelper _output; + public CallbackFailedTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end failure path for CreateCallbackAsync: + /// a paired RejecterFunction Lambda (Event-invoked from the workflow) + /// reports a failure via SendDurableExecutionCallbackFailure. The SDK + /// raises from GetResultAsync, + /// and the workflow surfaces FAILED with that exception type recorded. + /// + /// + /// The callback delivery has to come from a separate Lambda — not from the + /// test process — because the test's synchronous InvokeAsync blocks + /// until the durable execution reaches a terminal state. If the test tried + /// to deliver the callback itself, it would deadlock against its own + /// blocked Invoke. 
+ /// + [Fact] + public async Task CallbackFailed_SurfacesAsCallbackFailedException() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("CallbackFailedFunction"), + "cb-failed", _output, + externalFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("RejecterFunction")); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Initial response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("FAILED", status, ignoreCase: true); + + // The workflow's surfaced exception is CallbackFailedException — the SDK + // wraps the external error message into the exception's Message. Verify + // the recorded error type is the SDK's CallbackFailedException and that + // the original failure message survives. + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + Assert.Equal(typeof(CallbackFailedException).FullName, execution.Error.ErrorType); + Assert.Contains("rejected", execution.Error.ErrorMessage); + + // History records both Started and Failed for the same callback. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.CallbackStarted) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.CallbackFailed) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List();
+        Assert.Single(events.Where(e => e.EventType == EventType.CallbackStarted));
+        Assert.Single(events.Where(e => e.EventType == EventType.CallbackFailed));
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackTimeoutTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackTimeoutTest.cs
new file mode 100644
index 000000000..7f50091c9
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackTimeoutTest.cs
@@ -0,0 +1,84 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class CallbackTimeoutTest
+{
+    private readonly ITestOutputHelper _output;
+    public CallbackTimeoutTest(ITestOutputHelper output) => _output = output;
+
+    /// <summary>
+    /// End-to-end timeout path for CreateCallbackAsync:
+    /// the workflow waits on a callback whose configured timeout
+    /// elapses before any result is delivered. The service marks the callback as
+    /// TIMED_OUT, the SDK throws <see cref="CallbackTimeoutException"/>, and the
+    /// workflow surfaces FAILED with that exception type recorded.
+    /// </summary>
+    [Fact]
+    public async Task CallbackTimeout_SurfacesAsCallbackTimeoutException()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("CallbackTimeoutFunction"),
+            "cb-timeout", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Initial response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        // Capture the CallbackId before the timeout fires so we can assert it
+        // on the surfaced exception. CallbackStarted has the ID; CallbackTimedOut
+        // typically does not echo it back on the event.
+        var callbackId = await WaitForCallbackIdAsync(deployment, arn!, TimeSpan.FromSeconds(30));
+        Assert.False(string.IsNullOrEmpty(callbackId));
+        _output.WriteLine($"Service-allocated CallbackId: {callbackId}");
+
+        // The configured timeout is 10s; allow generous headroom for service
+        // latency (timer scheduling + re-invoke + Lambda cold start).
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
+        Assert.Equal("FAILED", status, ignoreCase: true);
+
+        // The execution surfaces the SDK's CallbackTimeoutException to the user.
+        // ErrorObject.FromException records ErrorType as the FullName; verify both
+        // the type and that the recorded message mentions "timed out".
+        var execution = await deployment.GetExecutionAsync(arn!);
+        Assert.NotNull(execution.Error);
+        Assert.Equal(typeof(CallbackTimeoutException).FullName, execution.Error.ErrorType);
+        Assert.Contains("timed out", execution.Error.ErrorMessage, StringComparison.OrdinalIgnoreCase);
+
+        // History records both Started and TimedOut for the same callback.
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Any(e => e.EventType == EventType.CallbackStarted) ?? false)
+                && (h.Events?.Any(e => e.EventType == EventType.CallbackTimedOut) ?? false),
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List<Event>();
+        Assert.Single(events.Where(e => e.EventType == EventType.CallbackStarted));
+        Assert.Single(events.Where(e => e.EventType == EventType.CallbackTimedOut));
+    }
+
+    private static async Task<string?> WaitForCallbackIdAsync(
+        DurableFunctionDeployment deployment, string arn, TimeSpan timeout)
+    {
+        var history = await deployment.WaitForHistoryAsync(
+            arn,
+            h => h.Events?.Any(e =>
+                e.CallbackStartedDetails != null
+                && !string.IsNullOrEmpty(e.CallbackStartedDetails.CallbackId)) ?? false,
+            timeout);
+        return history.Events?
+            .Where(e => e.CallbackStartedDetails != null
+                && !string.IsNullOrEmpty(e.CallbackStartedDetails.CallbackId))
+            .Select(e => e.CallbackStartedDetails.CallbackId)
+            .FirstOrDefault();
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextFailsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextFailsTest.cs
new file mode 100644
index 000000000..b2bcc76f8
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextFailsTest.cs
@@ -0,0 +1,96 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class ChildContextFailsTest
+{
+    private readonly ITestOutputHelper _output;
+    public ChildContextFailsTest(ITestOutputHelper output) => _output = output;
+
+    /// <summary>
+    /// End-to-end RunInChildContextAsync failure path: the user func inside the
+    /// child throws, the SDK emits a CONTEXT FAIL checkpoint, the child's prior
+    /// inner step is preserved, and the workflow is marked FAILED with the
+    /// original exception details surfaced via ContextFailedDetails.Error.
+    /// </summary>
+    [Fact]
+    public async Task ChildContext_FailureSurfacesAsContextFailed()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("ChildContextFailsFunction"),
+            "childctxfail", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "integ-test-fail"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        // Failed workflows return null payload; locate the execution by name.
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
+        Assert.Equal("FAILED", status, ignoreCase: true);
+
+        var execution = await deployment.GetExecutionAsync(arn!);
+        Assert.NotNull(execution.Error);
+        Assert.Contains("intentional child context failure", execution.Error.ErrorMessage);
+
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Any(e => e.EventType == EventType.ContextStarted) ?? false)
+                && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? false),
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List<Event>();
+
+        var contextStarted = events.SingleOrDefault(e => e.EventType == EventType.ContextStarted && e.Name == "phase");
+        Assert.NotNull(contextStarted);
+        Assert.Equal("OrderProcessing", contextStarted!.SubType);
+        // The child context op itself is at root — its boundary opens at the parent scope.
+        Assert.Null(contextStarted.ParentId);
+
+        // The CONTEXT FAIL record carries the original exception details and
+        // closes the boundary back at the parent scope (root, ParentId=null).
+        var contextFailed = events.SingleOrDefault(e => e.EventType == EventType.ContextFailed && e.Name == "phase");
+        Assert.NotNull(contextFailed);
+        Assert.Null(contextFailed!.ParentId);
+        var error = contextFailed.ContextFailedDetails.Error?.Payload;
+        Assert.NotNull(error);
+        Assert.Contains("intentional child context failure", error!.ErrorMessage ?? string.Empty);
+        Assert.Equal(typeof(InvalidOperationException).FullName, error.ErrorType);
+        // The wire ErrorObject preserves StackTrace from ToSdkError end-to-end —
+        // the service stores it and returns it on replay (or directly in the
+        // history event), so user-facing ChildContextException.OriginalStackTrace
+        // is populated rather than dropped.
+        Assert.NotNull(error.StackTrace);
+        Assert.NotEmpty(error.StackTrace);
+
+        // The step that ran before the throw was checkpointed under the child.
+        var contextOpId = contextStarted.Id;
+        var innerStep = events.SingleOrDefault(
+            e => e.StepSucceededDetails != null && e.Name == "prepare" && e.ParentId == contextOpId);
+        Assert.NotNull(innerStep);
+        Assert.Equal("\"prepared-integ-test-fail\"", innerStep!.StepSucceededDetails.Result?.Payload);
+
+        // Every inner step/wait event for this workflow is parented under the
+        // child context — the child is a single observability boundary.
+        var innerOpEvents = events
+            .Where(e => e.StepStartedDetails != null
+                || e.StepSucceededDetails != null
+                || e.StepFailedDetails != null
+                || e.WaitStartedDetails != null
+                || e.WaitSucceededDetails != null)
+            .ToList();
+        Assert.NotEmpty(innerOpEvents);
+        Assert.All(innerOpEvents, e => Assert.Equal(contextOpId, e.ParentId));
+
+        // The child never reached SUCCEED; the workflow body past the throw is unreachable.
+        Assert.DoesNotContain(events, e => e.EventType == EventType.ContextSucceeded);
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextRetryFailsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextRetryFailsTest.cs
new file mode 100644
index 000000000..90fafcd61
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextRetryFailsTest.cs
@@ -0,0 +1,114 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class ChildContextRetryFailsTest
+{
+    private readonly ITestOutputHelper _output;
+    public ChildContextRetryFailsTest(ITestOutputHelper output) => _output = output;
+
+    /// <summary>
+    /// End-to-end: a step inside a child context retries until exhausted, then
+    /// the child closes as ContextFailed. Validates the child is a single
+    /// retry/error boundary — every per-attempt StepStarted/StepFailed (and the
+    /// terminal ContextFailed's surfaced exception) reflect the same logical
+    /// failure under the same parent op id.
+    /// </summary>
+    [Fact]
+    public async Task ChildContext_RetryExhaustionInsideChild_AllAttemptsParentedUnderChild()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("ChildContextRetryFailsFunction"),
+            "childctxretry", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "integ-test-retry"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        // 3 attempts with 2s + 4s retry delays plus service-driven re-invokes.
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
+        Assert.Equal("FAILED", status, ignoreCase: true);
+
+        var execution = await deployment.GetExecutionAsync(arn!);
+        Assert.NotNull(execution.Error);
+        Assert.Contains("always-fails", execution.Error.ErrorMessage);
+
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Any(e => e.EventType == EventType.ContextStarted) ?? false)
+                && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? false)
+                && (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 3,
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List<Event>();
+
+        var contextStarted = events.SingleOrDefault(e => e.EventType == EventType.ContextStarted && e.Name == "phase");
+        Assert.NotNull(contextStarted);
+        var contextOpId = contextStarted!.Id;
+        Assert.NotNull(contextOpId);
+
+        // All 3 step attempts (with their per-attempt StepFailed records) ran
+        // inside the child boundary.
+        var stepStarted = events.Where(e => e.EventType == EventType.StepStarted && e.Name == "always_fails").ToList();
+        Assert.Equal(3, stepStarted.Count);
+        Assert.All(stepStarted, e => Assert.Equal(contextOpId, e.ParentId));
+
+        var stepFailed = events.Where(e => e.StepFailedDetails != null && e.Name == "always_fails").ToList();
+        Assert.Equal(3, stepFailed.Count);
+        Assert.All(stepFailed, e => Assert.Equal(contextOpId, e.ParentId));
+
+        // The per-attempt failure messages reflect the user's exception.
+        var failureMessages = stepFailed
+            .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty)
+            .ToList();
+        Assert.Contains(failureMessages, m => m.Contains("attempt 1"));
+        Assert.Contains(failureMessages, m => m.Contains("attempt 2"));
+        Assert.Contains(failureMessages, m => m.Contains("attempt 3"));
+
+        // Each StepFailed event preserves StackTrace through the wire — proves
+        // StepDetails.Error mapping doesn't drop frames.
+        Assert.All(stepFailed, e =>
+        {
+            var stack = e.StepFailedDetails.Error?.Payload?.StackTrace;
+            Assert.NotNull(stack);
+            Assert.NotEmpty(stack);
+        });
+
+        // The child closes the boundary at the parent scope (root) and surfaces
+        // the underlying exception type — a single retry/error envelope.
+        var contextFailed = events.SingleOrDefault(e => e.EventType == EventType.ContextFailed && e.Name == "phase");
+        Assert.NotNull(contextFailed);
+        Assert.Null(contextFailed!.ParentId);
+        var contextError = contextFailed.ContextFailedDetails.Error?.Payload;
+        Assert.NotNull(contextError);
+        Assert.Contains("always-fails", contextError!.ErrorMessage ?? string.Empty);
+        // StackTrace round-trips end-to-end — the service preserves it from the
+        // checkpointed FAIL update and returns it on replay/history.
+        Assert.NotNull(contextError.StackTrace);
+        Assert.NotEmpty(contextError.StackTrace);
+
+        Assert.DoesNotContain(events, e => e.StepSucceededDetails != null);
+        Assert.DoesNotContain(events, e => e.EventType == EventType.ContextSucceeded);
+
+        // Service honored retry delays: with 2s + 4s and no jitter, the gap
+        // between first and last StepStarted should be >= 6s.
+        var startedTimestamps = stepStarted
+            .Where(e => e.EventTimestamp.HasValue)
+            .OrderBy(e => e.EventTimestamp!.Value)
+            .Select(e => e.EventTimestamp!.Value)
+            .ToList();
+        var totalGap = startedTimestamps[^1] - startedTimestamps[0];
+        _output.WriteLine($"Time between first and last attempt: {totalGap.TotalSeconds:F1}s");
+        Assert.True(totalGap >= TimeSpan.FromSeconds(6),
+            $"Service did not honor retry delays inside child: {totalGap.TotalSeconds:F1}s gap (expected >= 6s)");
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextTest.cs
new file mode 100644
index 000000000..6216e8d90
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextTest.cs
@@ -0,0 +1,112 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class ChildContextTest
+{
+    private readonly ITestOutputHelper _output;
+    public ChildContextTest(ITestOutputHelper output) => _output = output;
+
+    /// <summary>
+    /// End-to-end RunInChildContextAsync: the workflow runs a child context that
+    /// performs step + wait + step and returns a typed result.
The unit tests
+    /// fake state transitions in-memory; this test verifies the service actually
+    /// round-trips CONTEXT START/SUCCEED records, parents the inner step/wait
+    /// events under the context op, and persists the child's return value as
+    /// the ContextSucceeded payload.
+    /// </summary>
+    [Fact]
+    public async Task ChildContext_CompletesViaService()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("ChildContextFunction"),
+            "childctx", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "integ-test-456"}""");
+        Assert.Equal(200, invokeResponse.StatusCode);
+
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Any(e => e.EventType == EventType.ContextStarted) ?? false)
+                && (h.Events?.Any(e => e.EventType == EventType.ContextSucceeded) ?? false)
+                && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2
+                && (h.Events?.Any(e => e.WaitSucceededDetails != null) ?? false),
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List<Event>();
+
+        // Exactly one child context was opened and closed successfully.
+        var contextStarted = events.SingleOrDefault(e => e.EventType == EventType.ContextStarted && e.Name == "phase");
+        Assert.NotNull(contextStarted);
+        Assert.Equal("OrderProcessing", contextStarted!.SubType);
+
+        // The child boundary opens and closes at the parent scope (root, ParentId=null).
+        Assert.Null(contextStarted.ParentId);
+
+        var contextSucceeded = events.SingleOrDefault(e => e.EventType == EventType.ContextSucceeded && e.Name == "phase");
+        Assert.NotNull(contextSucceeded);
+        Assert.Null(contextSucceeded!.ParentId);
+
+        // The child's return value was checkpointed as the CONTEXT SUCCEED payload.
+        Assert.Equal(
+            "\"processed-validated-integ-test-456\"",
+            contextSucceeded.ContextSucceededDetails.Result?.Payload);
+
+        // Inner operations are parented to the context op so the service
+        // visualizes them nested under the child.
+        var contextOpId = contextStarted.Id;
+        Assert.NotNull(contextOpId);
+
+        var innerStepEvents = events
+            .Where(e => e.EventType == EventType.StepStarted && e.ParentId == contextOpId)
+            .OrderBy(e => e.EventTimestamp)
+            .ToList();
+        Assert.Equal(2, innerStepEvents.Count);
+        Assert.Equal("validate", innerStepEvents[0].Name);
+        Assert.Equal("process", innerStepEvents[1].Name);
+
+        var innerWaitStarted = events.SingleOrDefault(
+            e => e.WaitStartedDetails != null && e.Name == "short_wait" && e.ParentId == contextOpId);
+        Assert.NotNull(innerWaitStarted);
+        Assert.Equal(2, innerWaitStarted!.WaitStartedDetails.Duration);
+
+        // Inner step results chain: validate -> wait -> process.
+        var stepResults = events
+            .Where(e => e.StepSucceededDetails != null && e.ParentId == contextOpId)
+            .OrderBy(e => e.EventTimestamp)
+            .Select(e => (Name: e.Name, Payload: e.StepSucceededDetails.Result?.Payload?.Trim('"')))
+            .ToList();
+        Assert.Equal(2, stepResults.Count);
+        Assert.Equal("validate", stepResults[0].Name);
+        Assert.Equal("validated-integ-test-456", stepResults[0].Payload);
+        Assert.Equal("process", stepResults[1].Name);
+        Assert.Equal("processed-validated-integ-test-456", stepResults[1].Payload);
+
+        // Every inner step/wait event for this workflow is parented under the
+        // child context — the child is a single observability boundary.
+        var innerOpEvents = events
+            .Where(e => e.StepStartedDetails != null
+                || e.StepSucceededDetails != null
+                || e.StepFailedDetails != null
+                || e.WaitStartedDetails != null
+                || e.WaitSucceededDetails != null)
+            .ToList();
+        Assert.NotEmpty(innerOpEvents);
+        Assert.All(innerOpEvents, e => Assert.Equal(contextOpId, e.ParentId));
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CreateCallbackHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CreateCallbackHappyPathTest.cs
new file mode 100644
index 000000000..d4629350a
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CreateCallbackHappyPathTest.cs
@@ -0,0 +1,72 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class CreateCallbackHappyPathTest
+{
+    private readonly ITestOutputHelper _output;
+    public CreateCallbackHappyPathTest(ITestOutputHelper output) => _output = output;
+
+    /// <summary>
+    /// End-to-end happy path for CreateCallbackAsync:
+    /// the workflow suspends inside GetResultAsync; a paired
+    /// ApproverFunction Lambda (Event-invoked from the workflow) acts
+    /// as the external system and delivers a result via
+    /// SendDurableExecutionCallbackSuccess; the workflow resumes and
+    /// returns the delivered payload.
+    /// </summary>
+    /// <remarks>
+    /// The callback delivery has to come from a separate Lambda — not from the
+    /// test process — because the test's synchronous InvokeAsync blocks
+    /// until the durable execution reaches a terminal state. If the test tried
+    /// to deliver the callback itself, it would deadlock against its own
+    /// blocked Invoke.
+    /// </remarks>
+    [Fact]
+    public async Task CreateCallback_DeliversResultViaSendSuccess()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("CreateCallbackHappyPathFunction"),
+            "cb-happy", _output,
+            externalFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("ApproverFunction"));
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId":"integ-test"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Initial response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        // The execution result mirrors the payload the approver sent — proves
+        // GetResultAsync deserialized the wire-level callback Result and the
+        // workflow returned it.
+        var execution = await deployment.GetExecutionAsync(arn!);
+        Assert.NotNull(execution.Result);
+        Assert.Contains("approved", execution.Result);
+        Assert.Contains("integ-test", execution.Result);
+
+        // History shows the canonical callback lifecycle: Started then Succeeded.
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Any(e => e.EventType == EventType.CallbackStarted) ?? false)
+                && (h.Events?.Any(e => e.EventType == EventType.CallbackSucceeded) ?? false),
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List<Event>();
+
+        Assert.Single(events.Where(e => e.EventType == EventType.CallbackStarted));
+        Assert.Single(events.Where(e => e.EventType == EventType.CallbackSucceeded));
+
+        var succeeded = events.First(e => e.CallbackSucceededDetails != null);
+        Assert.Equal("approve", succeeded.Name);
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs
new file mode 100644
index 000000000..e3247e26c
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs
@@ -0,0 +1,859 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Text;
+using System.Text.Json;
+using Amazon;
+using Amazon.ECR;
+using Amazon.ECR.Model;
+using Amazon.IdentityManagement;
+using Amazon.IdentityManagement.Model;
+using Amazon.Lambda;
+using Amazon.Lambda.Model;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+/// <summary>
+/// Builds, deploys, and invokes a single durable Lambda function for an integration test.
+/// Manages the full lifecycle: IAM role, ECR repo, Docker image, Lambda function.
+/// All resources are torn down on DisposeAsync.
+/// </summary>
+internal sealed class DurableFunctionDeployment : IAsyncDisposable
+{
+    private readonly ITestOutputHelper _output;
+    private readonly IAmazonLambda _lambdaClient;
+    private readonly IAmazonECR _ecrClient;
+    private readonly IAmazonIdentityManagementService _iamClient;
+
+    private readonly string _functionName;
+    private readonly string _repoName;
+    private readonly string _roleName;
+    private string? _roleArn;
+    private string? _imageUri;
+    private string?
_functionArn;
+    private bool _functionCreated;
+    private bool _ecrRepoCreated;
+    private readonly List<string> _inlinePolicyNames = new();
+
+    // Optional paired "external system" Lambda — a plain (non-durable) function
+    // that the workflow's submitter invokes. Models a real-world callback flow
+    // where an out-of-band service resolves the durable execution.
+    private readonly string _externalFunctionName;
+    private readonly string _externalRepoName;
+    private readonly string _externalRoleName;
+    private string? _externalRoleArn;
+    private bool _externalFunctionCreated;
+    private bool _externalEcrRepoCreated;
+
+    public string FunctionName => _functionName;
+    public string? ExternalFunctionName => _externalFunctionCreated ? _externalFunctionName : null;
+
+    /// <summary>
+    /// The full function ARN, without a version or alias qualifier. Available after
+    /// <see cref="CreateAsync"/> or <see cref="CreateWithDownstreamAsync"/> completes. Use $"{FunctionArn}:$LATEST"
+    /// when constructing a qualified identifier for chained invocation.
+    /// </summary>
+    public string FunctionArn => _functionArn
+        ?? throw new InvalidOperationException("Function ARN is not available until the function has been created.");
+
+    public IAmazonLambda LambdaClient => _lambdaClient;
+
+    private DurableFunctionDeployment(ITestOutputHelper output, string suffix)
+    {
+        _output = output;
+        _lambdaClient = new AmazonLambdaClient(RegionEndpoint.USEast1);
+        _ecrClient = new AmazonECRClient(RegionEndpoint.USEast1);
+        _iamClient = new AmazonIdentityManagementServiceClient(RegionEndpoint.USEast1);
+
+        // Truncate the GUID (not the suffix) so CloudTrail entries stay readable.
+        // Keep the GUID short enough that the total stays well under 40 chars even for long suffixes.
+        static string ShortId() => Guid.NewGuid().ToString("N")[..8];
+        _functionName = $"durable-integ-{suffix}-{ShortId()}";
+        _repoName = $"durable-integ-{suffix}-{ShortId()}";
+        _roleName = $"durable-integ-{suffix}-{ShortId()}";
+        _externalFunctionName = $"durable-integ-{suffix}-ext-{ShortId()}";
+        _externalRepoName = $"durable-integ-{suffix}-ext-{ShortId()}";
+        _externalRoleName = $"durable-integ-{suffix}-ext-{ShortId()}";
+    }
+
+    public static async Task<DurableFunctionDeployment> CreateAsync(
+        string testFunctionDir,
+        string scenarioSuffix,
+        ITestOutputHelper output,
+        string? externalFunctionDir = null,
+        IDictionary<string, string>? environment = null,
+        IReadOnlyList<string>? invokeAllowedFunctionArns = null,
+        bool enableTenancy = false)
+    {
+        var deployment = new DurableFunctionDeployment(output, scenarioSuffix);
+        try
+        {
+            await deployment.InitializeAsync(testFunctionDir, externalFunctionDir, environment, invokeAllowedFunctionArns, enableTenancy);
+        }
+        catch
+        {
+            // Tear down anything that did get created (IAM role, ECR repo) so we
+            // don't leak resources when init fails part-way through.
+            await deployment.DisposeAsync();
+            throw;
+        }
+        return deployment;
+    }
+
+    /// <summary>
+    /// Two-step deployment for chained-invoke scenarios: deploys the downstream (callee)
+    /// function first, captures its ARN, then deploys the parent (caller) with
+    /// DOWNSTREAM_FUNCTION_ARN set in the parent's environment and the parent's
+    /// role granted lambda:InvokeFunction on the downstream's ARN.
+    /// </summary>
+    /// <remarks>
+    /// The parent and downstream are independent <see cref="DurableFunctionDeployment"/>
+    /// instances; both are returned so the caller can dispose them in the right order
+    /// (parent first, then downstream — the caller is the one in flight when the test ends).
+    /// The DOWNSTREAM_FUNCTION_ARN env var carries a qualified identifier
+    /// (arn:...:function:name:$LATEST) so the parent can pass it directly to
+    /// ctx.InvokeAsync(...) without further manipulation.
+    /// </remarks>
+    public static async Task<(DurableFunctionDeployment Parent, DurableFunctionDeployment Downstream)>
+        CreateWithDownstreamAsync(
+            string parentTestFunctionDir,
+            string downstreamTestFunctionDir,
+            string scenarioSuffix,
+            ITestOutputHelper output,
+            IDictionary<string, string>? extraParentEnvironment = null,
+            bool enableDownstreamTenancy = false)
+    {
+        // Deploy downstream first so we can pass its ARN to the parent's environment.
+        var downstream = await CreateAsync(
+            downstreamTestFunctionDir,
+            scenarioSuffix + "-d",
+            output,
+            enableTenancy: enableDownstreamTenancy);
+
+        DurableFunctionDeployment? parent = null;
+        try
+        {
+            // Use a qualified identifier — the durable execution service rejects
+            // unqualified ARNs. $LATEST is fine for integration tests; production
+            // should use a version or alias.
+            var qualifiedDownstreamArn = downstream.FunctionArn + ":$LATEST";
+            var parentEnv = new Dictionary<string, string>(StringComparer.Ordinal)
+            {
+                ["DOWNSTREAM_FUNCTION_ARN"] = qualifiedDownstreamArn,
+            };
+            if (extraParentEnvironment != null)
+            {
+                foreach (var kv in extraParentEnvironment)
+                    parentEnv[kv.Key] = kv.Value;
+            }
+
+            parent = await CreateAsync(
+                parentTestFunctionDir,
+                scenarioSuffix + "-p",
+                output,
+                environment: parentEnv,
+                invokeAllowedFunctionArns: new[] { downstream.FunctionArn });
+        }
+        catch
+        {
+            // Parent failed to deploy — tear down the downstream we already created
+            // so we don't leak resources.
+            await downstream.DisposeAsync();
+            throw;
+        }
+
+        return (parent!, downstream);
+    }
+
+    private const string LambdaAssumeRolePolicy = """
+        {
+            "Version": "2012-10-17",
+            "Statement": [{
+                "Effect": "Allow",
+                "Principal": {"Service": "lambda.amazonaws.com"},
+                "Action": "sts:AssumeRole"
+            }]
+        }
+        """;
+
+    private async Task InitializeAsync(
+        string testFunctionDir,
+        string? externalFunctionDir,
+        IDictionary<string, string>? environment,
+        IReadOnlyList<string>? invokeAllowedFunctionArns,
+        bool enableTenancy)
+    {
+        // 1. Create the workflow's IAM role.
+        _output.WriteLine($"Creating IAM role: {_roleName}");
+        var createRoleResponse = await _iamClient.CreateRoleAsync(new CreateRoleRequest
+        {
+            RoleName = _roleName,
+            AssumeRolePolicyDocument = LambdaAssumeRolePolicy
+        });
+        _roleArn = createRoleResponse.Role.Arn;
+
+        await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest
+        {
+            RoleName = _roleName,
+            PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
+        });
+
+        await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest
+        {
+            RoleName = _roleName,
+            PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicDurableExecutionRolePolicy"
+        });
+
+        // 2. (optional) Create the external function's IAM role up front so its
+        // sts:AssumeRole and lambda:SendDurableExecutionCallbackSuccess
+        // permissions propagate alongside the workflow role's permissions
+        // (single 10-second sleep covers both).
+        if (externalFunctionDir != null)
+        {
+            _output.WriteLine($"Creating external IAM role: {_externalRoleName}");
+            var extRoleResponse = await _iamClient.CreateRoleAsync(new CreateRoleRequest
+            {
+                RoleName = _externalRoleName,
+                AssumeRolePolicyDocument = LambdaAssumeRolePolicy
+            });
+            _externalRoleArn = extRoleResponse.Role.Arn;
+
+            await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest
+            {
+                RoleName = _externalRoleName,
+                PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
+            });
+
+            // Inline policy lets the external function call the durable callback API.
+            // Resource "*" because we don't yet know the workflow's ARN at this point —
+            // the external function only resolves callbacks belonging to executions the
+            // workflow created, so the blast radius is bounded by the role's lifetime.
+            await _iamClient.PutRolePolicyAsync(new PutRolePolicyRequest
+            {
+                RoleName = _externalRoleName,
+                PolicyName = "SendDurableExecutionCallback",
+                PolicyDocument = """
+                    {
+                        "Version": "2012-10-17",
+                        "Statement": [{
+                            "Effect": "Allow",
+                            "Action": [
+                                "lambda:SendDurableExecutionCallbackSuccess",
+                                "lambda:SendDurableExecutionCallbackFailure"
+                            ],
+                            "Resource": "*"
+                        }]
+                    }
+                    """
+            });
+
+            // Workflow function will Invoke the external function — grant via inline policy.
+            // Scoped to the external function name we just minted.
+            await _iamClient.PutRolePolicyAsync(new PutRolePolicyRequest
+            {
+                RoleName = _roleName,
+                PolicyName = "InvokeExternalFunction",
+                PolicyDocument = $$"""
+                    {
+                        "Version": "2012-10-17",
+                        "Statement": [{
+                            "Effect": "Allow",
+                            "Action": "lambda:InvokeFunction",
+                            "Resource": "arn:aws:lambda:*:*:function:{{_externalFunctionName}}"
+                        }]
+                    }
+                    """
+            });
+            _inlinePolicyNames.Add("InvokeExternalFunction");
+        }
+
+        // Grant cross-Lambda invoke when the parent of a chained-invoke scenario
+        // needs to call out to a downstream function. The durable execution service
+        // is the one that actually drives the chained invocation in production —
+        // attaching this directly to the parent's role keeps the parent role
+        // capable of being used in non-durable contexts (e.g. for diagnostic
+        // direct invokes from the test harness).
+        if (invokeAllowedFunctionArns != null && invokeAllowedFunctionArns.Count > 0)
+        {
+            // Allow both the unqualified ARN and any qualifier (alias/version/$LATEST).
+            var resources = new List<string>(invokeAllowedFunctionArns.Count * 2);
+            foreach (var arn in invokeAllowedFunctionArns)
+            {
+                resources.Add(arn);
+                resources.Add(arn + ":*");
+            }
+            var resourceJson = "[" + string.Join(",", resources.Select(r => $"\"{r}\"")) + "]";
+            var policyDoc = $$"""
+                {
+                    "Version": "2012-10-17",
+                    "Statement": [{
+                        "Effect": "Allow",
+                        "Action": ["lambda:InvokeFunction"],
+                        "Resource": {{resourceJson}}
+                    }]
+                }
+                """;
+            const string PolicyName = "AllowChainedInvoke";
+            await _iamClient.PutRolePolicyAsync(new PutRolePolicyRequest
+            {
+                RoleName = _roleName,
+                PolicyName = PolicyName,
+                PolicyDocument = policyDoc
+            });
+            _inlinePolicyNames.Add(PolicyName);
+        }
+
+        // Wait for IAM propagation.
+        await Task.Delay(TimeSpan.FromSeconds(10));
+
+        // 3. Create the workflow ECR repo + image.
+        _output.WriteLine($"Creating ECR repository: {_repoName}");
+        var createRepoResponse = await _ecrClient.CreateRepositoryAsync(new CreateRepositoryRequest
+        {
+            RepositoryName = _repoName
+        });
+        _ecrRepoCreated = true;
+        var repositoryUri = createRepoResponse.Repository.RepositoryUri;
+
+        _output.WriteLine($"Building and pushing Docker image from {testFunctionDir}...");
+        _imageUri = await BuildAndPushImage(testFunctionDir, repositoryUri);
+        _output.WriteLine($"Image pushed: {_imageUri}");
+
+        // 4. (optional) Create + push the external function image and create the Lambda.
+        // Done before the workflow Lambda so the workflow function's environment can
+        // reference the external function name (which is already known from the ctor).
+        if (externalFunctionDir != null)
+        {
+            _output.WriteLine($"Creating external ECR repository: {_externalRepoName}");
+            var extRepoResponse = await _ecrClient.CreateRepositoryAsync(new CreateRepositoryRequest
+            {
+                RepositoryName = _externalRepoName
+            });
+            _externalEcrRepoCreated = true;
+            var extRepositoryUri = extRepoResponse.Repository.RepositoryUri;
+
+            _output.WriteLine($"Building external Docker image from {externalFunctionDir}...");
+            var extImageUri = await BuildAndPushImage(externalFunctionDir, extRepositoryUri);
+            _output.WriteLine($"External image pushed: {extImageUri}");
+
+            _output.WriteLine($"Creating external Lambda function: {_externalFunctionName}");
+            await _lambdaClient.CreateFunctionAsync(new CreateFunctionRequest
+            {
+                FunctionName = _externalFunctionName,
+                PackageType = PackageType.Image,
+                Role = _externalRoleArn,
+                Code = new FunctionCode { ImageUri = extImageUri },
+                Timeout = 30,
+                MemorySize = 256
+                // No DurableConfig — this is a plain function.
+            });
+            _externalFunctionCreated = true;
+
+            _output.WriteLine("Waiting for external function to become Active...");
+            await WaitForFunctionActive(_externalFunctionName);
+        }
+
+        // 5. Create the workflow Lambda.
+        _output.WriteLine($"Creating Lambda function: {_functionName}");
+        var createFunctionRequest = new CreateFunctionRequest
+        {
+            FunctionName = _functionName,
+            PackageType = PackageType.Image,
+            Role = _roleArn,
+            Code = new FunctionCode { ImageUri = _imageUri },
+            Timeout = 30,
+            MemorySize = 256,
+            DurableConfig = new DurableConfig { ExecutionTimeout = 60 }
+        };
+
+        // Tenant isolation must be set at function-creation time (Lambda rejects
+        // post-create modification). Without it, the durable execution service
+        // refuses chained invokes that carry a TenantId — so the tenant-routing
+        // integration test needs the *callee* deployed with PER_TENANT.
+        if (enableTenancy)
+        {
+            createFunctionRequest.TenancyConfig = new TenancyConfig
+            {
+                TenantIsolationMode = TenantIsolationMode.PER_TENANT
+            };
+        }
+
+        // Build the function's environment: start with the caller-supplied vars, then
+        // tack on EXTERNAL_FUNCTION_NAME if a paired external function exists.
+        var envVars = new Dictionary<string, string>(StringComparer.Ordinal);
+        if (environment != null)
+        {
+            foreach (var kv in environment)
+                envVars[kv.Key] = kv.Value;
+        }
+        if (externalFunctionDir != null)
+        {
+            envVars["EXTERNAL_FUNCTION_NAME"] = _externalFunctionName;
+        }
+        if (envVars.Count > 0)
+        {
+            createFunctionRequest.Environment = new Amazon.Lambda.Model.Environment
+            {
+                Variables = envVars
+            };
+        }
+
+        var createFunctionResponse = await _lambdaClient.CreateFunctionAsync(createFunctionRequest);
+        _functionCreated = true;
+        _functionArn = createFunctionResponse.FunctionArn;
+
+        _output.WriteLine($"Waiting for function to become Active... (ARN: {_functionArn})");
+        await WaitForFunctionActive(_functionName);
+    }
+
+    public async Task<(InvokeResponse Response, string ExecutionName)> InvokeAsync(string payload, string? executionName = null)
+    {
+        var name = executionName ?? $"integ-test-{Guid.NewGuid():N}";
+        var response = await _lambdaClient.InvokeAsync(new InvokeRequest
+        {
+            FunctionName = _functionName,
+            Qualifier = "$LATEST",
+            Payload = payload,
+            DurableExecutionName = name
+        });
+        return (response, name);
+    }
+
+    /// <summary>
+    /// Polls ListDurableExecutionsByFunction until an execution with the given name appears.
+    /// Useful when the synchronous Invoke response gives no ARN (e.g., failed workflows return null).
+ /// + public async Task FindDurableExecutionArnByNameAsync(string executionName, TimeSpan timeout) + { + var deadline = DateTime.UtcNow + timeout; + var attempt = 0; + _output.WriteLine($"[FindArn] Starting search for execution name '{executionName}' on function '{_functionName}' (timeout: {timeout.TotalSeconds}s)"); + + while (DateTime.UtcNow < deadline) + { + attempt++; + try + { + var resp = await _lambdaClient.ListDurableExecutionsByFunctionAsync( + new ListDurableExecutionsByFunctionRequest + { + FunctionName = _functionName, + DurableExecutionName = executionName // server-side exact match + }); + + var count = resp.DurableExecutions?.Count ?? 0; + _output.WriteLine($"[FindArn] attempt {attempt}: List returned {count} executions"); + + if (count > 0) + { + foreach (var e in resp.DurableExecutions!) + { + _output.WriteLine($"[FindArn] - name='{e.DurableExecutionName}' status={e.Status} arn={e.DurableExecutionArn}"); + } + var match = resp.DurableExecutions.FirstOrDefault(e => e.DurableExecutionName == executionName); + if (match != null) + { + _output.WriteLine($"[FindArn] matched on attempt {attempt}"); + return match.DurableExecutionArn; + } + } + } + catch (Exception ex) + { + _output.WriteLine($"[FindArn] attempt {attempt} error (will retry): {ex.Message}"); + } + await Task.Delay(TimeSpan.FromSeconds(2)); + } + _output.WriteLine($"[FindArn] gave up after {attempt} attempts ({timeout.TotalSeconds}s)"); + return null; + } + + public async Task PollForCompletionAsync(string durableExecutionArn, TimeSpan timeout) + { + var deadline = DateTime.UtcNow + timeout; + + while (DateTime.UtcNow < deadline) + { + try + { + var resp = await _lambdaClient.GetDurableExecutionAsync( + new GetDurableExecutionRequest { DurableExecutionArn = durableExecutionArn }); + + var status = resp.Status?.ToString(); + if (status == "SUCCEEDED" || status == "FAILED" || + status == "TIMED_OUT" || status == "STOPPED") + { + return status; + } + } + catch (Exception ex) + { + 
_output.WriteLine($"Poll error (will retry): {ex.Message}"); + } + + await Task.Delay(TimeSpan.FromSeconds(2)); + } + + return "TIMEOUT"; + } + + public async Task GetExecutionAsync(string durableExecutionArn) + => await _lambdaClient.GetDurableExecutionAsync( + new GetDurableExecutionRequest { DurableExecutionArn = durableExecutionArn }); + + public async Task GetHistoryAsync(string durableExecutionArn, bool includeExecutionData = true) + => await _lambdaClient.GetDurableExecutionHistoryAsync( + new GetDurableExecutionHistoryRequest + { + DurableExecutionArn = durableExecutionArn, + IncludeExecutionData = includeExecutionData + }); + + /// + /// Repeatedly fetches history until is satisfied or the + /// timeout elapses. Needed because the history endpoint is eventually consistent — + /// the execution status can flip to SUCCEEDED before all events are indexed. + /// + public async Task WaitForHistoryAsync( + string durableExecutionArn, + Func predicate, + TimeSpan timeout, + bool includeExecutionData = true) + { + var deadline = DateTime.UtcNow + timeout; + GetDurableExecutionHistoryResponse? last = null; + var attempt = 0; + + while (DateTime.UtcNow < deadline) + { + attempt++; + try + { + last = await GetHistoryAsync(durableExecutionArn, includeExecutionData); + var eventCount = last.Events?.Count ?? 0; + var typeCounts = last.Events? + .GroupBy(e => e.EventType?.Value ?? "") + .Select(g => $"{g.Key}:{g.Count()}") + .OrderBy(s => s); + _output.WriteLine($"[WaitForHistory] attempt {attempt}: {eventCount} events [{string.Join(",", typeCounts ?? Enumerable.Empty())}]"); + if (predicate(last)) + { + DumpEvents(last); + return last; + } + } + catch (Exception ex) + { + _output.WriteLine($"[WaitForHistory] attempt {attempt} error (will retry): {ex.Message}"); + } + await Task.Delay(TimeSpan.FromSeconds(2)); + } + + _output.WriteLine($"[WaitForHistory] gave up after {attempt} attempts; returning last response with {last?.Events?.Count ?? 
0} events"); + if (last != null) DumpEvents(last); + return last ?? throw new TimeoutException($"GetDurableExecutionHistory never succeeded within {timeout.TotalSeconds}s"); + } + + private void DumpEvents(GetDurableExecutionHistoryResponse history) + { + var events = history.Events ?? new List(); + _output.WriteLine($"[WaitForHistory] event dump ({events.Count} total):"); + for (int i = 0; i < events.Count; i++) + { + var e = events[i]; + _output.WriteLine($" [{i}] type={e.EventType?.Value ?? ""} name={e.Name ?? ""} ts={e.EventTimestamp:O}"); + } + } + + public string? ExtractDurableExecutionArn(string responsePayload) + { + try + { + using var doc = JsonDocument.Parse(responsePayload); // JsonDocument is IDisposable; dispose it when the try block exits + if (doc.RootElement.TryGetProperty("durableExecutionArn", out var arnProp)) + return arnProp.GetString(); + } + catch { /* best-effort: any parse failure is reported to the caller as "no ARN" (null) */ } + return null; + } + + private async Task WaitForFunctionActive(string functionName) + { + for (int i = 0; i < 60; i++) + { + try + { + var config = await _lambdaClient.GetFunctionConfigurationAsync( + new GetFunctionConfigurationRequest { FunctionName = functionName }); + if (config.State == State.Active) return; + if (config.State == State.Failed) + throw new Exception($"Function '{functionName}' creation failed: {config.StateReasonCode} - {config.StateReason}"); + } + catch (ResourceNotFoundException) { } + await Task.Delay(TimeSpan.FromSeconds(2)); + } + throw new TimeoutException($"Function '{functionName}' did not become Active within 120 seconds"); + } + + private async Task BuildAndPushImage(string testFunctionDir, string repositoryUri) + { + // `dotnet test` spins up one testhost per TargetFramework (net8.0 + net10.0) and + // runs them concurrently. Both testhosts invoke the same test classes, which means + // two processes can race on the same TestFunctions// source dir — wiping bin/ + // and obj/ under each other's feet. Symptom: MSB3030 "Could not copy bootstrap.dll" + // because one process deleted obj/ while the other was mid-publish.
Serialize the + // per-source-dir build with a cross-process file lock so different test functions + // can still build in parallel. (A Mutex would have thread-affinity issues across + // awaits; an exclusive FileStream avoids that.) Lock file goes under temp — keeping + // it out of the source tree avoids polluting git status across worktrees. + var lockKey = Convert.ToHexString(System.Security.Cryptography.SHA256.HashData( + Encoding.UTF8.GetBytes(testFunctionDir.ToLowerInvariant())))[..16]; + var lockPath = Path.Combine(Path.GetTempPath(), $"durable-integ-build-{lockKey}.lock"); + using var lockHandle = await AcquireExclusiveFileLockAsync(lockPath, TimeSpan.FromMinutes(10)); + + var publishDir = Path.Combine(testFunctionDir, "bin", "publish"); + if (Directory.Exists(publishDir)) Directory.Delete(publishDir, true); + + // MSBuild's up-to-date check leaves stale .Up2Date markers under obj/ that + // make `dotnet publish` skip the copy-to-output step on a second run after + // we've wiped bin/publish/. Result: empty publish dir → empty Docker build + // context → "COPY bin/publish/ … not found". Nuking obj/ guarantees a real + // publish each time the helper is invoked. Cheap (each test function is small). 
+ var objDir = Path.Combine(testFunctionDir, "obj"); + if (Directory.Exists(objDir)) Directory.Delete(objDir, true); + var binDir = Path.Combine(testFunctionDir, "bin"); + if (Directory.Exists(binDir)) Directory.Delete(binDir, true); + + await RunProcess("dotnet", + $"publish -c Release -r linux-x64 --self-contained true -o \"{publishDir}\"", + testFunctionDir); + + var imageTag = $"{repositoryUri}:latest"; + await RunProcess("docker", + $"build --platform linux/amd64 --provenance=false -t {imageTag} .", + testFunctionDir); + + var authResponse = await _ecrClient.GetAuthorizationTokenAsync(new GetAuthorizationTokenRequest()); + var authData = authResponse.AuthorizationData[0]; + var token = Encoding.UTF8.GetString(Convert.FromBase64String(authData.AuthorizationToken)); + var parts = token.Split(':', 2); // decoded token is "user:password"; split on the first ':' only so a ':' inside the password is preserved + var registryUrl = authData.ProxyEndpoint; + + await RunProcess("docker", + $"login --username {parts[0]} --password-stdin {registryUrl}", + testFunctionDir, + stdin: parts[1]); + + await RunProcess("docker", $"push {imageTag}", testFunctionDir); + + return imageTag; + } + + private static async Task AcquireExclusiveFileLockAsync(string lockPath, TimeSpan timeout) + { + var deadline = DateTime.UtcNow + timeout; + while (true) + { + try + { + return new FileStream(lockPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.None); + } + catch (IOException) + { + if (DateTime.UtcNow >= deadline) + throw new TimeoutException($"Timed out waiting for build lock '{lockPath}' after {timeout.TotalSeconds:F0}s"); + await Task.Delay(TimeSpan.FromMilliseconds(500)); + } + } + } + + private async Task RunProcess(string fileName, string arguments, string workingDir, string?
stdin = null) + { + _output.WriteLine($"Running: {fileName} {arguments}"); + var psi = new System.Diagnostics.ProcessStartInfo + { + FileName = fileName, + Arguments = arguments, + WorkingDirectory = workingDir, + RedirectStandardOutput = true, + RedirectStandardError = true, + RedirectStandardInput = stdin != null, + UseShellExecute = false + }; + + using var process = System.Diagnostics.Process.Start(psi)!; // Process is IDisposable; release it when this method exits, including on the throw paths + + if (stdin != null) + { + await process.StandardInput.WriteAsync(stdin); + process.StandardInput.Close(); + } + + var stdoutTask = process.StandardOutput.ReadToEndAsync(); + var stderrTask = process.StandardError.ReadToEndAsync(); + + await Task.WhenAny( + process.WaitForExitAsync(), + Task.Delay(TimeSpan.FromMinutes(5))); + + if (!process.HasExited) + { + process.Kill(entireProcessTree: true); // on timeout, also terminate any child processes the tool spawned + throw new TimeoutException($"{fileName} timed out after 5 minutes"); + } + + var stdout = await stdoutTask; + var stderr = await stderrTask; + + if (process.ExitCode != 0) + { + // Dump the FULL streams on failure — diagnosing build errors with + // truncated output is painful, and these only fire on test failure. + _output.WriteLine($"stdout: {stdout}"); + _output.WriteLine($"stderr: {stderr}"); + var detail = !string.IsNullOrWhiteSpace(stderr) ?
stderr : stdout; + throw new Exception($"{fileName} failed (exit {process.ExitCode}): {detail}"); + } + + if (!string.IsNullOrWhiteSpace(stdout)) + _output.WriteLine($"stdout: {stdout[..Math.Min(stdout.Length, 1000)]}"); + } + + public async ValueTask DisposeAsync() + { + if (_functionCreated) + { + try + { + _output.WriteLine($"Deleting function: {_functionName}"); + await _lambdaClient.DeleteFunctionAsync(new DeleteFunctionRequest { FunctionName = _functionName }); + } + catch (Exception ex) { _output.WriteLine($"Cleanup error (function): {ex.Message}"); } + } + + if (_externalFunctionCreated) + { + try + { + _output.WriteLine($"Deleting external function: {_externalFunctionName}"); + await _lambdaClient.DeleteFunctionAsync(new DeleteFunctionRequest { FunctionName = _externalFunctionName }); + } + catch (Exception ex) { _output.WriteLine($"Cleanup error (external function): {ex.Message}"); } + } + + if (_ecrRepoCreated) + { + try + { + _output.WriteLine($"Deleting ECR repository: {_repoName}"); + await _ecrClient.DeleteRepositoryAsync(new DeleteRepositoryRequest + { + RepositoryName = _repoName, + Force = true + }); + } + catch (Exception ex) { _output.WriteLine($"Cleanup error (ECR): {ex.Message}"); } + } + + if (_externalEcrRepoCreated) + { + try + { + _output.WriteLine($"Deleting external ECR repository: {_externalRepoName}"); + await _ecrClient.DeleteRepositoryAsync(new DeleteRepositoryRequest + { + RepositoryName = _externalRepoName, + Force = true + }); + } + catch (Exception ex) { _output.WriteLine($"Cleanup error (external ECR): {ex.Message}"); } + } + + if (_roleArn != null) + { + // Detach each policy independently — if one detach fails (e.g., the + // policy was never attached because init bailed out early) we still + // want to attempt the others and the final DeleteRole. 
+ await TryDetachManaged(_roleName, "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"); + await TryDetachManaged(_roleName, "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicDurableExecutionRolePolicy"); + + // Inline policies must be deleted (not detached) before DeleteRole succeeds. + foreach (var inline in _inlinePolicyNames) + { + await TryDeleteInline(_roleName, inline); + } + + try + { + await _iamClient.DeleteRoleAsync(new DeleteRoleRequest { RoleName = _roleName }); + } + catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM DeleteRole): {ex.Message}"); } + } + + if (_externalRoleArn != null) + { + await TryDetachManaged(_externalRoleName, "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"); + await TryDeleteInline(_externalRoleName, "SendDurableExecutionCallback"); + try + { + await _iamClient.DeleteRoleAsync(new DeleteRoleRequest { RoleName = _externalRoleName }); + } + catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM DeleteRole external): {ex.Message}"); } + } + + async Task TryDetachManaged(string roleName, string policyArn) + { + try + { + await _iamClient.DetachRolePolicyAsync(new DetachRolePolicyRequest + { + RoleName = roleName, + PolicyArn = policyArn + }); + } + catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM Detach {policyArn}): {ex.Message}"); } + } + + async Task TryDeleteInline(string roleName, string policyName) + { + try + { + await _iamClient.DeleteRolePolicyAsync(new DeleteRolePolicyRequest + { + RoleName = roleName, + PolicyName = policyName + }); + } + catch (NoSuchEntityException) { /* policy was never attached — fine */ } + catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM DeleteInline {policyName}): {ex.Message}"); } + } + } + + public static string FindTestFunctionDir(string functionDirName) + { + var dir = AppContext.BaseDirectory; + while (dir != null) + { + var candidate = Path.Combine(dir, "TestFunctions", functionDirName); + if 
(Directory.Exists(candidate)) + return candidate; + + // Also check legacy "TestFunction" location for backwards compat + var legacy = Path.Combine(dir, functionDirName); + if (Directory.Exists(legacy) && File.Exists(Path.Combine(legacy, $"{functionDirName}.csproj"))) + return legacy; + + dir = Path.GetDirectoryName(dir); + } + + // Fallback: relative from test source directory + var fallback = Path.GetFullPath( + Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "TestFunctions", functionDirName)); + if (Directory.Exists(fallback)) + return fallback; + + throw new DirectoryNotFoundException( + $"Could not find TestFunctions/{functionDirName}/ directory. Looked up from: {AppContext.BaseDirectory}"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeFailureTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeFailureTest.cs new file mode 100644 index 000000000..1b967588d --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeFailureTest.cs @@ -0,0 +1,80 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class InvokeFailureTest +{ + private readonly ITestOutputHelper _output; + public InvokeFailureTest(ITestOutputHelper output) => _output = output; + + [Fact] + public async Task InvokeAsync_ChildThrows_ParentSurfacesInvokeFailedException() + { + var (parent, downstream) = await DurableFunctionDeployment.CreateWithDownstreamAsync( + parentTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeFailureParentFunction"), + downstreamTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeFailureChildFunction"), + scenarioSuffix: "invokefail", + output: _output); + + await using (downstream) + await using (parent) + { + var (invokeResponse, executionName) = await parent.InvokeAsync("""{"orderId": "invoke-fail"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Parent response: {responsePayload}"); + + var arn = await parent.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // The parent catches InvokeFailedException and returns normally — + // the parent execution itself SUCCEEDS even though the chained + // invocation FAILED. This is the value of the SDK's exception + // surface: failure is observable but not necessarily fatal. + var status = await parent.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + var history = await parent.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.ChainedInvokeStarted) ?? false) + && (h.Events?.Any(e => e.ChainedInvokeFailedDetails != null) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + + // Exactly one chained invoke was issued and it FAILED — the parent + // did not retry the invoke (no retry semantics for InvokeAsync yet). + Assert.Equal(1, events.Count(e => e.EventType == EventType.ChainedInvokeStarted)); + var failed = events.FirstOrDefault(e => e.ChainedInvokeFailedDetails != null); + Assert.NotNull(failed); + Assert.Equal("call_failing_child", failed!.Name); + + var error = failed.ChainedInvokeFailedDetails.Error?.Payload; + Assert.NotNull(error); + // The child's exception type and message propagate through the + // service into the parent's history. Some service implementations + // record only the simple type name and others the fully-qualified + // one — match either by checking for the substring. + Assert.Contains("InvalidOperationException", error!.ErrorType ?? string.Empty); + Assert.Contains("intentional child failure", error.ErrorMessage ?? string.Empty); + + // The parent's terminal result encodes "parent-saw-" — confirms + // the parent's catch block ran AND the exception's ErrorType field + // was populated by the SDK on resume from the FAILED chained invoke. + // Without the Result assertions, a regression that left ErrorType + // null would still produce a SUCCEEDED execution (parent-saw-unknown) + // and silently pass. + var execution = await parent.GetExecutionAsync(arn!); + Assert.Null(execution.Error); + Assert.NotNull(execution.Result); + Assert.Contains("parent-saw-", execution.Result); + Assert.DoesNotContain("parent-saw-unknown", execution.Result); + Assert.Contains("InvalidOperationException", execution.Result); + } + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeHappyPathTest.cs new file mode 100644 index 000000000..4d884d24e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeHappyPathTest.cs @@ -0,0 +1,70 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class InvokeHappyPathTest +{ + private readonly ITestOutputHelper _output; + public InvokeHappyPathTest(ITestOutputHelper output) => _output = output; + + [Fact] + public async Task InvokeAsync_HappyPath_ChildResultPropagatesToParent() + { + var (parent, downstream) = await DurableFunctionDeployment.CreateWithDownstreamAsync( + parentTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeHappyPathParentFunction"), + downstreamTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeHappyPathChildFunction"), + scenarioSuffix: "invokehappy", + output: _output); + + await using (downstream) + await using (parent) + { + var (invokeResponse, executionName) = await parent.InvokeAsync("""{"orderId": "invoke-happy"}"""); + Assert.Equal(200, invokeResponse.StatusCode); + + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Parent response: {responsePayload}"); + + // Locate the parent execution and wait for terminal status. Chained + // invoke suspends the parent — the synchronous Invoke response + // carries no data — so we drive completion via the listing API. + var arn = await parent.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await parent.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The chained invoke's result surfaces in the parent's history as a + // ChainedInvokeSucceeded event. The parent then returns that result + // verbatim from its workflow. + var history = await parent.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.ChainedInvokeStarted) ?? 
false) + && (h.Events?.Any(e => e.ChainedInvokeSucceededDetails != null) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + var started = events.FirstOrDefault(e => e.EventType == EventType.ChainedInvokeStarted); + Assert.NotNull(started); + Assert.Equal(downstream.FunctionArn + ":$LATEST", started!.ChainedInvokeStartedDetails.FunctionName); + + var succeeded = events.FirstOrDefault(e => e.ChainedInvokeSucceededDetails != null); + Assert.NotNull(succeeded); + // The child returned the JSON-encoded string "got-42". + var childPayload = succeeded!.ChainedInvokeSucceededDetails.Result?.Payload?.Trim('"'); + Assert.Equal("got-42", childPayload); + + // Name is compared against the literal "call_child" — presumably the operation + // name the parent passes to InvokeAsync (confirm in the parent function). The target + // function's identity is asserted above via ChainedInvokeStartedDetails.FunctionName. + Assert.Equal("call_child", succeeded.Name); + } + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeReplayDeterminismTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeReplayDeterminismTest.cs new file mode 100644 index 000000000..9be5eeecb --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeReplayDeterminismTest.cs @@ -0,0 +1,122 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class InvokeReplayDeterminismTest +{ + private readonly ITestOutputHelper _output; + public InvokeReplayDeterminismTest(ITestOutputHelper output) => _output = output; + + [Fact] + public async Task InvokeAsync_ReplayDeterminism_OperationIdsStableAcrossInvocations() + { + var (parent, downstream) = await DurableFunctionDeployment.CreateWithDownstreamAsync( + parentTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeReplayDeterminismParentFunction"), + downstreamTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeReplayDeterminismChildFunction"), + scenarioSuffix: "invokerply", + output: _output); + + await using (downstream) + await using (parent) + { + var (invokeResponse, executionName) = await parent.InvokeAsync("""{"orderId": "invoke-replay"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Parent response: {responsePayload}"); + + var arn = await parent.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await parent.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // History is eventually consistent — wait until both step-succeeded + // events AND the chained-invoke-succeeded event are visible. + var history = await parent.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2 + && (h.Events?.Any(e => e.ChainedInvokeSucceededDetails != null) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + + // Each step ran exactly once across the entire workflow — proves + // the chained invoke's suspend/resume cycle did NOT cause the + // pre-invoke step to re-execute. (Replay returned the cached + // checkpoint instead.) + var stepSucceededByName = events + .Where(e => e.StepSucceededDetails != null) + .GroupBy(e => e.Name) + .ToDictionary(g => g.Key!, g => g.Count()); + Assert.Equal(1, stepSucceededByName["before_invoke"]); + Assert.Equal(1, stepSucceededByName["after_invoke"]); + + // Exactly ONE chained invoke fired — replay didn't double-fire + // the InvokeAsync. Same invariant we check for steps. + Assert.Equal(1, events.Count(e => e.EventType == EventType.ChainedInvokeStarted)); + Assert.Equal(1, events.Count(e => e.ChainedInvokeSucceededDetails != null)); + + var beforeInvokeEvent = events.First(e => e.StepSucceededDetails != null && e.Name == "before_invoke"); + var generatedGuid = beforeInvokeEvent.StepSucceededDetails.Result?.Payload?.Trim('"'); + Assert.NotNull(generatedGuid); + Assert.True(Guid.TryParse(generatedGuid, out _), + $"before_invoke should produce a valid GUID, got: {generatedGuid}"); + + // The downstream's echo carries through to after_invoke verbatim, + // proving the cached chained-invoke result was used on resume. + var chainedSucceeded = events.First(e => e.ChainedInvokeSucceededDetails != null); + var chainedPayload = chainedSucceeded.ChainedInvokeSucceededDetails.Result?.Payload?.Trim('"'); + Assert.Equal($"echoed:{generatedGuid}", chainedPayload); + + var afterInvokeEvent = events.First(e => e.StepSucceededDetails != null && e.Name == "after_invoke"); + var afterPayload = afterInvokeEvent.StepSucceededDetails.Result?.Payload?.Trim('"'); + Assert.Equal($"final:echoed:{generatedGuid}", afterPayload); + + // The chained invoke's suspend/resume forced at least 2 invocations + // of the parent — proves replay actually happened (not just a + // single straight-through execution that skipped suspension). 
+ var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected at least 2 InvocationCompleted events (proves replay happened), got {invocations.Count}"); + + // Operation IDs are stable across all replays of the same logical + // position. The Started event and the corresponding Succeeded event + // for each operation share the same ID — that's the clearest + // observable proof the SDK's deterministic ID generator is working. + // The SDK hashes "<counter>" at the root, so each ID is a + // 64-char lowercase hex SHA-256 digest. + var startedIds = events + .Where(e => e.EventType == EventType.StepStarted || e.EventType == EventType.ChainedInvokeStarted) + .Select(e => (e.Name, Id: e.Id)) + .ToList(); + var succeededIds = events + .Where(e => e.StepSucceededDetails != null || e.ChainedInvokeSucceededDetails != null) + .Select(e => (e.Name, Id: e.Id)) + .ToList(); + + // All operation IDs are populated and look like SHA-256 hex digests. + foreach (var (name, id) in startedIds) + { + Assert.False(string.IsNullOrEmpty(id), $"Operation '{name}' has no Id on its Started event"); + Assert.Equal(64, id!.Length); + Assert.Matches("^[0-9a-f]{64}$", id); + } + + // Every started operation ID must appear in a succeeded event — + // proves the deterministic IDs from the Start path matched the IDs + // the service used to record the terminal event. 
/// <summary>
/// Integration test for chained invokes that carry a tenant ID: deploys a
/// parent function plus a tenancy-enabled downstream function and inspects
/// the parent's execution history.
/// </summary>
public class InvokeWithTenantIdTest
{
    private readonly ITestOutputHelper _output;

    public InvokeWithTenantIdTest(ITestOutputHelper output) => _output = output;

    /// <summary>
    /// Invokes the parent, waits for the durable execution to reach SUCCEEDED,
    /// then asserts that the ChainedInvokeStarted event carries the tenant ID
    /// "test-tenant" and that the chained invoke produced the expected result.
    /// </summary>
    [Fact]
    public async Task InvokeAsync_WithTenantId_PropagatesToChainedInvokeOptions()
    {
        var (parent, downstream) = await DurableFunctionDeployment.CreateWithDownstreamAsync(
            parentTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeWithTenantIdFunction"),
            downstreamTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeChildTenantFunction"),
            scenarioSuffix: "invoketenant",
            output: _output,
            // The downstream must be PER_TENANT for the service to accept a
            // chained invoke carrying a TenantId. The parent stays default.
            enableDownstreamTenancy: true);

        await using (downstream)
        await using (parent)
        {
            var (invokeResponse, executionName) = await parent.InvokeAsync("""{"orderId": "tenant-test"}""");
            Assert.Equal(200, invokeResponse.StatusCode);

            var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
            _output.WriteLine($"Parent response: {responsePayload}");

            var arn = await parent.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
            Assert.NotNull(arn);

            var status = await parent.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
            Assert.Equal("SUCCEEDED", status, ignoreCase: true);

            // Wait for BOTH events asserted below. Waiting only for the
            // started event lets the wait return before the succeeded event
            // is visible in the history.
            var history = await parent.WaitForHistoryAsync(
                arn!,
                h => (h.Events?.Any(e => e.EventType == EventType.ChainedInvokeStarted) ?? false)
                     && (h.Events?.Any(e => e.ChainedInvokeSucceededDetails != null) ?? false),
                TimeSpan.FromSeconds(60));
            // Target-typed new: an empty list of whatever type Events is.
            var events = history.Events ?? new();

            var started = events.FirstOrDefault(e => e.EventType == EventType.ChainedInvokeStarted);
            Assert.NotNull(started);

            // The tenant ID flows through ChainedInvokeOptions -> service ->
            // ChainedInvokeStartedDetails. This is the load-bearing assertion:
            // it proves the SDK's InvokeConfig.TenantId reaches the wire.
            Assert.Equal("test-tenant", started!.ChainedInvokeStartedDetails.TenantId);

            // The chained call still produced a result — proves nothing in the
            // tenant-routing path silently dropped the invocation.
            var succeeded = events.FirstOrDefault(e => e.ChainedInvokeSucceededDetails != null);
            Assert.NotNull(succeeded);
            // The payload is a JSON string; strip the surrounding quotes.
            var childPayload = succeeded!.ChainedInvokeSucceededDetails.Result?.Payload?.Trim('"');
            Assert.Equal("tenant-aware-7", childPayload);
        }
    }
}
///
[Fact]
public async Task FailsFiveTimesThenSucceeds_AttemptCounterIsMonotonic()
{
    // Name of the retried step as it appears on history events.
    const string StepName = "long_retry_step";

    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("LongRetryChainFunction"),
        "longretry", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}""");
    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    // Total retry delay budget: 1+2+3+4+5 = 15s. Allow generous headroom.
    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180));
    Assert.Equal("SUCCEEDED", status, ignoreCase: true);

    // Wait for every event class asserted below, including the five
    // failures, so the assertions never run against a partial history.
    var history = await deployment.WaitForHistoryAsync(
        arn!,
        h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 6
             && (h.Events?.Count(e => e.StepFailedDetails != null) ?? 0) >= 5
             && (h.Events?.Any(e => e.StepSucceededDetails != null) ?? false),
        TimeSpan.FromSeconds(60));
    // Target-typed new: an empty list of whatever type Events is.
    var events = history.Events ?? new();

    // Six attempts total: five failures + one success.
    Assert.Equal(6, events.Count(e => e.EventType == EventType.StepStarted));
    Assert.Equal(5, events.Count(e => e.StepFailedDetails != null && e.Name == StepName));
    var succeeded = events.SingleOrDefault(e => e.StepSucceededDetails != null && e.Name == StepName);
    Assert.NotNull(succeeded);

    // The user-facing AttemptNumber on the final (winning) attempt was 6 —
    // proves IStepContext.AttemptNumber tracks the wire attempt counter
    // across invocations, not just within a single invocation.
    Assert.Equal("\"ok on attempt 6\"", succeeded!.StepSucceededDetails.Result?.Payload);

    // Each failure carries a unique per-attempt message — confirms the user-side
    // counter incremented exactly once per invocation, no duplicates or skips.
    var failureMessages = events
        .Where(e => e.StepFailedDetails != null && e.Name == StepName)
        .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty)
        .ToList();
    Assert.Equal(5, failureMessages.Count);
    for (int i = 1; i <= 5; i++)
    {
        Assert.Contains(failureMessages, m => m.Contains($"attempt {i}"));
    }

    // The chain was executed across multiple invocations (proves the
    // service actually re-invoked us between retries instead of holding
    // a single Lambda alive through all six attempts).
    var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList();
    Assert.True(
        invocations.Count >= 5,
        $"Expected at least 5 InvocationCompleted events (one per retry boundary), got {invocations.Count}");
}
/// <summary>
/// Integration test for a workflow that runs a step, a durable wait named
/// "long_wait", and a second step.
/// </summary>
public class LongerWaitTest
{
    private readonly ITestOutputHelper _output;

    public LongerWaitTest(ITestOutputHelper output) => _output = output;

    /// <summary>
    /// Runs the workflow to SUCCEEDED and asserts that both steps were
    /// checkpointed with the expected results, that the wait was recorded
    /// with a duration of 15, and that at least two invocations completed.
    /// </summary>
    [Fact]
    public async Task LongerWait_ExpiresAndCompletes()
    {
        await using var deployment = await DurableFunctionDeployment.CreateAsync(
            DurableFunctionDeployment.FindTestFunctionDir("LongerWaitFunction"),
            "longwait", _output);

        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "long-wait-test"}""");
        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
        _output.WriteLine($"Response: {responsePayload}");

        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
        Assert.NotNull(arn);

        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(90));
        Assert.Equal("SUCCEEDED", status, ignoreCase: true);

        // Wait until both steps and the wait completion are visible.
        var history = await deployment.WaitForHistoryAsync(
            arn!,
            h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 2
                 && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2
                 && (h.Events?.Any(e => e.WaitSucceededDetails != null) ?? false),
            TimeSpan.FromSeconds(60));
        // Target-typed new: an empty list of whatever type Events is.
        var events = history.Events ?? new();

        Assert.Equal(2, events.Count(e => e.EventType == EventType.StepStarted));

        // Steps before and after the wait both ran, with the post-wait step seeing
        // the pre-wait step's value via replay.
        var stepResults = events
            .Where(e => e.StepSucceededDetails != null)
            // Payloads are JSON strings; strip the surrounding quotes.
            .Select(e => (Name: e.Name, Payload: e.StepSucceededDetails.Result?.Payload?.Trim('"')))
            .ToList();
        Assert.Equal(2, stepResults.Count);
        Assert.Equal("before_wait", stepResults[0].Name);
        Assert.Equal("started-long-wait-test", stepResults[0].Payload);
        Assert.Equal("after_wait", stepResults[1].Name);
        Assert.Equal("after_wait-started-long-wait-test", stepResults[1].Payload);

        // The wait was checkpointed for the configured 15-second duration.
        var waitStarted = events.FirstOrDefault(e => e.WaitStartedDetails != null && e.Name == "long_wait");
        Assert.NotNull(waitStarted);
        Assert.Equal(15, waitStarted!.WaitStartedDetails.Duration);

        // The wait spanned at least two invocations: one to schedule it and at
        // least one to resume after the timer fires.
        var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList();
        Assert.True(
            invocations.Count >= 2,
            $"Expected at least 2 InvocationCompleted events (suspend + resume), got {invocations.Count}");
    }
}
/// Validates the failure-tolerance short-circuit and that
/// MapException (not ParallelException) propagates as the
/// workflow's terminal error.
///
[Fact]
public async Task Map_FailureToleranceExceeded_FailsWorkflow()
{
    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("MapFailureToleranceFunction"),
        "mtol", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m3"}""");
    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    // Failed workflows return null payload to the Invoke caller — locate the
    // execution by name to inspect its terminal status.
    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
    Assert.Equal("FAILED", status, ignoreCase: true);

    var execution = await deployment.GetExecutionAsync(arn!);
    Assert.NotNull(execution.Error);
    // MapException is the terminal error type the SDK throws when the
    // failure-tolerance short-circuit fires.
    var errorType = execution.Error.ErrorType ?? string.Empty;
    var errorMessage = execution.Error.ErrorMessage ?? string.Empty;
    Assert.True(
        errorType.Contains("MapException", StringComparison.Ordinal)
        || errorMessage.Contains("Map", StringComparison.OrdinalIgnoreCase),
        $"Expected error to indicate MapException; got type='{errorType}' message='{errorMessage}'");

    // History: parent CONTEXT and at least 2 failed item contexts visible.
    // The parent's own failure is part of the wait condition because it is
    // asserted below; otherwise the wait could return after two item
    // failures but before the parent failure is visible.
    var history = await deployment.WaitForHistoryAsync(
        arn!,
        h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 3
             && (h.Events?.Count(e => e.EventType == EventType.ContextFailed) ?? 0) >= 2
             && (h.Events?.Any(e => e.EventType == EventType.ContextFailed && e.Name == "tolerance") ?? false),
        TimeSpan.FromSeconds(60));
    // Target-typed new: an empty list of whatever type Events is.
    var events = history.Events ?? new();

    // Count once and reuse the value in both the check and the message.
    var failedCount = events.Count(e => e.EventType == EventType.ContextFailed);
    Assert.True(
        failedCount >= 2,
        $"Expected >= 2 ContextFailed events; got {failedCount}");

    // The parent context (named "tolerance") records the aggregate failure.
    var parentFailed = events.FirstOrDefault(e =>
        e.EventType == EventType.ContextFailed && e.Name == "tolerance");
    Assert.NotNull(parentFailed);
}
///
[Fact]
public async Task Map_FirstSuccessful_ShortCircuitsOnFirstWin()
{
    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("MapFirstSuccessfulFunction"),
        "mfirst", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m4"}""");
    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    // Wait timer = 8s, plus invocation overhead. Generous timeout for CI variance.
    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
    Assert.Equal("SUCCEEDED", status, ignoreCase: true);

    // The workflow's own summary of the race, read from the invoke response.
    using var doc = JsonDocument.Parse(responsePayload);
    var winnerIndex = doc.RootElement.GetProperty("WinnerIndex").GetInt32();
    var winnerName = doc.RootElement.GetProperty("WinnerName").GetString();
    var completionReason = doc.RootElement.GetProperty("CompletionReason").GetString();
    var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32();

    // At least one item succeeded — the workflow short-circuited as soon as
    // the first win materialised. The fastest item is index 1 (1s wait).
    Assert.True(successCount >= 1, $"Expected >= 1 successful item, got {successCount}");
    Assert.True(winnerIndex >= 0 && winnerIndex < 4,
        $"WinnerIndex should be a valid item index, got {winnerIndex}");
    Assert.NotNull(winnerName);
    Assert.NotEqual("FailureToleranceExceeded", completionReason);

    // Service-side: the parent CONTEXT and at least the winning item CONTEXT
    // succeeded. Other items' final state is timing-dependent (the
    // orchestrator does not cancel in-flight units on short-circuit).
    // Both asserted events are part of the wait condition, so the winner
    // check below cannot run before the winner's event is visible.
    var history = await deployment.WaitForHistoryAsync(
        arn!,
        h => (h.Events?.Any(e => e.EventType == EventType.ContextSucceeded && e.Name == "race") ?? false)
             && (h.Events?.Any(e => e.EventType == EventType.ContextSucceeded && e.Name == winnerName) ?? false),
        TimeSpan.FromSeconds(60));
    // Target-typed new: an empty list of whatever type Events is.
    var events = history.Events ?? new();

    var parentSucceeded = events.FirstOrDefault(e =>
        e.EventType == EventType.ContextSucceeded && e.Name == "race");
    Assert.NotNull(parentSucceeded);

    // The winning item's CONTEXT SUCCEEDED is in the history.
    Assert.Contains(events, e => e.EventType == EventType.ContextSucceeded && e.Name == winnerName);
}
/// Verifies the Flat-specific contract against the real
/// durable-execution service:
///   1. NO per-item CONTEXT events are emitted — only the parent Map CONTEXT.
///   2. Each item's inner step/wait ops RE-PARENT to the Map op (the nearest
///      non-virtual ancestor), since the virtual item emits no CONTEXT
///      checkpoint to reference as a parent.
///   3. Inner-op ids are still derived from the item op id space.
///   4. The per-item result survives replay (read back from the inline parent
///      payload, not a per-item checkpoint).
///
[Fact]
public async Task Map_Flat_SuppressesItemContexts_AndReparentsInnerOps()
{
    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("MapFlatNestingFunction"),
        "mflat", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "mf1"}""");
    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
    Assert.Equal("SUCCEEDED", status, ignoreCase: true);

    // The map parent is the first root-level operation -> SHA256("1").
    var parentOpId = HashOpId("1");
    var itemOpIds = new[]
    {
        HashOpId($"{parentOpId}-1"),
        HashOpId($"{parentOpId}-2"),
        HashOpId($"{parentOpId}-3"),
    };
    // Each item's "generate" step is the 1st inner op under that item's own
    // id space: SHA256("<itemOpId>-1").
    var expectedStepIds = itemOpIds.Select(i => HashOpId($"{i}-1")).ToList();

    // Wait until the parent CONTEXT started and succeeded and all three
    // items' inner step + wait events are visible. ContextStarted is part of
    // the condition because assertion 1 compares the started ids exactly.
    var history = await deployment.WaitForHistoryAsync(
        arn!,
        h =>
        {
            // Target-typed new: an empty list of whatever type Events is.
            var seen = h.Events ?? new();
            return seen.Any(e => e.EventType == EventType.ContextStarted)
                   && seen.Any(e => e.EventType == EventType.ContextSucceeded)
                   && seen.Count(e => e.EventType == EventType.StepSucceeded) >= 3
                   && seen.Count(e => e.EventType == EventType.WaitSucceeded) >= 3;
        },
        TimeSpan.FromSeconds(60));
    var allEvents = history.Events ?? new();

    // 1. Exactly ONE CONTEXT operation exists — the parent Map op. No per-item
    //    CONTEXT events under Flat.
    var contextStartedIds = allEvents
        .Where(e => e.EventType == EventType.ContextStarted)
        .Select(e => e.Id)
        .Distinct()
        .ToList();
    Assert.Equal(new[] { parentOpId }, contextStartedIds);
    Assert.DoesNotContain(allEvents, e =>
        e.EventType == EventType.ContextStarted && itemOpIds.Contains(e.Id));

    // 2. Each item's "generate" step re-parents to the Map op (NOT to its
    //    virtual item op).
    var generateSteps = allEvents
        .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate")
        .ToList();
    Assert.Equal(3, generateSteps.Count);
    Assert.All(generateSteps, e => Assert.Equal(parentOpId, e.ParentId));

    // 3. ...but the step ids are still derived from the per-item id space, so
    //    the three items' first steps are distinct and match the expected
    //    SHA256("<itemOpId>-1") values.
    var observedStepIds = generateSteps.Select(e => e.Id).Distinct().ToList();
    Assert.Equal(3, observedStepIds.Count);
    foreach (var expected in expectedStepIds)
    {
        Assert.Contains(expected, observedStepIds);
    }

    // 4. The wait events span at least 2 invocations (suspend + resume),
    //    proving replay actually happened with no per-item checkpoint.
    var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList();
    Assert.True(
        invocations.Count >= 2,
        $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}");

    // 5. The user-visible response carries the joined per-item results.
    Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase);
}
+ Assert.Contains("order-1-m1", responsePayload); + Assert.Contains("order-2-m1", responsePayload); + Assert.Contains("order-3-m1", responsePayload); + + // History is eventually consistent — wait until the parent CONTEXT and + // all three item CONTEXT checkpoints are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4 + && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 4, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // Parent + 3 items = 4 ContextStarted, 4 ContextSucceeded. + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted)); + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextSucceeded)); + + // The three items show up by their ItemNamer name on their own + // ContextStarted events. + var startedNames = events + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Name) + .ToList(); + Assert.Contains("process_all", startedNames); + Assert.Contains("item-order-1", startedNames); + Assert.Contains("item-order-2", startedNames); + Assert.Contains("item-order-3", startedNames); + + // Each item ran one step => 3 StepSucceeded. + Assert.Equal(3, events.Count(e => e.EventType == EventType.StepSucceeded)); + + // No item failed. 
/// <summary>
/// Integration test for map dispatch throttling under a MaxConcurrency limit.
/// </summary>
public class MapMaxConcurrencyTest
{
    private readonly ITestOutputHelper _output;

    public MapMaxConcurrencyTest(ITestOutputHelper output)
    {
        _output = output;
    }

    /// <summary>
    /// 6 items, each with a 2-second durable wait, MaxConcurrency = 2. Checks
    /// that the per-item timestamps reported by the workflow are spread over
    /// time instead of all landing together. The tolerances are deliberately
    /// loose to keep CI stable; the load-bearing assertion is
    /// "not all 6 ran at once".
    /// </summary>
    [Fact]
    public async Task Map_MaxConcurrency_ThrottlesItemDispatch()
    {
        var functionDir = DurableFunctionDeployment.FindTestFunctionDir("MapMaxConcurrencyFunction");
        await using var deployment = await DurableFunctionDeployment.CreateAsync(
            functionDir, "mmaxc", _output);

        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m5"}""");
        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
        _output.WriteLine($"Response: {responsePayload}");

        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
        Assert.NotNull(arn);

        // 3 waves x 2s waits + invocation overhead. Allow generous headroom.
        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180));
        Assert.Equal("SUCCEEDED", status, ignoreCase: true);

        using var payloadJson = JsonDocument.Parse(responsePayload);
        var root = payloadJson.RootElement;
        Assert.Equal(6, root.GetProperty("SuccessCount").GetInt32());

        var timestamps = root.GetProperty("Timestamps")
            .EnumerateArray()
            .Select(element => element.GetInt64())
            .ToList();
        Assert.Equal(6, timestamps.Count);

        // Offsets from the earliest timestamp, in ascending order.
        var ascending = timestamps.OrderBy(ts => ts).ToList();
        var earliest = ascending[0];
        var relative = ascending.Select(ts => ts - earliest).ToList();
        var joined = string.Join(", ", relative);
        _output.WriteLine($"Relative timestamps (ms): {joined}");

        // Tolerant clustering: with MaxConcurrency=2 and 2s waits, the first wave
        // should hold ~2 items. Strict 3-wave clustering can be flaky under
        // service jitter, so we assert the weaker (still meaningful) property:
        // not all 6 items fired in the same wave.
        var firstWave = relative.Count(offset => offset < 1500);
        Assert.True(firstWave <= 3,
            $"Expected MaxConcurrency=2 to limit the first wave to ~2 items; got {firstWave} within 1500ms of start. Relative timestamps: [{joined}]");

        // The full set must span at least one wave-gap (~2s) — proving items did
        // NOT all run at once. The last offset equals latest minus earliest.
        var total = relative[^1];
        Assert.True(total >= 1500,
            $"Expected items to span >= 1500ms (proves throttling); got {total}ms. Relative timestamps: [{joined}]");
    }
}
///
[Fact]
public async Task Map_PartialFailure_DefaultIsPermissive_ReportsCounts()
{
    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("MapPartialFailureFunction"),
        "mpartial", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m2"}""");
    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
    // Permissive default means partial failure is NOT a workflow failure —
    // the workflow accepted the failure and returned a result.
    Assert.Equal("SUCCEEDED", status, ignoreCase: true);

    // The workflow's own tally, read from the invoke response.
    using var doc = JsonDocument.Parse(responsePayload);
    var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32();
    var failureCount = doc.RootElement.GetProperty("FailureCount").GetInt32();
    var errorSummary = doc.RootElement.GetProperty("ErrorSummary").GetString();

    Assert.Equal(2, successCount);
    Assert.Equal(1, failureCount);
    Assert.NotNull(errorSummary);
    Assert.Contains("intentional partial failure", errorSummary);

    // History: 1 parent + 3 items = 4 ContextStarted; 3 ContextSucceeded
    // (parent + 2 ok items); 1 ContextFailed (the boom item).
    var history = await deployment.WaitForHistoryAsync(
        arn!,
        h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4
             && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? false)
             && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 3,
        TimeSpan.FromSeconds(60));
    // Target-typed new: an empty list of whatever type Events is.
    var events = history.Events ?? new();

    Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted));
    Assert.Equal(3, events.Count(e => e.EventType == EventType.ContextSucceeded));

    // Assert.Single both checks that exactly one ContextFailed event exists
    // and returns it.
    // The failing item's checkpoint preserves the exception message. Its
    // branch name is the default index ("1", the middle item).
    var failedEvent = Assert.Single(events, e => e.EventType == EventType.ContextFailed);
    Assert.Equal("1", failedEvent.Name);
    Assert.Contains("intentional partial failure",
        failedEvent.ContextFailedDetails?.Error?.Payload?.ErrorMessage ?? string.Empty);
}
+ /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// Three map items, each containing a step + a durable wait (the wait forces + /// a suspend/resume cycle so the map actually replays). Verifies: + /// 1. The item operation IDs match the deterministic + /// SHA256("<parentId>-<n>") formula (the same one used by + /// OperationIdGenerator.CreateChild and the reference Java/JS/Python SDKs). + /// 2. Each item's user-visible step result is preserved across replay (the + /// GUID generated inside generate survives suspend/resume). + /// + [Fact] + public async Task Map_ItemOperationIds_AreDeterministic_AcrossReplay() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapReplayDeterminismFunction"), + "mreplay", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m6"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The map parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var expectedItemIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + + // Wait until each item's CONTEXT SUCCEEDED is visible AND each item's + // step/wait events are visible (they live under the item operation IDs). 
+ var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List<Event>(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 4) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List<Event>(); + + // 1. Item operation IDs match the deterministic hash. + var itemStartedEvents = allEvents + .Where(e => e.EventType == EventType.ContextStarted && e.Id != null && e.Id != parentOpId) + .ToList(); + var observedItemIds = itemStartedEvents.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedItemIds.Count); + foreach (var expected in expectedItemIds) + { + Assert.Contains(expected, observedItemIds); + } + + // 2. Each item's CONTEXT succeeded (parent named "fanout" excluded). + var itemSucceededEvents = allEvents + .Where(e => e.EventType == EventType.ContextSucceeded && e.Name != "fanout") + .ToList(); + Assert.Equal(3, itemSucceededEvents.Count); + + // 3. Each item's "generate" step succeeded exactly once — proving replay + // returned the cached step result rather than re-executing. + var stepSucceededEvents = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, stepSucceededEvents.Count); + + // 4. The wait events span at least 2 invocations (suspend + resume), + // proving replay actually happened. + var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 5. The user-visible response contains the per-item step results + // (proving they survived replay). 
+ Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MultipleStepsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MultipleStepsTest.cs new file mode 100644 index 000000000..73fdbf0e3 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MultipleStepsTest.cs @@ -0,0 +1,62 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MultipleStepsTest +{ + private readonly ITestOutputHelper _output; + public MultipleStepsTest(ITestOutputHelper output) => _output = output; + + [Fact] + public async Task MultipleSteps_AllCheckpointed() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MultipleStepsFunction"), + "multi", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "chain"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // History is eventually consistent — the execution can be SUCCEEDED before + // all events are indexed. Wait until all 5 step-started and all 5 step-succeeded events are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 5 + && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 
0) >= 5, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List<Event>(); + + Assert.Equal(5, events.Count(e => e.EventType == EventType.StepStarted)); + + // Each step ran exactly once (no replay-induced duplicates) in declaration order, + // and each step's output chained from the previous one. + var stepResults = events + .Where(e => e.StepSucceededDetails != null) + .Select(e => $"{e.Name}={e.StepSucceededDetails.Result?.Payload?.Trim('"')}") + .ToList(); + Assert.Equal( + new[] + { + "step_1=a-chain", + "step_2=a-chain-b", + "step_3=a-chain-b-c", + "step_4=a-chain-b-c-d", + "step_5=a-chain-b-c-d-e", + }, + stepResults); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs new file mode 100644 index 000000000..77305ebef --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs @@ -0,0 +1,70 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFailureToleranceTest +{ + private readonly ITestOutputHelper _output; + public ParallelFailureToleranceTest(ITestOutputHelper output) => _output = output; + + /// <summary> + /// Five branches, two fail, ToleratedFailureCount=1. The parallel must surface a + /// <c>ParallelException</c> with reason + /// <c>FailureToleranceExceeded</c>; the workflow must + /// terminate FAILED. Validates the failure-tolerance short-circuit and that + /// ParallelException propagates as the workflow's terminal error. 
+ /// </summary> + [Fact] + public async Task Parallel_FailureToleranceExceeded_FailsWorkflow() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelFailureToleranceFunction"), + "ptol", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p3"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + // Failed workflows return null payload to the Invoke caller — locate the + // execution by name to inspect its terminal status. + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("FAILED", status, ignoreCase: true); + + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + // ParallelException is the terminal error type the SDK throws when the + // failure-tolerance short-circuit fires. + var errorType = execution.Error.ErrorType ?? string.Empty; + var errorMessage = execution.Error.ErrorMessage ?? string.Empty; + Assert.True( + errorType.Contains("ParallelException", StringComparison.Ordinal) + || errorMessage.Contains("Parallel", StringComparison.OrdinalIgnoreCase), + $"Expected error to indicate ParallelException; got type='{errorType}' message='{errorMessage}'"); + + // History: parent CONTEXT and at least 2 failed branch contexts visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 3 + && (h.Events?.Count(e => e.EventType == EventType.ContextFailed) ?? 0) >= 2, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List<Event>(); + + // At least 2 branches failed (the third may or may not have been + // dispatched depending on race; the parent CONTEXT itself also fails). + Assert.True( + events.Count(e => e.EventType == EventType.ContextFailed) >= 2, + $"Expected >= 2 ContextFailed events; got {events.Count(e => e.EventType == EventType.ContextFailed)}"); + + // The parent context (named "tolerance") records the aggregate failure. + var parentFailed = events.FirstOrDefault(e => + e.EventType == EventType.ContextFailed && e.Name == "tolerance"); + Assert.NotNull(parentFailed); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs new file mode 100644 index 000000000..fedc538fb --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs @@ -0,0 +1,81 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFirstSuccessfulTest +{ + private readonly ITestOutputHelper _output; + public ParallelFirstSuccessfulTest(ITestOutputHelper output) => _output = output; + + /// <summary> + /// Four branches with staggered durable waits, FirstSuccessful: as + /// soon as one branch completes, the parallel resolves. In-flight branches + /// remain in <c>Started</c> rather than being + /// cancelled. Validates the cross-cutting decision: orphan branches are NOT + /// cancelled, and short-circuit reports them as Started. 
+ /// </summary> + [Fact] + public async Task Parallel_FirstSuccessful_ShortCircuitsOnFirstWin() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelFirstSuccessfulFunction"), + "pfirst", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p4"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // Wait timer = 8s, plus invocation overhead. Generous timeout for + // CI variance. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The workflow's response payload reports the winning branch. + using var doc = JsonDocument.Parse(responsePayload); + var winnerIndex = doc.RootElement.GetProperty("WinnerIndex").GetInt32(); + var winnerName = doc.RootElement.GetProperty("WinnerName").GetString(); + var completionReason = doc.RootElement.GetProperty("CompletionReason").GetString(); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + + // At least one branch succeeded — the workflow short-circuited as soon + // as the first win materialised. + Assert.True(successCount >= 1, $"Expected >= 1 successful branch, got {successCount}"); + Assert.True(winnerIndex >= 0 && winnerIndex < 4, + $"WinnerIndex should be a valid branch index, got {winnerIndex}"); + Assert.NotNull(winnerName); + + // CompletionReason is MinSuccessfulReached only if some branch was left + // un-dispatched at the time the threshold was met. With unbounded + // concurrency every branch dispatches immediately, so the reason is + // AllCompleted (all dispatched branches finished). 
Either reason is + // acceptable — just ensure it isn't FailureToleranceExceeded. + Assert.NotEqual("FailureToleranceExceeded", completionReason); + + // Service-side: the parent CONTEXT and at least one branch CONTEXT + // succeeded. Other branches' final state is timing-dependent — they + // could be Started (left in flight) or Succeeded (completed before + // the parent's CONTEXT SUCCEED was flushed). The orchestrator + // deliberately does not cancel in-flight branches once the + // short-circuit fires. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.ContextSucceeded && e.Name == "race") ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List<Event>(); + + var parentSucceeded = events.FirstOrDefault(e => + e.EventType == EventType.ContextSucceeded && e.Name == "race"); + Assert.NotNull(parentSucceeded); + + // The winning branch's CONTEXT SUCCEEDED is in the history. + Assert.Contains(events, e => e.EventType == EventType.ContextSucceeded && e.Name == winnerName); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs new file mode 100644 index 000000000..0f3450aa2 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs @@ -0,0 +1,135 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFlatNestingTest +{ + private readonly ITestOutputHelper _output; + public ParallelFlatNestingTest(ITestOutputHelper output) => _output = output; + + /// <summary> + /// Reproduces the deterministic operation ID the SDK assigns. 
Branch op ids + /// are SHA-256(parentOpId + "-" + (index+1)); inner-op ids nest the same way + /// under the branch op id. Reproduced locally because OperationIdGenerator is + /// internal to the SDK. + /// </summary> + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// <summary> + /// End-to-end <c>NestingType.Flat</c> parallel: three branches, each + /// with a step + a durable wait (the wait forces a suspend/resume cycle so the + /// parallel actually replays). Verifies the Flat-specific contract against the + /// real durable-execution service: + /// 1. NO per-branch CONTEXT events are emitted — only the parent Parallel + /// CONTEXT. (Under Nested there would be 4 ContextStarted; under Flat, + /// exactly 1.) + /// 2. Each branch's inner step/wait ops RE-PARENT to the Parallel op (the + /// nearest non-virtual ancestor), since the virtual branch emits no + /// CONTEXT checkpoint to reference as a parent. + /// 3. Inner-op ids are still derived from the branch op id (so the two + /// branches' first steps don't collide), even though they report the + /// Parallel op as parent. + /// 4. The per-branch result survives replay (the GUID generated inside + /// <c>generate</c> is preserved across suspend/resume — read back from the + /// inline parent payload, not a per-branch checkpoint). 
+ /// </summary> + [Fact] + public async Task Parallel_Flat_SuppressesBranchContexts_AndReparentsInnerOps() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelFlatNestingFunction"), + "pflat", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "pf1"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The parallel parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var branchOpIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + // Each branch's "generate" step is the 1st inner op under that branch's + // own id space: SHA256("<branchOpId>-1"). + var expectedStepIds = branchOpIds.Select(b => HashOpId($"{b}-1")).ToList(); + + // Wait until the parent CONTEXT succeeded and all three branches' inner + // step + wait events are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List<Event>(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 1) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List<Event>(); + + // 1. Exactly ONE CONTEXT operation exists — the parent Parallel op. No + // per-branch CONTEXT events under Flat. 
+ var contextStartedIds = allEvents + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Id) + .Distinct() + .ToList(); + Assert.Equal(new[] { parentOpId }, contextStartedIds); + Assert.Empty(allEvents.Where(e => + e.EventType == EventType.ContextStarted && branchOpIds.Contains(e.Id))); + + // 2. Each branch's "generate" step re-parents to the Parallel op (NOT to + // its virtual branch op). + var generateSteps = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, generateSteps.Count); + Assert.All(generateSteps, e => Assert.Equal(parentOpId, e.ParentId)); + + // 3. ...but the step ids are still derived from the per-branch id space, + // so the three branches' first steps are distinct and match the expected + // SHA256("<branchOpId>-1") values. + var observedStepIds = generateSteps.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedStepIds.Count); + foreach (var expected in expectedStepIds) + { + Assert.Contains(expected, observedStepIds); + } + + // 4. The "generate" step succeeded exactly once per branch — proving + // replay returned the cached result rather than re-executing. + Assert.Equal(3, generateSteps.Count); + + // 5. The wait events span at least 2 invocations (suspend + resume), + // proving replay actually happened with no per-branch checkpoint. + var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 6. The user-visible response carries the joined per-branch results. 
+ Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelHappyPathTest.cs new file mode 100644 index 000000000..0895f8796 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelHappyPathTest.cs @@ -0,0 +1,72 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelHappyPathTest +{ + private readonly ITestOutputHelper _output; + public ParallelHappyPathTest(ITestOutputHelper output) => _output = output; + + /// <summary> + /// End-to-end happy-path parallel: three branches run concurrently, each + /// produces a string, and the workflow returns the joined results. Validates + /// the parent CONTEXT and per-branch CONTEXT checkpoints all land in the + /// service-side history with the correct names and ordering. 
+ /// </summary> + [Fact] + public async Task Parallel_AllBranchesSucceed() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelHappyPathFunction"), + "phappy", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p1"}"""); + Assert.Equal(200, invokeResponse.StatusCode); + + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The user-visible payload contains all three branch outputs in + // declaration order (the SDK preserves index order even when branches + // race). + Assert.Contains("alpha-p1", responsePayload); + Assert.Contains("beta-p1", responsePayload); + Assert.Contains("gamma-p1", responsePayload); + + // History is eventually consistent — wait until the parent CONTEXT and + // all three child CONTEXT checkpoints are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4 + && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 4, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List<Event>(); + + // Parent + 3 branches = 4 ContextStarted, 4 ContextSucceeded. + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted)); + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextSucceeded)); + + // The three branches show up by name on their own ContextStarted events. 
+ var startedNames = events + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Name) + .ToList(); + Assert.Contains("fanout", startedNames); + Assert.Contains("alpha", startedNames); + Assert.Contains("beta", startedNames); + Assert.Contains("gamma", startedNames); + + // No branch failed. + Assert.Empty(events.Where(e => e.EventType == EventType.ContextFailed)); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs new file mode 100644 index 000000000..e228cdc22 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs @@ -0,0 +1,76 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelMaxConcurrencyTest +{ + private readonly ITestOutputHelper _output; + public ParallelMaxConcurrencyTest(ITestOutputHelper output) => _output = output; + + /// <summary> + /// 6 branches, each with a 2-second durable wait, MaxConcurrency = 2. + /// Validates the semaphore actually throttles dispatch: timestamps must + /// cluster into 3 waves of 2 (not all six firing simultaneously). Timing + /// tolerance is intentionally generous (±2s per wave gap) to avoid CI + /// flakiness; if the wave-clustering proves flaky, fall back to + /// "all 6 succeeded". 
+ /// </summary> + [Fact] + public async Task Parallel_MaxConcurrency_ThrottlesBranchDispatch() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelMaxConcurrencyFunction"), + "pmaxc", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p5"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // 3 waves x 2s waits + invocation overhead. Allow generous headroom + // for service scheduling latency. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + using var doc = JsonDocument.Parse(responsePayload); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + Assert.Equal(6, successCount); + + var timestamps = doc.RootElement.GetProperty("Timestamps") + .EnumerateArray().Select(t => t.GetInt64()).ToList(); + Assert.Equal(6, timestamps.Count); + + // Sort timestamps and check whether they cluster into 3 groups of 2. + // Wave-N timestamps should be roughly 2s apart from wave-(N-1). + // Use generous tolerance (±1500ms within a wave; >= 800ms gap between + // waves) — service-driven invocations have observable jitter. + var sorted = timestamps.OrderBy(t => t).ToList(); + var minTs = sorted[0]; + var relative = sorted.Select(t => t - minTs).ToList(); + _output.WriteLine($"Relative timestamps (ms): {string.Join(", ", relative)}"); + + // Tolerant clustering: split timestamps by 1500ms gaps. With + // MaxConcurrency=2 and 2s waits, we expect at least 2 distinct waves. 
+ // Strict 3-wave clustering can be flaky due to service jitter, so we + // assert the weaker (but still meaningful) property: not all 6 + // branches fired in the same wave. + var firstWave = relative.Where(r => r < 1500).Count(); + Assert.True(firstWave <= 3, + $"Expected MaxConcurrency=2 to limit the first wave to ~2 branches; got {firstWave} within 1500ms of start. " + + $"Relative timestamps: [{string.Join(", ", relative)}]"); + + // The full set must span at least one wave-gap (~2s) — i.e., total + // elapsed must exceed ~2s, proving branches did NOT all run at once. + var total = sorted[^1] - sorted[0]; + Assert.True(total >= 1500, + $"Expected branches to span >= 1500ms (proves throttling); got {total}ms. " + + $"Relative timestamps: [{string.Join(", ", relative)}]"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs new file mode 100644 index 000000000..28adf7549 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs @@ -0,0 +1,74 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelPartialFailureTest +{ + private readonly ITestOutputHelper _output; + public ParallelPartialFailureTest(ITestOutputHelper output) => _output = output; + + /// <summary> + /// Three branches, one throws, two succeed. With CompletionConfig.AllCompleted() + /// the parallel does NOT throw — it surfaces success/failure counts and the + /// per-branch errors. Validates per-branch error preservation through the + /// service round-trip and back into the rebuilt <c>IBatchResult</c>. 
+ /// </summary> + [Fact] + public async Task Parallel_PartialFailure_AllCompleted_ReportsCounts() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelPartialFailureFunction"), + "ppartial", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p2"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + // AllCompleted means partial failure is NOT a workflow failure — the + // user accepted the failure and returned a result. + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // Decode the workflow result payload and verify the counts surface correctly. + using var doc = JsonDocument.Parse(responsePayload); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + var failureCount = doc.RootElement.GetProperty("FailureCount").GetInt32(); + var errorSummary = doc.RootElement.GetProperty("ErrorSummary").GetString(); + + Assert.Equal(2, successCount); + Assert.Equal(1, failureCount); + Assert.NotNull(errorSummary); + // The originating exception type is captured on the rebuilt + // ChildContextException when reconstructing the batch. + Assert.Contains("intentional partial failure", errorSummary); + + // History: 1 parent + 3 branches = 4 ContextStarted; 3 ContextSucceeded + // (parent + 2 ok branches); 1 ContextFailed (the boom branch). + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4 + && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? false) + && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 
0) >= 3, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List<Event>(); + + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted)); + Assert.Equal(3, events.Count(e => e.EventType == EventType.ContextSucceeded)); + Assert.Equal(1, events.Count(e => e.EventType == EventType.ContextFailed)); + + // The failing branch's checkpoint preserves the exception message. + var failedEvent = events.SingleOrDefault(e => e.EventType == EventType.ContextFailed); + Assert.NotNull(failedEvent); + Assert.Equal("boom", failedEvent!.Name); + Assert.Contains("intentional partial failure", + failedEvent.ContextFailedDetails?.Error?.Payload?.ErrorMessage ?? string.Empty); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelReplayDeterminismTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelReplayDeterminismTest.cs new file mode 100644 index 000000000..1ad44790a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelReplayDeterminismTest.cs @@ -0,0 +1,122 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelReplayDeterminismTest +{ + private readonly ITestOutputHelper _output; + public ParallelReplayDeterminismTest(ITestOutputHelper output) => _output = output; + + /// <summary> + /// Each branch's operation ID must equal SHA-256(parentOpId + "-" + (index+1)) + /// (matching the OperationIdGenerator's CreateChild contract). Reproduced + /// locally because OperationIdGenerator is internal to the SDK. 
+ /// </summary> + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// <summary> + /// Three parallel branches, each containing a step + a durable wait + /// (the wait forces a suspend/resume cycle so the parallel actually + /// replays). Verifies: + /// 1. The branch operation IDs match the deterministic + /// SHA256("&lt;parentId&gt;-&lt;n&gt;") formula (the same one used + /// by OperationIdGenerator.CreateChild and the reference Java/JS/Python SDKs). + /// 2. Each branch's user-visible step result is preserved across replay + /// (the GUID generated inside <c>generate</c> survives suspend/resume). + /// </summary> + [Fact] + public async Task Parallel_BranchOperationIds_AreDeterministic_AcrossReplay() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelReplayDeterminismFunction"), + "preplay", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p6"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The parallel parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var expectedBranchIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + + // Wait until each branch's CONTEXT SUCCEEDED is visible AND each + // branch's step/wait events are visible (they live under the branch + // operation IDs). 
+ var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List<Event>(); + // Parent + 3 branch CONTEXTs all succeeded. + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 4) return false; + // Each branch ran one step and one wait => 3 step succeeds + 3 wait succeeds. + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List<Event>(); + + // 1. Branch operation IDs match the deterministic hash. + var branchStartedEvents = allEvents + .Where(e => e.EventType == EventType.ContextStarted && e.Id != null && e.Id != parentOpId) + .ToList(); + var observedBranchIds = branchStartedEvents.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedBranchIds.Count); + foreach (var expected in expectedBranchIds) + { + Assert.Contains(expected, observedBranchIds); + } + + // 2. Each branch's CONTEXT succeeded exactly once (the parent, named + // "fanout", is excluded from the count). + var branchSucceededEvents = allEvents + .Where(e => e.EventType == EventType.ContextSucceeded && e.Name != "fanout") + .ToList(); + Assert.Equal(3, branchSucceededEvents.Count); + + // 3. Each branch's "generate" step succeeded exactly once — proving + // replay returned the cached step result rather than re-executing. + // (Re-execution would manifest as duplicate StepSucceeded events for + // the same operation ID.) + var stepSucceededEvents = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, stepSucceededEvents.Count); + + // 4. The wait events span at least 2 invocations: one to schedule each + // wait, and at least one to resume after the timer fires. This proves + // replay actually happened. 
+ var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 5. The user-visible response contains 3 valid GUIDs separated by commas + // (proving the per-branch step result survived replay). + Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ReplayAwareLoggerTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ReplayAwareLoggerTest.cs new file mode 100644 index 000000000..8280ed6a3 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ReplayAwareLoggerTest.cs @@ -0,0 +1,197 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.CloudWatchLogs; +using Amazon.CloudWatchLogs.Model; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +/// +/// End-to-end proof of the replay-aware logger: a workflow with a Wait between +/// two steps re-invokes Lambda once. Lines emitted via +/// context.Logger.LogInformation in the workflow body and after step 1 +/// must appear ONCE in CloudWatch (suppressed on the replay invocation), +/// while parallel Console.WriteLine control lines must appear TWICE +/// (proving the function genuinely replayed). 
+///
+public class ReplayAwareLoggerTest
+{
+    private readonly ITestOutputHelper _output;
+    public ReplayAwareLoggerTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task ReplayAwareLogger_SuppressesDuplicateLogsOnReplay()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("ReplayAwareLoggerFunction"),
+            "logreplay", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "log-replay"}""");
+        Assert.Equal(200, invokeResponse.StatusCode);
+
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        // Sanity check the durable history: wait until at least two StepStarted
+        // events and a succeeded wait are visible (evidence of the suspend/resume).
+        await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 2
+                && (h.Events?.Any(e => e.WaitSucceededDetails != null) ?? false),
+            TimeSpan.FromSeconds(60));
+
+        // CloudWatch is eventually consistent — wait until ALL log lines we
+        // expect have been ingested. The stop condition demands the full
+        // expected count of every marker so the test never short-circuits with
+        // a still-arriving "after_step1" record (which is emitted at a
+        // different timestamp than workflow_start and indexes independently).
+        using var logs = new AmazonCloudWatchLogsClient(RegionEndpoint.USEast1); // NOTE(review): region is hard-coded — confirm it matches the deployment's region
+        var logGroup = $"/aws/lambda/{deployment.FunctionName}";
+
+        var allEvents = await PollForLogEvents(
+            logs, logGroup,
+            stopWhen: events =>
+                // Replay-aware: 1 each (suppressed on the second invocation).
+                CountMatching(events, "LOG_REPLAY_TEST workflow_start") >= 1 &&
+                CountMatching(events, "LOG_REPLAY_TEST inside_step1") >= 1 &&
+                CountMatching(events, "LOG_REPLAY_TEST after_step1") >= 1 &&
+                CountMatching(events, "LOG_REPLAY_TEST workflow_end") >= 1 &&
+                // Control: workflow_start and after_step1 emit on both
+                // invocations (2 each); workflow_end only on the second (1).
+                CountMatching(events, "LOG_REPLAY_CONTROL workflow_start") >= 2 &&
+                CountMatching(events, "LOG_REPLAY_CONTROL after_step1") >= 2 &&
+                CountMatching(events, "LOG_REPLAY_CONTROL workflow_end") >= 1,
+            timeout: TimeSpan.FromMinutes(2));
+
+        var messages = allEvents.Select(e => e.Message ?? string.Empty).ToList();
+        _output.WriteLine($"Collected {messages.Count} log events from {logGroup}");
+
+        // Replay-aware lines: each must appear exactly once across both invocations.
+        Assert.Equal(1, CountMatching(messages, "LOG_REPLAY_TEST workflow_start"));
+        Assert.Equal(1, CountMatching(messages, "LOG_REPLAY_TEST inside_step1"));
+        Assert.Equal(1, CountMatching(messages, "LOG_REPLAY_TEST after_step1"));
+        Assert.Equal(1, CountMatching(messages, "LOG_REPLAY_TEST workflow_end"));
+
+        // Control lines (Console.WriteLine, not replay-aware): the
+        // workflow-start and after_step1 markers run on both invocations and
+        // must appear twice; workflow_end runs only on the second invocation
+        // (after the Wait completes) so it appears once.
+        Assert.Equal(2, CountMatching(messages, "LOG_REPLAY_CONTROL workflow_start"));
+        Assert.Equal(2, CountMatching(messages, "LOG_REPLAY_CONTROL after_step1"));
+        Assert.Equal(1, CountMatching(messages, "LOG_REPLAY_CONTROL workflow_end"));
+
+        // The function runs with AWS_LAMBDA_LOG_FORMAT=JSON, so the runtime
+        // emits one JSON object per log record. The replay-aware lines were
+        // emitted under DurableFunction's execution-level BeginScope; the
+        // inside_step1 line was additionally inside StepOperation's per-step
+        // BeginScope. LambdaCoreLogger appends the scope KVPs as named
+        // placeholders, which the runtime's JSON formatter promotes to
+        // top-level fields. Verify that.
+        AssertScopeFieldsOnRecord(messages, "LOG_REPLAY_TEST workflow_start",
+            requireExecutionScope: true, requireStepScope: false);
+        AssertScopeFieldsOnRecord(messages, "LOG_REPLAY_TEST inside_step1",
+            requireExecutionScope: true, requireStepScope: true);
+    }
+
+    private void AssertScopeFieldsOnRecord(
+        List messages, string substring,
+        bool requireExecutionScope, bool requireStepScope)
+    {
+        var record = messages.FirstOrDefault(m => m.Contains(substring, StringComparison.Ordinal)); // only the first matching record is inspected
+        Assert.NotNull(record);
+
+        // CloudWatch occasionally prefixes the JSON line with text (e.g., when
+        // the runtime falls back to plain stdout); slice from the first '{'.
+        var braceIdx = record!.IndexOf('{');
+        Assert.True(braceIdx >= 0, $"No JSON object in record: {record}");
+
+        using var doc = JsonDocument.Parse(record[braceIdx..]);
+        var root = doc.RootElement;
+        _output.WriteLine($"[scope-check] {substring} → {record[braceIdx..]}");
+
+        if (requireExecutionScope)
+        {
+            Assert.True(root.TryGetProperty("durableExecutionArn", out _),
+                $"durableExecutionArn missing on record: {record}");
+            Assert.True(root.TryGetProperty("awsRequestId", out _),
+                $"awsRequestId missing on record: {record}");
+        }
+        if (requireStepScope)
+        {
+            Assert.True(root.TryGetProperty("operationId", out _),
+                $"operationId missing on record: {record}");
+            Assert.True(root.TryGetProperty("operationName", out _),
+                $"operationName missing on record: {record}");
+            Assert.True(root.TryGetProperty("attempt", out _),
+                $"attempt missing on record: {record}");
+        }
+    }
+
+    private static int CountMatching(IEnumerable events, string substring)
+        => events.Count(e => e.Message != null && e.Message.Contains(substring, StringComparison.Ordinal));
+
+    private static int CountMatching(IEnumerable messages, string substring)
+        => messages.Count(m => m.Contains(substring, StringComparison.Ordinal));
+
+    private async Task> PollForLogEvents(
+        IAmazonCloudWatchLogs logs,
+        string logGroupName,
+        Func, bool> stopWhen,
+        TimeSpan timeout)
+    {
+        var deadline = DateTime.UtcNow + timeout;
+        var attempt = 0;
+        var lastSeen = new List();
+
+        // Filter only on our marker prefix to keep payload size small.
+        const string filterPattern = "\"LOG_REPLAY_\"";
+
+        while (DateTime.UtcNow < deadline)
+        {
+            attempt++;
+            try
+            {
+                var events = new List();
+                string? nextToken = null;
+                do
+                {
+                    var resp = await logs.FilterLogEventsAsync(new FilterLogEventsRequest
+                    {
+                        LogGroupName = logGroupName,
+                        FilterPattern = filterPattern,
+                        NextToken = nextToken,
+                    });
+                    if (resp.Events != null) events.AddRange(resp.Events);
+                    nextToken = resp.NextToken;
+                } while (!string.IsNullOrEmpty(nextToken)); // drain every page before evaluating stopWhen
+
+                _output.WriteLine($"[CW poll {attempt}] events={events.Count}");
+                lastSeen = events;
+                if (stopWhen(events)) return events;
+            }
+            catch (Amazon.CloudWatchLogs.Model.ResourceNotFoundException)
+            {
+                // Log group not yet provisioned — Lambda creates it on first
+                // invocation, but it can lag behind the function being Active.
+                _output.WriteLine($"[CW poll {attempt}] log group not yet present: {logGroupName}");
+            }
+            catch (Exception ex)
+            {
+                _output.WriteLine($"[CW poll {attempt}] error (will retry): {ex.Message}");
+            }
+            await Task.Delay(TimeSpan.FromSeconds(3));
+        }
+
+        _output.WriteLine($"[CW poll] gave up after {attempt} attempts; returning last-seen ({lastSeen.Count} events)");
+        return lastSeen; // timed out: the last complete poll result (possibly empty) goes back to the caller
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ReplayDeterminismTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ReplayDeterminismTest.cs
new file mode 100644
index 000000000..053e2b299
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ReplayDeterminismTest.cs
@@ -0,0 +1,73 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class ReplayDeterminismTest
+{
+    private readonly ITestOutputHelper _output;
+    public ReplayDeterminismTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task ReplayDeterminism_SameGuidAcrossInvocations()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("ReplayDeterminismFunction"),
+            "replay", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "replay-test"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        // History is eventually consistent — wait until two StepStarted and two step-succeeded events are visible.
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 2
+                && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2,
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List();
+
+        Assert.Equal(2, events.Count(e => e.EventType == EventType.StepStarted));
+
+        // Each step succeeded exactly once — generate_id was NOT re-executed on replay
+        // (a duplicate would show up as two succeeded events for the same name).
+        var stepSucceededEvents = events.Where(e => e.StepSucceededDetails != null).ToList();
+        Assert.Equal(2, stepSucceededEvents.Count);
+        Assert.Single(stepSucceededEvents.Where(e => e.Name == "generate_id"));
+        Assert.Single(stepSucceededEvents.Where(e => e.Name == "echo_id"));
+
+        var generateEvent = stepSucceededEvents.First(e => e.Name == "generate_id");
+        var echoEvent = stepSucceededEvents.First(e => e.Name == "echo_id");
+
+        var generatedGuid = generateEvent.StepSucceededDetails.Result?.Payload?.Trim('"'); // strip leading/trailing double quotes from the recorded payload
+        var echoedResult = echoEvent.StepSucceededDetails.Result?.Payload?.Trim('"');
+        Assert.NotNull(generatedGuid);
+        Assert.NotNull(echoedResult);
+        Assert.True(Guid.TryParse(generatedGuid, out _),
+            $"generate_id should produce a valid GUID, got: {generatedGuid}");
+
+        // The echoed value matches the cached GUID — proves replay returned the
+        // checkpointed value rather than running generate_id again.
+        Assert.Equal($"echo:{generatedGuid}", echoedResult);
+
+        // A "boundary_wait" was started and at least two invocations completed (suspend + resume).
+        var waitStarted = events.FirstOrDefault(e => e.WaitStartedDetails != null && e.Name == "boundary_wait");
+        Assert.NotNull(waitStarted);
+        var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList();
+        Assert.True(
+            invocations.Count >= 2,
+            $"Expected at least 2 InvocationCompleted events (proves replay actually happened), got {invocations.Count}");
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryExhaustionTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryExhaustionTest.cs
new file mode 100644
index 000000000..982f72e98
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryExhaustionTest.cs
@@ -0,0 +1,83 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class RetryExhaustionTest
+{
+    private readonly ITestOutputHelper _output;
+    public RetryExhaustionTest(ITestOutputHelper output) => _output = output;
+
+    /// <summary>
+    /// End-to-end retry exhaustion: step always throws, maxAttempts=3.
+    /// Validates that the history holds exactly three StepStarted events and three
+    /// StepFailed events for the step, that no step succeeded, and that the workflow
+    /// terminates FAILED with the final attempt's message on the execution-level
+    /// error. NOTE(review): FAIL-vs-RETRY checkpoint types are not asserted directly.
+    /// </summary>
+    [Fact]
+    public async Task AlwaysFailsStep_ExhaustsRetries_TerminatesFailed()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("RetryExhaustionFunction"),
+            "rexhaust", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        // Failed workflows return null payload synchronously; locate the execution by name.
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        // 2s + 4s of retry delays + 3x execution overhead. Generous headroom for scheduling.
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
+        Assert.Equal("FAILED", status, ignoreCase: true);
+
+        // Execution-level error is the original exception from the final attempt.
+        var execution = await deployment.GetExecutionAsync(arn!);
+        Assert.NotNull(execution.Error);
+        Assert.Contains("attempt 3", execution.Error.ErrorMessage);
+
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 3
+                && (h.Events?.Count(e => e.StepFailedDetails != null) ?? 0) >= 3,
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List();
+
+        // Three attempts ran in total — no extra (off-by-one) and no truncation.
+        Assert.Equal(3, events.Count(e => e.EventType == EventType.StepStarted));
+
+        // Three failures recorded; no successes.
+        Assert.Equal(3, events.Count(e => e.StepFailedDetails != null && e.Name == "always_fails_step"));
+        Assert.Empty(events.Where(e => e.StepSucceededDetails != null));
+
+        // Each recorded failure carries the right per-attempt message.
+        var failures = events
+            .Where(e => e.StepFailedDetails != null && e.Name == "always_fails_step")
+            .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty)
+            .ToList();
+        Assert.Contains(failures, m => m.Contains("attempt 1"));
+        Assert.Contains(failures, m => m.Contains("attempt 2"));
+        Assert.Contains(failures, m => m.Contains("attempt 3"));
+
+        // Service honored the retry delays. No-jitter exponential backoff at 2s/4s
+        // means the gap between the first and last StepStarted is >= 6s.
+        var startedTimestamps = events
+            .Where(e => e.EventType == EventType.StepStarted && e.EventTimestamp.HasValue)
+            .OrderBy(e => e.EventTimestamp!.Value)
+            .Select(e => e.EventTimestamp!.Value)
+            .ToList();
+        var totalGap = startedTimestamps[^1] - startedTimestamps[0]; // NOTE(review): indexing throws if no StepStarted event carries a timestamp
+        _output.WriteLine($"Time between first and last attempt: {totalGap.TotalSeconds:F1}s");
+        Assert.True(totalGap >= TimeSpan.FromSeconds(6),
+            $"Service did not honor retry delays: {totalGap.TotalSeconds:F1}s gap (expected >= 6s)");
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryTest.cs
new file mode 100644
index 000000000..1dcf48249
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryTest.cs
@@ -0,0 +1,81 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class RetryTest
+{
+    private readonly ITestOutputHelper _output;
+    public RetryTest(ITestOutputHelper output) => _output = output;
+
+    ///
+    /// End-to-end retry: step throws on attempts 1 and 2, succeeds on attempt 3.
+    /// Validates that the service honors the RETRY checkpoint, schedules the
+    /// requested delay, and re-invokes the Lambda — none of which the unit
+    /// tests can prove (they fake state transitions in-memory).
+    ///
+    [Fact]
+    public async Task FlakyStep_RetriesAndSucceedsOnThirdAttempt()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("RetryFunction"),
+            "retry", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        // Initial invoke returns when the SDK suspends after the first failure.
+        // The execution continues asynchronously via service-driven re-invokes.
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        // Total expected wall time: 2s + 4s of retry delay + execution overhead.
+        // Allow generous headroom for service scheduling latency.
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 3
+                && (h.Events?.Any(e => e.StepSucceededDetails != null) ?? false),
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List();
+
+        // Three attempts ran (attempts 1, 2, 3).
+        Assert.Equal(3, events.Count(e => e.EventType == EventType.StepStarted));
+
+        // Two failed attempts were recorded for "flaky_step"; exactly one attempt succeeded.
+        Assert.Equal(2, events.Count(e => e.StepFailedDetails != null && e.Name == "flaky_step"));
+        var succeeded = events.SingleOrDefault(e => e.StepSucceededDetails != null && e.Name == "flaky_step");
+        Assert.NotNull(succeeded);
+        Assert.Equal("\"ok on attempt 3\"", succeeded!.StepSucceededDetails.Result?.Payload);
+
+        // The two recorded failure messages reflect the per-attempt exception.
+        var failures = events
+            .Where(e => e.StepFailedDetails != null && e.Name == "flaky_step")
+            .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty)
+            .ToList();
+        Assert.Contains(failures, m => m.Contains("attempt 1"));
+        Assert.Contains(failures, m => m.Contains("attempt 2"));
+
+        // Timing check: the service must have actually waited between attempts.
+        // With initialDelay=2s, backoffRate=2.0, no jitter: delays are 2s and 4s.
+        // The gap between the first and last StepStarted should be >= 6s.
+        var startedTimestamps = events
+            .Where(e => e.EventType == EventType.StepStarted && e.EventTimestamp.HasValue)
+            .OrderBy(e => e.EventTimestamp!.Value)
+            .Select(e => e.EventTimestamp!.Value)
+            .ToList();
+        var totalGap = startedTimestamps[^1] - startedTimestamps[0]; // NOTE(review): indexing throws if no StepStarted event carries a timestamp
+        _output.WriteLine($"Time between first and last attempt: {totalGap.TotalSeconds:F1}s");
+        Assert.True(totalGap >= TimeSpan.FromSeconds(6),
+            $"Service did not honor retry delays: {totalGap.TotalSeconds:F1}s gap (expected >= 6s)");
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepFailsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepFailsTest.cs
new file mode 100644
index 000000000..7e3a546fb
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepFailsTest.cs
@@ -0,0 +1,57 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class StepFailsTest
+{
+    private readonly ITestOutputHelper _output;
+    public StepFailsTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task StepFails_PropagatesAsFailedStatus()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("StepFailsFunction"),
+            "stepfail", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        // Failed workflows return null payload to the Invoke caller. Locate the execution
+        // by name and verify the service marked it FAILED.
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
+        Assert.Equal("FAILED", status, ignoreCase: true);
+
+        var execution = await deployment.GetExecutionAsync(arn!);
+        Assert.NotNull(execution.Error);
+        Assert.Contains("intentional failure", execution.Error.ErrorMessage);
+
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Any(e => e.EventType == EventType.StepStarted) ?? false)
+                && (h.Events?.Any(e => e.StepFailedDetails != null) ?? false),
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List();
+
+        Assert.Equal(1, events.Count(e => e.EventType == EventType.StepStarted));
+
+        // The failing step recorded a StepFailed event with the exception message.
+        var stepFailed = events.FirstOrDefault(e => e.StepFailedDetails != null && e.Name == "fail_step");
+        Assert.NotNull(stepFailed);
+        Assert.Contains("intentional failure", stepFailed!.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty);
+
+        // No step-succeeded event exists anywhere in the execution's history.
+        Assert.Empty(events.Where(e => e.StepSucceededDetails != null));
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepWaitStepTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepWaitStepTest.cs
new file mode 100644
index 000000000..55a34e895
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepWaitStepTest.cs
@@ -0,0 +1,64 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class StepWaitStepTest
+{
+    private readonly ITestOutputHelper _output;
+    public StepWaitStepTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task StepWaitStep_CompletesViaService()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("StepWaitStepFunction"),
+            "stepwait", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "integ-test-123"}""");
+        Assert.Equal(200, invokeResponse.StatusCode);
+
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 2
+                && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2
+                && (h.Events?.Any(e => e.WaitSucceededDetails != null) ?? false),
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List();
+
+        Assert.Equal(2, events.Count(e => e.EventType == EventType.StepStarted));
+
+        // Both steps produced the expected chained outputs. NOTE(review): the index-based checks assume history lists events in execution order.
+        var stepResults = events
+            .Where(e => e.StepSucceededDetails != null)
+            .Select(e => (Name: e.Name, Payload: e.StepSucceededDetails.Result?.Payload?.Trim('"')))
+            .ToList();
+        Assert.Equal(2, stepResults.Count);
+        Assert.Equal("validate", stepResults[0].Name);
+        Assert.Equal("validated-integ-test-123", stepResults[0].Payload);
+        Assert.Equal("process", stepResults[1].Name);
+        Assert.Equal("processed-validated-integ-test-123", stepResults[1].Payload);
+
+        // The "short_wait" was started with Duration 3 (presumably seconds — TODO confirm unit) and later succeeded.
+        var waitStarted = events.FirstOrDefault(e => e.WaitStartedDetails != null && e.Name == "short_wait");
+        Assert.NotNull(waitStarted);
+        Assert.Equal(3, waitStarted!.WaitStartedDetails.Duration);
+        var waitSucceeded = events.FirstOrDefault(e => e.WaitSucceededDetails != null && e.Name == "short_wait");
+        Assert.NotNull(waitSucceeded);
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/ApproverFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/ApproverFunction.csproj
new file mode 100644
index 000000000..92fe96678
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/ApproverFunction.csproj
@@ -0,0 +1,18 @@
+
+
+
+    net10.0
+    Exe
+    true
+    bootstrap
+    enable
+    enable
+
+
+
+
+
+
+
+
+
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Dockerfile
new file mode 100644
index 000000000..c1913d56a
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Dockerfile
@@ -0,0 +1,7 @@
+FROM public.ecr.aws/lambda/provided:al2023
+
+RUN dnf install -y libicu
+
+COPY bin/publish/ ${LAMBDA_TASK_ROOT}
+
+ENTRYPOINT ["/var/task/bootstrap"]
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Function.cs
new file mode 100644
index 000000000..4991290d4
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Function.cs
@@ -0,0 +1,54 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.IO;
+using System.Text;
+using Amazon.Lambda;
+using Amazon.Lambda.Core;
+using Amazon.Lambda.Model;
+using Amazon.Lambda.RuntimeSupport;
+using Amazon.Lambda.Serialization.SystemTextJson;
+
+namespace ApproverFunction;
+
+/// <summary>
+/// Plain Lambda that acts as the "external system" in the WaitForCallback
+/// integration test. Receives a callback ID + payload bits, builds the result
+/// JSON (OrderId is JSON-escaped before being embedded), and resolves the durable
+/// execution by calling SendDurableExecutionCallbackSuccess. Modeled after the
+/// real-world pattern where an out-of-band service signals workflow completion.
+/// </summary>
+public class Function
+{
+    private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient();
+
+    public static async Task Main(string[] args)
+    {
+        var handler = new Function();
+        var serializer = new DefaultLambdaJsonSerializer();
+        using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer);
+        using var bootstrap = new LambdaBootstrap(handlerWrapper);
+        await bootstrap.RunAsync();
+    }
+
+    public async Task Handler(ApproverInput input, ILambdaContext context)
+    {
+        if (string.IsNullOrEmpty(input.CallbackId))
+            throw new ArgumentException("CallbackId is required");
+        // Escape OrderId so quotes, backslashes or control characters cannot produce malformed JSON.
+        var resultJson = $$"""{"Status":"approved","ApprovedBy":"{{System.Text.Json.JsonEncodedText.Encode(input.OrderId ?? string.Empty)}}"}""";
+        await LambdaClient.SendDurableExecutionCallbackSuccessAsync(
+            new SendDurableExecutionCallbackSuccessRequest
+            {
+                CallbackId = input.CallbackId,
+                Result = new MemoryStream(Encoding.UTF8.GetBytes(resultJson))
+            });
+        return null;
+    }
+}
+
+public class ApproverInput
+{
+    public string? CallbackId { get; set; }
+    public string? OrderId { get; set; }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/AtMostOnceCrashFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/AtMostOnceCrashFunction.csproj
new file mode 100644
index 000000000..f8bf7fd0c
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/AtMostOnceCrashFunction.csproj
@@ -0,0 +1,18 @@
+
+
+
+    net10.0
+    Exe
+    true
+    bootstrap
+    enable
+    enable
+
+
+
+
+
+
+
+
+
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Dockerfile
new file mode 100644
index 000000000..c1913d56a
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Dockerfile
@@ -0,0 +1,7 @@
+FROM public.ecr.aws/lambda/provided:al2023
+
+RUN dnf install -y libicu
+
+COPY bin/publish/ ${LAMBDA_TASK_ROOT}
+
+ENTRYPOINT ["/var/task/bootstrap"]
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Function.cs
new file mode 100644
index 000000000..443d05b8a
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Function.cs
@@ -0,0 +1,72 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using Amazon.Lambda.Core;
+using Amazon.Lambda.DurableExecution;
+using Amazon.Lambda.RuntimeSupport;
+using Amazon.Lambda.Serialization.SystemTextJson;
+
+namespace DurableExecutionTestFunction;
+
+///
+/// Exercises the AtMostOncePerRetry crash-recovery path end-to-end.
+///
+/// On attempt 1 the step kills the Lambda process AFTER the START checkpoint
+/// has been flushed but BEFORE any SUCCEED checkpoint can be written. The
+/// service re-invokes us; replay sees STARTED with no terminal record, so the
+/// SDK routes through the retry strategy with a synthesized
+/// StepInterruptedException. Attempt 2 succeeds normally.
+///
+/// The attempt number is read from the step context (ctx.AttemptNumber), not
+/// from the input payload: the step exits the process only when it equals 1
+/// and returns "recovered on attempt N" on any other attempt.
+///
+public class Function
+{
+    public static async Task Main(string[] args)
+    {
+        var handler = new Function();
+        var serializer = new DefaultLambdaJsonSerializer();
+        using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer);
+        using var bootstrap = new LambdaBootstrap(handlerWrapper);
+        await bootstrap.RunAsync();
+    }
+
+    public Task Handler(
+        DurableExecutionInvocationInput input, ILambdaContext context)
+        => DurableFunction.WrapAsync(Workflow, input, context);
+
+    private async Task Workflow(TestEvent input, IDurableContext context)
+    {
+        var result = await context.StepAsync(
+            async (ctx) =>
+            {
+                await Task.CompletedTask;
+                if (ctx.AttemptNumber == 1)
+                {
+                    // Hard process exit AFTER the SDK has flushed the START
+                    // checkpoint (sync flush is part of the AtMostOncePerRetry
+                    // contract). The service will see a STARTED record with no
+                    // terminal counterpart on the next invocation.
+                    Environment.Exit(137);
+                }
+                return $"recovered on attempt {ctx.AttemptNumber}";
+            },
+            name: "crash_then_recover",
+            config: new StepConfig
+            {
+                Semantics = StepSemantics.AtMostOncePerRetry,
+                RetryStrategy = RetryStrategy.Exponential(
+                    maxAttempts: 3,
+                    initialDelay: TimeSpan.FromSeconds(2),
+                    maxDelay: TimeSpan.FromSeconds(5),
+                    backoffRate: 2.0,
+                    jitter: JitterStrategy.None)
+            });
+
+        return new TestResult { Status = "completed", Data = result };
+    }
+}
+
+public class TestEvent { public string? OrderId { get; set; } }
+public class TestResult { public string? Status { get; set; } public string? Data { get; set; } }
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/CallbackFailedFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/CallbackFailedFunction.csproj
new file mode 100644
index 000000000..f8bf7fd0c
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/CallbackFailedFunction.csproj
@@ -0,0 +1,18 @@
+
+
+
+    net10.0
+    Exe
+    true
+    bootstrap
+    enable
+    enable
+
+
+
+
+
+
+
+
+
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Dockerfile
new file mode 100644
index 000000000..c1913d56a
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Dockerfile
@@ -0,0 +1,7 @@
+FROM public.ecr.aws/lambda/provided:al2023
+
+RUN dnf install -y libicu
+
+COPY bin/publish/ ${LAMBDA_TASK_ROOT}
+
+ENTRYPOINT ["/var/task/bootstrap"]
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Function.cs
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Function.cs new file mode 100644 index 000000000..721302ed3 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Function.cs @@ -0,0 +1,59 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Hand the service-allocated callback ID to the paired RejecterFunction + // (Event invocation — fire-and-forget). The rejecter calls + // SendDurableExecutionCallbackFailure out-of-band, which surfaces in + // GetResultAsync as CallbackFailedException — uncaught here, so the + // workflow ends FAILED with that exception type recorded. + var externalFunctionName = System.Environment.GetEnvironmentVariable("EXTERNAL_FUNCTION_NAME") + ?? throw new InvalidOperationException("EXTERNAL_FUNCTION_NAME env var not set"); + + var cb = await context.CreateCallbackAsync(name: "approve"); + + // Wrap the hand-off in a step so replays don't re-invoke the rejecter; build the payload with JsonObject so values are JSON-escaped.
+ await context.StepAsync(async _ => + { + var payload = new System.Text.Json.Nodes.JsonObject { ["callbackId"] = cb.CallbackId, ["orderId"] = input.OrderId ?? "" }.ToJsonString(); + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = externalFunctionName, + InvocationType = InvocationType.Event, + Payload = payload + }); + }, name: "submit"); + + return await cb.GetResultAsync(); + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/CallbackTimeoutFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/CallbackTimeoutFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/CallbackTimeoutFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Function.cs new file mode 100644 index 000000000..58fe3c75e --- /dev/null +++
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Function.cs @@ -0,0 +1,40 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // The test deliberately never delivers the callback. The service should + // fire the timeout, mark the callback TIMED_OUT, and the SDK should + // surface CallbackTimeoutException to the workflow. + var cb = await context.CreateCallbackAsync( + name: "approve", + config: new CallbackConfig { Timeout = TimeSpan.FromSeconds(10) }); + var result = await cb.GetResultAsync(); + return result; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? 
ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/ChildContextFailsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/ChildContextFailsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/ChildContextFailsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Function.cs new file mode 100644 index 000000000..ae3134f24 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Function.cs @@ -0,0 +1,48 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Throw inside a child context to validate the CONTEXT FAIL path: the + // service must record a ContextFailed event with the error details and + // mark the workflow FAILED. + await context.RunInChildContextAsync( + async (childCtx) => + { + await childCtx.StepAsync( + async (_) => { await Task.CompletedTask; return $"prepared-{input.OrderId}"; }, + name: "prepare"); + + throw new InvalidOperationException("intentional child context failure for integration test"); + }, + name: "phase", + config: new ChildContextConfig { SubType = "OrderProcessing" }); + + return new TestResult { Status = "should_not_reach" }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/ChildContextFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/ChildContextFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/ChildContextFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Function.cs new file mode 100644 index 000000000..507f1df0f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Function.cs @@ -0,0 +1,54 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Run a child context that itself does step + wait + step. The child's + // return value is checkpointed at the parent level as a CONTEXT + // SUCCEED record, so on replay we'd see it returned from cache. + var phaseResult = await context.RunInChildContextAsync( + async (childCtx) => + { + var validated = await childCtx.StepAsync( + async (_) => { await Task.CompletedTask; return $"validated-{input.OrderId}"; }, + name: "validate"); + + await childCtx.WaitAsync(TimeSpan.FromSeconds(2), name: "short_wait"); + + var processed = await childCtx.StepAsync( + async (_) => { await Task.CompletedTask; return $"processed-{validated}"; }, + name: "process"); + + return processed; + }, + name: "phase", + config: new ChildContextConfig { SubType = "OrderProcessing" }); + + return new TestResult { Status = "completed", Data = phaseResult }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/ChildContextRetryFailsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/ChildContextRetryFailsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/ChildContextRetryFailsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Function.cs new file mode 100644 index 000000000..521a7fa50 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Function.cs @@ -0,0 +1,61 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // A retry-then-exhaust step inside a child context: every retry + // checkpoint should be parented under the child, and the child should + // close as ContextFailed when retries are exhausted — proving the + // child is a single retry/error boundary. + await context.RunInChildContextAsync( + async (childCtx) => + { + return await childCtx.StepAsync( + async (ctx) => + { + await Task.CompletedTask; + throw new InvalidOperationException( + $"always-fails on attempt {ctx.AttemptNumber} for {input.OrderId}"); + }, + name: "always_fails", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(10), + backoffRate: 2.0, + jitter: JitterStrategy.None) + }); + }, + name: "phase", + config: new ChildContextConfig { SubType = "OrderProcessing" }); + + return new TestResult { Status = "should_not_reach" }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/CreateCallbackHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/CreateCallbackHappyPathFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/CreateCallbackHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Function.cs new file mode 100644 index 000000000..e9712e6ea --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Function.cs @@ -0,0 +1,61 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Hand the service-allocated callback ID to the paired ApproverFunction + // (Event invocation — fire-and-forget). The approver runs in its own Lambda + // and resolves the callback out-of-band by calling + // SendDurableExecutionCallbackSuccess. This mirrors WaitForCallbackHappyPath's + // topology so the test process never has to play "external system" — the + // synchronous Invoke from the test would otherwise deadlock against the + // suspended workflow. + var externalFunctionName = System.Environment.GetEnvironmentVariable("EXTERNAL_FUNCTION_NAME") + ?? throw new InvalidOperationException("EXTERNAL_FUNCTION_NAME env var not set"); + + var cb = await context.CreateCallbackAsync(name: "approve"); + + // Wrap the hand-off in a step so replays don't re-invoke the approver; build the payload with JsonObject so values are JSON-escaped.
+ await context.StepAsync(async _ => + { + var payload = new System.Text.Json.Nodes.JsonObject { ["callbackId"] = cb.CallbackId, ["orderId"] = "integ-test" }.ToJsonString(); + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = externalFunctionName, + InvocationType = InvocationType.Event, + Payload = payload + }); + }, name: "submit"); + + return await cb.GetResultAsync(); + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Function.cs new file mode 100644 index 000000000..240565384 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Function.cs @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(int input, IDurableContext context) + { + var formatted = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"tenant-aware-{input}"; }, + name: "tenant_step"); + return formatted; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/InvokeChildTenantFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/InvokeChildTenantFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/InvokeChildTenantFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Dockerfile @@ 
-0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Function.cs new file mode 100644 index 000000000..7e96ff0c8 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Function.cs @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(int input, IDurableContext context) + { + // Throw inside a step so the workflow records a step-failed event AND + // surfaces a FAILED execution status. The parent's InvokeAsync sees a + // FAILED chained invocation and raises InvokeFailedException with the + // step's error type (System.InvalidOperationException) attached. 
+ await context.StepAsync( + async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("intentional child failure"); + }, + name: "fail_step"); + + return "unreachable"; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/InvokeFailureChildFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/InvokeFailureChildFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/InvokeFailureChildFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Function.cs new file mode 100644 index 000000000..40bfa3079 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Function.cs @@ -0,0 +1,56 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var downstreamArn = System.Environment.GetEnvironmentVariable("DOWNSTREAM_FUNCTION_ARN") + ?? throw new InvalidOperationException("DOWNSTREAM_FUNCTION_ARN env var is not set."); + + try + { + await context.InvokeAsync( + downstreamArn, + payload: 1, + name: "call_failing_child"); + + // Should not reach — the child throws and the parent surfaces + // InvokeFailedException on the resume. + return new TestResult { Status = "unexpected_success", Data = null }; + } + catch (InvokeFailedException ex) + { + // The parent catches and converts the exception into a normal result — + // the workflow itself succeeds, even though the chained invoke failed. + return new TestResult + { + Status = "completed", + Data = $"parent-saw-{ex.ErrorType ?? "unknown"}" + }; + } + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/InvokeFailureParentFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/InvokeFailureParentFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/InvokeFailureParentFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Function.cs new file mode 100644 index 000000000..898021cdd --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Function.cs @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(int input, IDurableContext context) + { + var prefixed = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"got-{input}"; }, + name: "format"); + return prefixed; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/InvokeHappyPathChildFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/InvokeHappyPathChildFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/InvokeHappyPathChildFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Function.cs new file mode 100644 index 000000000..4a2e93f8c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Function.cs @@ -0,0 +1,44 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Parent receives the downstream function ARN via env var so the test + // harness can wire arbitrary downstream functions without rebuilding + // the parent image. + var downstreamArn = System.Environment.GetEnvironmentVariable("DOWNSTREAM_FUNCTION_ARN") + ?? 
throw new InvalidOperationException("DOWNSTREAM_FUNCTION_ARN env var is not set."); + + var result = await context.InvokeAsync( + downstreamArn, + payload: 42, + name: "call_child"); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/InvokeHappyPathParentFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/InvokeHappyPathParentFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/InvokeHappyPathParentFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Function.cs new file mode 100644 index 000000000..5115101e1 --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Function.cs @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(string input, IDurableContext context) + { + var echoed = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"echoed:{input}"; }, + name: "child_echo"); + return echoed; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/InvokeReplayDeterminismChildFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/InvokeReplayDeterminismChildFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/InvokeReplayDeterminismChildFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Dockerfile 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Function.cs new file mode 100644 index 000000000..b00be9c95 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Function.cs @@ -0,0 +1,55 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var downstreamArn = System.Environment.GetEnvironmentVariable("DOWNSTREAM_FUNCTION_ARN") + ?? 
throw new InvalidOperationException("DOWNSTREAM_FUNCTION_ARN env var is not set."); + + // Step 1 generates a fresh GUID. On replay this MUST return the + // checkpointed value — proves the SDK's deterministic operation IDs + // line up with the service's view of the state. + var generatedId = await context.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "before_invoke"); + + // The chained invoke forces a suspend/resume cycle. After the resume, + // step 1 must replay (returning the cached GUID) and the invoke must + // not be re-fired (cached result is returned immediately). + var invokeResult = await context.InvokeAsync( + downstreamArn, + payload: generatedId, + name: "echo_invoke"); + + var afterInvoke = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"final:{invokeResult}"; }, + name: "after_invoke"); + + return new TestResult { Status = "completed", Data = afterInvoke }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/InvokeReplayDeterminismParentFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/InvokeReplayDeterminismParentFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/InvokeReplayDeterminismParentFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Function.cs new file mode 100644 index 000000000..a11eba6d2 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Function.cs @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var downstreamArn = System.Environment.GetEnvironmentVariable("DOWNSTREAM_FUNCTION_ARN") + ?? throw new InvalidOperationException("DOWNSTREAM_FUNCTION_ARN env var is not set."); + + var result = await context.InvokeAsync( + downstreamArn, + payload: 7, + name: "call_with_tenant", + config: new InvokeConfig { TenantId = "test-tenant" }); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/InvokeWithTenantIdFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/InvokeWithTenantIdFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/InvokeWithTenantIdFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Function.cs new file mode 100644 index 000000000..7d3c0f0e1 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Function.cs @@ -0,0 +1,60 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +/// +/// Five-failure retry chain: the step throws on attempts 1-5 and succeeds on +/// attempt 6. The result payload echoes ctx.AttemptNumber on each attempt so +/// the integration test can verify the SDK's user-facing attempt counter +/// matches the wire-format StepDetails.Attempt value across multiple +/// invocations. +/// +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var result = await context.StepAsync( + async (ctx) => + { + await Task.CompletedTask; + if (ctx.AttemptNumber < 6) + throw new InvalidOperationException($"flake on attempt {ctx.AttemptNumber}"); + return $"ok on attempt {ctx.AttemptNumber}"; + }, + name: "long_retry_step", + config: new StepConfig + { + // Short delays so the test wall time stays manageable: 1s, 2s, 3s, 4s, 5s. + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 6, + initialDelay: TimeSpan.FromSeconds(1), + maxDelay: TimeSpan.FromSeconds(5), + backoffRate: 1.5, + jitter: JitterStrategy.None) + }); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/LongRetryChainFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/LongRetryChainFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/LongRetryChainFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Function.cs new file mode 100644 index 000000000..401066c0e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Function.cs @@ -0,0 +1,43 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var step1 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"started-{input.OrderId}"; }, + name: "before_wait"); + + await context.WaitAsync(TimeSpan.FromSeconds(15), name: "long_wait"); + + var step2 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"after_wait-{step1}"; }, + name: "after_wait"); + + return new TestResult { Status = "completed", Data = step2 }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/LongerWaitFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/LongerWaitFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/LongerWaitFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs new file mode 100644 index 000000000..62712b6a4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs @@ -0,0 +1,55 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var 
handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Five items, two throw. ToleratedFailureCount = 1 means a second failure + // exceeds tolerance and the map surfaces a MapException — terminating the + // workflow FAILED. + var items = new[] { "ok1", "bad1", "ok2", "bad2", "ok3" }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await Task.CompletedTask; + if (item.StartsWith("bad")) + throw new InvalidOperationException($"{item} boom"); + return item; + }, + name: "tolerance", + config: new MapConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + }); + + // Should not reach here — the map must throw MapException. + return new TestResult { Status = "should_not_reach", SuccessCount = batch.SuccessCount }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? 
Status { get; set; } + public int SuccessCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs new file mode 100644 index 000000000..d083a054b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs @@ -0,0 +1,63 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = 
new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Four items, each waits a different (durable) duration. The shortest + // wait should win and short-circuit the map via FirstSuccessful. Wait + // durations are at least 1s (service timer granularity). The item value + // IS the wait-seconds; the result is the item's index. + var waitSeconds = new[] { 8, 1, 5, 6 }; + + var batch = await context.MapAsync( + waitSeconds, + async (ctx, seconds, index, all) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(seconds), name: $"wait_{index}"); + return index; + }, + name: "race", + config: new MapConfig { CompletionConfig = CompletionConfig.FirstSuccessful() }); + + var winner = batch.Succeeded.FirstOrDefault(); + return new TestResult + { + Status = "completed", + WinnerIndex = winner?.Index ?? -1, + WinnerName = winner?.Name, + CompletionReason = batch.CompletionReason.ToString(), + SuccessCount = batch.SuccessCount, + StartedCount = batch.StartedCount + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int WinnerIndex { get; set; } + public string? WinnerName { get; set; } + public string? 
CompletionReason { get; set; } + public int SuccessCount { get; set; } + public int StartedCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs new file mode 100644 index 000000000..9cd54aaba --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs @@ -0,0 +1,57 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + 
{ + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three items run under NestingType.Flat. Each item generates a fresh + // GUID inside a step, then does a durable wait. The wait forces a + // suspend/resume cycle, so the second invocation MUST replay the cached + // per-item result — and under Flat that result lives inline on the parent + // Map payload, not on a per-item CONTEXT checkpoint (none are emitted). + // If Flat replay is broken, the GUID would change between the original + // execution and replay, or the inner step/wait ops would reference a + // non-existent item parent. + var items = new[] { 0, 1, 2 }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the map. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + }, + name: "fanout", + config: new MapConfig { NestingType = NestingType.Flat }); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs new file mode 100644 index 000000000..14da119f8 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs @@ -0,0 +1,45 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = 
HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var orders = new[] { "order-1", "order-2", "order-3" }; + + // Each item is processed inside a step so the per-item child context + // owns a leaf operation. ItemNamer gives each item a readable branch + // name in the service-side history. + var batch = await context.MapAsync( + orders, + async (ctx, orderId, index, all) => + await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return $"{orderId}-{input.OrderId}"; }, + name: "process"), + name: "process_all", + config: new MapConfig { ItemNamer = (item, index) => $"item-{item}" }); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs new file mode 100644 index 000000000..0499a7a93 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var 
handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // 6 items, MaxConcurrency = 2. Each item does a 2-second durable wait + // then captures the post-wait wall-clock as a unix-ms timestamp. The + // expected outcome is 3 waves of 2 items; total elapsed ~6s. Use + // IDurableContext.WaitAsync (not Task.Delay) — Task.Delay is NOT durable + // and would skew this measurement under replay. + var items = new[] { 0, 1, 2, 3, 4, 5 }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: $"wait_{index}"); + return DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); + }, + name: "throttled", + config: new MapConfig + { + MaxConcurrency = 2, + CompletionConfig = CompletionConfig.AllCompleted() + }); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + Timestamps = batch.GetResults().ToArray() + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public long[]? 
Timestamps { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs new file mode 100644 index 000000000..39676c3ed --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs @@ -0,0 +1,63 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new 
DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three items, the middle one throws. Map's DEFAULT CompletionConfig is + // AllCompleted() (permissive) — unlike Parallel's AllSuccessful() — so NO + // config is supplied here and the map must still drive every item to a + // terminal state without throwing. This is the key Map-vs-Parallel + // behavioral difference, validated end-to-end. + var items = new[] { "ok1", "boom", "ok2" }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await Task.CompletedTask; + if (item == "boom") + throw new InvalidOperationException("intentional partial failure"); + return item; + }, + name: "partial"); + + var errors = batch.GetErrors(); + var errorSummary = string.Join("|", errors.Select(e => $"{e.GetType().Name}:{e.Message}")); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + FailureCount = batch.FailureCount, + ErrorSummary = errorSummary + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public int FailureCount { get; set; } + public string? 
ErrorSummary { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs new file mode 100644 index 000000000..9a75cbd5e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs @@ -0,0 +1,53 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = 
new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three items. Each item generates a fresh GUID inside a step, then does + // a durable wait. The wait forces a suspend/resume cycle, so the second + // invocation MUST replay the cached GUID rather than re-running the step. + // If replay determinism is broken, the GUID would change between the + // original execution and replay. + var items = new[] { 0, 1, 2 }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the map. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + }, + name: "fanout"); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Function.cs new file mode 100644 index 000000000..cdf5992b6 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Function.cs @@ -0,0 +1,53 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var step1 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"a-{input.OrderId}"; }, + name: "step_1"); + + var step2 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"{step1}-b"; }, + name: "step_2"); + + var step3 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"{step2}-c"; }, + name: "step_3"); + + var step4 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"{step3}-d"; }, + name: "step_4"); + + var step5 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"{step4}-e"; }, + name: "step_5"); + + return new TestResult { Status = "completed", Data = step5 }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/MultipleStepsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/MultipleStepsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/MultipleStepsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs new file mode 100644 index 000000000..9c697710d --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs @@ -0,0 +1,60 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new 
DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Five branches, two throw. ToleratedFailureCount = 1 means a second + // failure exceeds tolerance and the parallel surfaces a ParallelException. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("ok1", async (_) => { await Task.CompletedTask; return "1"; }), + new DurableBranch("bad1", async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("bad1 boom"); + }), + new DurableBranch("ok2", async (_) => { await Task.CompletedTask; return "2"; }), + new DurableBranch("bad2", async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("bad2 boom"); + }), + new DurableBranch("ok3", async (_) => { await Task.CompletedTask; return "3"; }), + }, + name: "tolerance", + config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + }); + + // Should not reach here — the parallel must throw ParallelException. + return new TestResult { Status = "should_not_reach", SuccessCount = batch.SuccessCount }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? 
Status { get; set; } + public int SuccessCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs new file mode 100644 index 000000000..2fa932dd7 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs @@ -0,0 +1,79 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public 
static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Four branches with different durable wait durations. The shortest + // wait should win and short-circuit the parallel via FirstSuccessful. + // Wait durations are at least 1s (service timer granularity). + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("slowest", async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(8), name: "wait_3"); + return 3; + }), + new DurableBranch("fastest", async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(1), name: "wait_0"); + return 0; + }), + new DurableBranch("mid1", async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(5), name: "wait_1"); + return 1; + }), + new DurableBranch("mid2", async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(6), name: "wait_2"); + return 2; + }), + }, + name: "race", + config: new ParallelConfig { CompletionConfig = CompletionConfig.FirstSuccessful() }); + + // The winner is whichever branch came back first. Surface the index + + // its name so the test can assert one branch won. + var winner = batch.Succeeded.FirstOrDefault(); + return new TestResult + { + Status = "completed", + WinnerIndex = winner?.Index ?? -1, + WinnerName = winner?.Name, + CompletionReason = batch.CompletionReason.ToString(), + SuccessCount = batch.SuccessCount, + StartedCount = batch.StartedCount + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? 
Status { get; set; } + public int WinnerIndex { get; set; } + public string? WinnerName { get; set; } + public string? CompletionReason { get; set; } + public int SuccessCount { get; set; } + public int StartedCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs new file mode 100644 index 000000000..dfbd6a345 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using 
Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three branches run under NestingType.Flat. Each branch generates a + // fresh GUID inside a step, then does a durable wait. The wait forces a + // suspend/resume cycle, so the second invocation MUST replay the cached + // per-branch result — and under Flat that result lives inline on the + // parent Parallel payload, not on a per-branch CONTEXT checkpoint (none + // are emitted). If Flat replay is broken, the GUID would change between + // the original execution and replay, or the inner step/wait ops would + // reference a non-existent branch parent. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("a", BranchAsync), + new DurableBranch("b", BranchAsync), + new DurableBranch("c", BranchAsync), + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } + + private static async Task BranchAsync(IDurableContext ctx) + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the parallel. 
+ await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs new file mode 100644 index 000000000..b6b027f9b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs @@ -0,0 +1,40 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using 
Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("alpha", async (_) => { await Task.CompletedTask; return $"alpha-{input.OrderId}"; }), + new DurableBranch("beta", async (_) => { await Task.CompletedTask; return $"beta-{input.OrderId}"; }), + new DurableBranch("gamma", async (_) => { await Task.CompletedTask; return $"gamma-{input.OrderId}"; }), + }, + name: "fanout"); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs new file mode 100644 index 000000000..72f69913a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs @@ -0,0 +1,67 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = 
new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // 6 branches, MaxConcurrency = 2. Each branch does a 2-second durable + // wait then captures the post-wait wall-clock as a unix-ms timestamp. + // The expected outcome is 3 waves of 2 branches; total elapsed ~6s. + // Use IDurableContext.WaitAsync (not Task.Delay) — Task.Delay is NOT + // durable and would skew this measurement under replay. + var branches = new DurableBranch[6]; + for (var i = 0; i < 6; i++) + { + var localIndex = i; + branches[i] = new DurableBranch( + $"b{localIndex}", + async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: $"wait_{localIndex}"); + return DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); + }); + } + + var batch = await context.ParallelAsync( + branches, + name: "throttled", + config: new ParallelConfig + { + MaxConcurrency = 2, + CompletionConfig = CompletionConfig.AllCompleted() + }); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + Timestamps = batch.GetResults().ToArray() + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public long[]? 
Timestamps { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs new file mode 100644 index 000000000..51b35f19b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler 
= new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("ok1", async (_) => { await Task.CompletedTask; return "first"; }), + new DurableBranch("boom", async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("intentional partial failure"); + }), + new DurableBranch("ok2", async (_) => { await Task.CompletedTask; return "third"; }), + }, + name: "partial", + // AllCompleted: drive every branch to terminal state regardless of failure. + // Without this, the default AllSuccessful() would throw on the first failure. + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + var errors = batch.GetErrors(); + var errorSummary = string.Join("|", errors.Select(e => $"{e.GetType().Name}:{e.Message}")); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + FailureCount = batch.FailureCount, + ErrorSummary = errorSummary + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public int FailureCount { get; set; } + public string? 
ErrorSummary { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs new file mode 100644 index 000000000..195c9b497 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs @@ -0,0 +1,57 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] 
args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three branches. Each branch generates a fresh GUID inside a step, + // then does a durable wait. The wait forces a suspend/resume cycle, + // so the second invocation MUST replay the cached GUID rather than + // re-running the step. If replay determinism is broken, the GUID + // would change between the original execution and replay. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("a", BranchAsync), + new DurableBranch("b", BranchAsync), + new DurableBranch("c", BranchAsync), + }, + name: "fanout"); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } + + private static async Task BranchAsync(IDurableContext ctx) + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the parallel. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Function.cs new file mode 100644 index 000000000..a450855a7 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Function.cs @@ -0,0 +1,55 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace RejecterFunction; + +/// +/// Plain Lambda that acts as the "external system" in the CallbackFailed +/// integration test. Receives a callback ID and resolves the durable execution +/// as failed by calling SendDurableExecutionCallbackFailure. Modeled after +/// ApproverFunction (its happy-path counterpart). +/// +public class Function +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public async Task Handler(RejecterInput input, ILambdaContext context) + { + if (string.IsNullOrEmpty(input.CallbackId)) + throw new ArgumentException("CallbackId is required"); + + await LambdaClient.SendDurableExecutionCallbackFailureAsync( + new SendDurableExecutionCallbackFailureRequest + { + CallbackId = input.CallbackId, + Error = new ErrorObject + { + ErrorType = "ApprovalRejected", + ErrorMessage = "external system rejected the request", + } + }); + return null; + } +} + +public class RejecterInput +{ + public string? CallbackId { get; set; } + public string? 
OrderId { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/RejecterFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/RejecterFunction.csproj new file mode 100644 index 000000000..92fe96678 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/RejecterFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayAwareLoggerFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayAwareLoggerFunction/Dockerfile new file mode 100644 index 000000000..92f5263e9 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayAwareLoggerFunction/Dockerfile @@ -0,0 +1,12 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +# Emit structured JSON logs so the integration test can parse log records and +# assert the durable-execution scope keys (durableExecutionArn, operationId, +# etc.) appear as top-level fields. 
+ENV AWS_LAMBDA_LOG_FORMAT=JSON + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayAwareLoggerFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayAwareLoggerFunction/Function.cs new file mode 100644 index 000000000..dbbcc24a9 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayAwareLoggerFunction/Function.cs @@ -0,0 +1,75 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; +using Microsoft.Extensions.Logging; + +namespace DurableExecutionTestFunction; + +/// +/// Workflow used by ReplayAwareLoggerTest. Pairs each replay-aware +/// context.Logger.LogInformation line with a control +/// Console.WriteLine so the test can prove the SDK suppresses replay +/// duplicates: the LogInformation lines should appear exactly once across the +/// two invocations a Wait-driven workflow produces, while the Console.WriteLine +/// control lines should appear once per invocation. +/// +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Workflow-level: emitted on invocation 1, suppressed on invocation 2 (replay). 
+ context.Logger.LogInformation("LOG_REPLAY_TEST workflow_start order={OrderId}", input.OrderId); + Console.WriteLine($"LOG_REPLAY_CONTROL workflow_start order={input.OrderId}"); + + var step1 = await context.StepAsync( + async (_) => + { + // Emitted inside the step's BeginScope, so the line carries + // both execution-level scope (durableExecutionArn, awsRequestId) + // AND step-level scope (operationId, operationName, attempt). + context.Logger.LogInformation("LOG_REPLAY_TEST inside_step1 order={OrderId}", input.OrderId); + await Task.CompletedTask; + return $"validated-{input.OrderId}"; + }, + name: "validate"); + + // Between-step log: invocation 1 emits, invocation 2 is still in Replay + // (Wait-on-SUCCEEDED replay does not flip the mode), so it must be suppressed. + context.Logger.LogInformation("LOG_REPLAY_TEST after_step1 result={Result}", step1); + Console.WriteLine($"LOG_REPLAY_CONTROL after_step1 result={step1}"); + + await context.WaitAsync(TimeSpan.FromSeconds(3), name: "short_wait"); + + // Step 2 runs fresh on invocation 2 — its EnterExecutionMode flips the + // logger from suppress to passthrough. The next LogInformation lands. + var step2 = await context.StepAsync( + async (_) => + { + await Task.CompletedTask; + return $"processed-{step1}"; + }, + name: "process"); + + context.Logger.LogInformation("LOG_REPLAY_TEST workflow_end final={Final}", step2); + Console.WriteLine($"LOG_REPLAY_CONTROL workflow_end final={step2}"); + + return new TestResult { Status = "completed", Data = step2 }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayAwareLoggerFunction/ReplayAwareLoggerFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayAwareLoggerFunction/ReplayAwareLoggerFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayAwareLoggerFunction/ReplayAwareLoggerFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Function.cs new file mode 100644 index 000000000..22f919900 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Function.cs @@ -0,0 +1,46 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Step 1 generates a fresh GUID. On replay, this MUST return the cached value. + var generatedId = await context.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate_id"); + + // Force a suspend/resume cycle to trigger replay + await context.WaitAsync(TimeSpan.FromSeconds(3), name: "boundary_wait"); + + // Step 2 echoes the GUID. After replay, it should see the SAME GUID from step 1. + var echoed = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"echo:{generatedId}"; }, + name: "echo_id"); + + return new TestResult { Status = "completed", Data = echoed }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/ReplayDeterminismFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/ReplayDeterminismFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/ReplayDeterminismFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Function.cs new file mode 100644 index 000000000..3e78ffd9d --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Function.cs @@ -0,0 +1,50 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var result = await context.StepAsync( + async (ctx) => + { + await Task.CompletedTask; + throw new InvalidOperationException($"always-fails attempt {ctx.AttemptNumber}"); + }, + name: "always_fails_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(10), + backoffRate: 2.0, + jitter: JitterStrategy.None) + }); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/RetryExhaustionFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/RetryExhaustionFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/RetryExhaustionFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Function.cs new file mode 100644 index 000000000..800dc075f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Function.cs @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var result = await context.StepAsync( + async (ctx) => + { + await Task.CompletedTask; + if (ctx.AttemptNumber < 3) + throw new InvalidOperationException($"flake on attempt {ctx.AttemptNumber}"); + return $"ok on attempt {ctx.AttemptNumber}"; + }, + name: "flaky_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(10), + backoffRate: 2.0, + jitter: JitterStrategy.None) + }); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/RetryFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/RetryFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/RetryFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Function.cs new file mode 100644 index 000000000..de0246a50 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Function.cs @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + await context.StepAsync( + async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("intentional failure for integration test"); + }, + name: "fail_step"); + + return new TestResult { Status = "should_not_reach" }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/StepFailsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/StepFailsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/StepFailsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Function.cs new file mode 100644 index 000000000..97f7edd51 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Function.cs @@ -0,0 +1,43 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var step1 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"validated-{input.OrderId}"; }, + name: "validate"); + + await context.WaitAsync(TimeSpan.FromSeconds(3), name: "short_wait"); + + var step2 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"processed-{step1}"; }, + name: "process"); + + return new TestResult { Status = "completed", Data = step2 }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/StepWaitStepFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/StepWaitStepFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/StepWaitStepFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Function.cs new file mode 100644 index 000000000..129344d25 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Function.cs @@ -0,0 +1,61 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + // Reuse a single Lambda client across submitter invocations. + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // The submitter is called once with a freshly-allocated callback ID. + // It hands that ID off to the paired ApproverFunction (Event invocation — + // fire-and-forget, modelling a real external system). The submitter + // returns immediately, the SDK suspends, and the approver eventually + // calls SendDurableExecutionCallbackSuccess to resolve the workflow + // out-of-band. + var externalFunctionName = System.Environment.GetEnvironmentVariable("EXTERNAL_FUNCTION_NAME") + ?? 
throw new InvalidOperationException("EXTERNAL_FUNCTION_NAME env var not set"); + + var result = await context.WaitForCallbackAsync( + submitter: async (callbackId, cbCtx) => + { + var payload = $$"""{"callbackId":"{{callbackId}}","orderId":"{{input.OrderId}}"}"""; + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = externalFunctionName, + InvocationType = InvocationType.Event, + Payload = payload + }); + }, + name: "approve"); + + return result; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/WaitForCallbackHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/WaitForCallbackHappyPathFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/WaitForCallbackHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git 
a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Function.cs new file mode 100644 index 000000000..19b60d567 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Function.cs @@ -0,0 +1,46 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // The submitter throws on every attempt. With RetryStrategy.None the + // SDK should fail terminally on the first attempt and surface the + // failure as CallbackSubmitterException. The workflow does not catch + // it, so the durable execution surfaces FAILED with that exception. 
+ var result = await context.WaitForCallbackAsync( + submitter: async (callbackId, cbCtx) => + { + await Task.CompletedTask; + throw new InvalidOperationException("submitter intentional failure"); + }, + name: "approve", + config: new WaitForCallbackConfig { RetryStrategy = RetryStrategy.None }); + + return result; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/WaitForCallbackSubmitterFailsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/WaitForCallbackSubmitterFailsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/WaitForCallbackSubmitterFailsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionExponentialFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionExponentialFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionExponentialFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionExponentialFunction/Function.cs 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionExponentialFunction/Function.cs new file mode 100644 index 000000000..d73161e60 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionExponentialFunction/Function.cs @@ -0,0 +1,66 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Exponential strategy with no jitter so the timing is predictable. + // Done flips on attempt 3 (1-based). With initialDelay=1s, + // backoffRate=1.5, maxDelay=4s, no jitter: delays are 1s, 1.5s + // (which the SDK ceilings to 2s due to 1s timer granularity). 
+ var finalState = await context.WaitForConditionAsync( + check: async (state, ctx) => + { + await Task.CompletedTask; + var done = ctx.AttemptNumber >= 3; + return new State(done, ctx.AttemptNumber); + }, + config: new WaitForConditionConfig + { + InitialState = new State(false, 0), + WaitStrategy = WaitStrategy.Exponential( + maxAttempts: 5, + initialDelay: TimeSpan.FromSeconds(1), + maxDelay: TimeSpan.FromSeconds(4), + backoffRate: 1.5, + jitter: JitterStrategy.None, + isDone: s => s.Done) + }, + name: "exp_poll"); + + return new TestResult + { + Status = "completed", + AttemptsTaken = finalState.AttemptNumber, + Done = finalState.Done + }; + } +} + +public record State(bool Done, int AttemptNumber); + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int AttemptsTaken { get; set; } + public bool Done { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionExponentialFunction/WaitForConditionExponentialFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionExponentialFunction/WaitForConditionExponentialFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionExponentialFunction/WaitForConditionExponentialFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionHappyPathFunction/Function.cs new file mode 100644 index 000000000..086eb6bba --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionHappyPathFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Counter increments every poll. isDone fires once it hits 3. + // Each poll iteration is a separate Lambda invocation; the state is + // carried across iterations via the RETRY checkpoint payload. 
+ var finalState = await context.WaitForConditionAsync( + check: async (state, ctx) => + { + await Task.CompletedTask; + return new State(state.Counter + 1, ctx.AttemptNumber); + }, + config: new WaitForConditionConfig + { + InitialState = new State(0, 0), + WaitStrategy = WaitStrategy.Fixed( + delay: TimeSpan.FromSeconds(2), + maxAttempts: 10, + isDone: s => s.Counter >= 3) + }, + name: "happy_poll"); + + return new TestResult + { + Status = "completed", + Counter = finalState.Counter, + AttemptsTaken = finalState.AttemptNumber + }; + } +} + +public record State(int Counter, int AttemptNumber); + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int Counter { get; set; } + public int AttemptsTaken { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionHappyPathFunction/WaitForConditionHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionHappyPathFunction/WaitForConditionHappyPathFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionHappyPathFunction/WaitForConditionHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionMaxAttemptsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionMaxAttemptsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionMaxAttemptsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY 
bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionMaxAttemptsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionMaxAttemptsFunction/Function.cs new file mode 100644 index 000000000..8f631fe86 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionMaxAttemptsFunction/Function.cs @@ -0,0 +1,62 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Condition is never satisfied (isDone is always false), so the + // strategy will eventually exhaust maxAttempts and the operation will + // throw WaitForConditionException. The workflow catches it and + // surfaces AttemptsExhausted in the result so the test can assert on + // it without inspecting the FAILED status. 
+ try + { + await context.WaitForConditionAsync( + check: async (state, _) => + { + await Task.CompletedTask; + return state + 1; + }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed( + delay: TimeSpan.FromSeconds(1), + maxAttempts: 3, + isDone: _ => false) + }, + name: "exhausting_poll"); + + return new TestResult { Status = "should_not_reach", AttemptsExhausted = -1 }; + } + catch (WaitForConditionException ex) + { + return new TestResult { Status = "exhausted", AttemptsExhausted = ex.AttemptsExhausted }; + } + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int AttemptsExhausted { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionMaxAttemptsFunction/WaitForConditionMaxAttemptsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionMaxAttemptsFunction/WaitForConditionMaxAttemptsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionMaxAttemptsFunction/WaitForConditionMaxAttemptsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionReplayDeterminismFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionReplayDeterminismFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionReplayDeterminismFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ 
${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionReplayDeterminismFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionReplayDeterminismFunction/Function.cs new file mode 100644 index 000000000..6300bb6fe --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionReplayDeterminismFunction/Function.cs @@ -0,0 +1,63 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Step 1: capture a fresh value. On replay this MUST return the + // checkpointed value rather than re-executing. + var generatedId = await context.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "before_poll"); + + // Wait-for-condition with 3 polls. Each poll iteration is a separate + // invocation, and the operation's deterministic ID + RETRY-payload + // state must round-trip across re-invocations. 
+ var pollResult = await context.WaitForConditionAsync( + check: async (state, ctx) => + { + await Task.CompletedTask; + return new Counter(state.Count + 1); + }, + config: new WaitForConditionConfig + { + InitialState = new Counter(0), + WaitStrategy = WaitStrategy.Fixed( + delay: TimeSpan.FromSeconds(2), + maxAttempts: 10, + isDone: c => c.Count >= 3) + }, + name: "determinism_poll"); + + // Step 2: echo the generated ID. After replay, this should see the + // SAME GUID from step 1 — proves replay returned the cached value. + var echoed = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"echo:{generatedId}:{pollResult.Count}"; }, + name: "after_poll"); + + return new TestResult { Status = "completed", Data = echoed }; + } +} + +public record Counter(int Count); + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionReplayDeterminismFunction/WaitForConditionReplayDeterminismFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionReplayDeterminismFunction/WaitForConditionReplayDeterminismFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionReplayDeterminismFunction/WaitForConditionReplayDeterminismFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionUserCheckThrowsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionUserCheckThrowsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionUserCheckThrowsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionUserCheckThrowsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionUserCheckThrowsFunction/Function.cs new file mode 100644 index 000000000..404114dc4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionUserCheckThrowsFunction/Function.cs @@ -0,0 +1,66 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // The check function throws on attempt 2. Per the WaitForCondition + // contract, the check-thrown exception is checkpointed as FAIL and + // surfaced through the SDK as a StepException carrying the original + // exception type ("System.InvalidOperationException"). The workflow + // catches it and reports the captured ErrorType so the test can assert + // without requiring the workflow to FAIL outright. 
+ try + { + await context.WaitForConditionAsync( + check: async (state, ctx) => + { + await Task.CompletedTask; + if (ctx.AttemptNumber == 2) + throw new InvalidOperationException("intentional check failure on attempt 2"); + return state + 1; + }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed( + delay: TimeSpan.FromSeconds(1), + maxAttempts: 10, + isDone: _ => false) + }, + name: "throwing_poll"); + + return new TestResult { Status = "should_not_reach", ErrorType = null }; + } + catch (StepException ex) + { + return new TestResult { Status = "caught_step_exception", ErrorType = ex.ErrorType, ErrorMessage = ex.Message }; + } + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public string? ErrorType { get; set; } + public string? ErrorMessage { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionUserCheckThrowsFunction/WaitForConditionUserCheckThrowsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionUserCheckThrowsFunction/WaitForConditionUserCheckThrowsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForConditionUserCheckThrowsFunction/WaitForConditionUserCheckThrowsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM 
public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Function.cs new file mode 100644 index 000000000..8bfd7b7cd --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Function.cs @@ -0,0 +1,34 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + await context.WaitAsync(TimeSpan.FromSeconds(5), name: "only_wait"); + return new TestResult { Status = "completed", Data = "wait_only" }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/WaitOnlyFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/WaitOnlyFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/WaitOnlyFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackHappyPathTest.cs new file mode 100644 index 000000000..3d6ad3d86 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackHappyPathTest.cs @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitForCallbackHappyPathTest +{ + private readonly ITestOutputHelper _output; + public WaitForCallbackHappyPathTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end happy path for WaitForCallbackAsync using a real + /// two-Lambda flow: the workflow's submitter Event-invokes a paired + /// ApproverFunction, which calls SendDurableExecutionCallbackSuccess + /// out-of-band. The workflow suspends after the submitter step completes, + /// the service re-invokes the workflow once the approver resolves the + /// callback, and WaitForCallbackAsync returns the deserialized result. 
+ /// + [Fact] + public async Task WaitForCallback_SubmitterDeliversResult_WorkflowCompletes() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitForCallbackHappyPathFunction"), + "wfcb-happy", _output, + externalFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("ApproverFunction")); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "approver-1"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Initial response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The execution returns the payload the submitter delivered. + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Result); + Assert.Contains("approved", execution.Result); + Assert.Contains("approver-1", execution.Result); + + // History records the canonical WaitForCallback lifecycle: + // submitter step Started + Succeeded, callback Started + Succeeded, + // and a containing context (CONTEXT operation) wrapping the pair. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.CallbackStarted) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.CallbackSucceeded) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.StepSucceeded) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + + Assert.Single(events.Where(e => e.EventType == EventType.CallbackStarted)); + Assert.Single(events.Where(e => e.EventType == EventType.CallbackSucceeded)); + + // The submitter ran exactly once and succeeded — the SDK's "callback + // already resolved" branch must NOT have re-run it on replay. Filter + // on a name that the SDK uses for the submitter step (typically + // matches the WaitForCallback name). + var submitterSteps = events + .Where(e => e.EventType == EventType.StepSucceeded + || e.EventType == EventType.StepStarted) + .ToList(); + Assert.NotEmpty(submitterSteps); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackSubmitterFailsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackSubmitterFailsTest.cs new file mode 100644 index 000000000..e172a4ab0 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackSubmitterFailsTest.cs @@ -0,0 +1,69 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitForCallbackSubmitterFailsTest +{ + private readonly ITestOutputHelper _output; + public WaitForCallbackSubmitterFailsTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end submitter-failure path for WaitForCallbackAsync: + /// the submitter throws on attempt 1 with ; + /// the SDK fails the composite operation terminally and surfaces + /// . The workflow surfaces FAILED. 
+ /// + [Fact] + public async Task WaitForCallback_SubmitterThrows_SurfacesAsCallbackSubmitterException() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitForCallbackSubmitterFailsFunction"), + "wfcb-fail", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Initial response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("FAILED", status, ignoreCase: true); + + // The workflow surfaces CallbackSubmitterException — the SDK's wrapper + // type around the failed submitter step. Verify the recorded ErrorType + // here; the original "submitter intentional failure" message is checked + // on the failed step event further down. + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + Assert.Equal(typeof(CallbackSubmitterException).FullName, execution.Error.ErrorType); + // ErrorObject.FromException records the outer exception's Message; that + // message should reference the submitter failure context. Be lenient + // about exact wording since the SDK may prepend / wrap the inner. + Assert.False(string.IsNullOrEmpty(execution.Error.ErrorMessage)); + + // History records the submitter step failed exactly once — RetryStrategy.None + // means no retries. NOTE(review): whether a callback was started before the + // submitter failed is not asserted here — confirm against the SDK's behavior. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => h.Events?.Any(e => e.StepFailedDetails != null) ?? false, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + + // Assert.Single enforces "exactly one failure" and returns that event. + var stepFailure = Assert.Single(events, e => e.StepFailedDetails != null); + var failureMessage = stepFailure.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty; + Assert.Contains("submitter intentional failure", failureMessage); + + // No SUCCEEDED step events — the submitter never succeeded. + Assert.DoesNotContain(events, e => e.StepSucceededDetails != null); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionExponentialTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionExponentialTest.cs new file mode 100644 index 000000000..f5493fe2f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionExponentialTest.cs @@ -0,0 +1,70 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitForConditionExponentialTest +{ + private readonly ITestOutputHelper _output; + public WaitForConditionExponentialTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end exponential-backoff polling. The check function flips + /// Done on attempt 3, so the strategy stops after exactly 3 + /// iterations. Validates that the service honors the per-iteration delay + /// (which grows with each retry) without any in-process Thread.Sleep. + /// Timing is not asserted because the service's scheduling latency + /// dominates short delays — the test only checks the final attempt number + /// and that at least three invocations were recorded. 
+ /// + [Fact] + public async Task WaitForCondition_ExponentialBackoff_CompletesOnExpectedAttempt() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitForConditionExponentialFunction"), + "wfcexp", _output); + + var (invocation, executionName) = await deployment.InvokeAsync("""{"orderId": "wfc-exp"}"""); + var rawResponse = Encoding.UTF8.GetString(invocation.Payload.ToArray()); + _output.WriteLine($"Response: {rawResponse}"); + + var executionArn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(executionArn); + + // Timers alone account for roughly 3s (1s, then 2s); the 120s budget + // leaves ample room for execution overhead and scheduling latency. + var finalStatus = await deployment.PollForCompletionAsync(executionArn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", finalStatus, ignoreCase: true); + + var executionHistory = await deployment.WaitForHistoryAsync( + executionArn!, + snapshot => (snapshot.Events?.Any(evt => evt.StepSucceededDetails != null && evt.Name == "exp_poll") ?? false), + TimeSpan.FromSeconds(60)); + var historyEvents = executionHistory.Events ?? new List(); + + // Every poll iteration is recorded as a StepSucceeded event (one for + // each RETRY and one for the terminal SUCCEED); the final entry holds + // the terminal state. + var pollSuccesses = historyEvents.Where(evt => evt.StepSucceededDetails != null && evt.Name == "exp_poll").ToList(); + Assert.NotEmpty(pollSuccesses); + var terminalEvent = pollSuccesses.Last(); + + var terminalPayload = terminalEvent.StepSucceededDetails.Result?.Payload; + Assert.False(string.IsNullOrEmpty(terminalPayload)); + + using var terminalJson = JsonDocument.Parse(terminalPayload!); + Assert.True(terminalJson.RootElement.GetProperty("Done").GetBoolean()); + Assert.Equal(3, terminalJson.RootElement.GetProperty("AttemptNumber").GetInt32()); + + // Polling drove genuine suspend/resume cycles, so there must be at + // least 3 invocations (one per attempt). 
+ var completedInvocations = historyEvents.Count(evt => evt.InvocationCompletedDetails != null); + Assert.True( + completedInvocations >= 3, + $"Expected at least 3 InvocationCompleted events (one per poll), got {completedInvocations}"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionHappyPathTest.cs new file mode 100644 index 000000000..d26f7627a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionHappyPathTest.cs @@ -0,0 +1,74 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitForConditionHappyPathTest +{ + private readonly ITestOutputHelper _output; + public WaitForConditionHappyPathTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end happy-path polling. The check function increments a counter + /// every iteration; the strategy's isDone predicate fires once the counter + /// hits 3. Validates that the service honors the RETRY-with-delay pattern, + /// re-invokes the Lambda for each poll iteration, and that state survives + /// across re-invocations via the RETRY payload — none of which the unit + /// tests can prove (they fake state transitions in-memory). 
+ /// + [Fact] + public async Task WaitForCondition_PollsUntilConditionMet() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitForConditionHappyPathFunction"), + "wfchappy", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "wfc-happy"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // Total expected wall time: 3 attempts with ~2s delay between them = + // ~4s of timer + execution overhead. Allow generous headroom. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.StepSucceededDetails != null) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // Exactly one START emitted on the first iteration (subsequent + // iterations resume from a RETRY checkpoint and skip START). + Assert.Single(events, e => e.EventType == EventType.StepStarted && e.Name == "happy_poll"); + + // Each polling iteration surfaces as a StepSucceeded event (one per + // RETRY plus one for the terminal SUCCEED). The last one carries the + // terminal state. 
+ var succeededEvents = events.Where(e => e.StepSucceededDetails != null && e.Name == "happy_poll").ToList(); + Assert.NotEmpty(succeededEvents); + var succeeded = succeededEvents.Last(); + + var finalPayload = succeeded.StepSucceededDetails.Result?.Payload; + Assert.False(string.IsNullOrEmpty(finalPayload), + "final SUCCEED payload should carry the terminal state"); + + using var doc = JsonDocument.Parse(finalPayload!); + Assert.Equal(3, doc.RootElement.GetProperty("Counter").GetInt32()); + Assert.Equal(3, doc.RootElement.GetProperty("AttemptNumber").GetInt32()); + + // The polling actually caused suspend/resume cycles — at least one + // invocation per iteration (3 polls = 3+ invocations). + var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 3, + $"Expected at least 3 InvocationCompleted events (one per poll iteration), got {invocations.Count}"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionMaxAttemptsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionMaxAttemptsTest.cs new file mode 100644 index 000000000..3e65a48a1 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionMaxAttemptsTest.cs @@ -0,0 +1,68 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitForConditionMaxAttemptsTest +{ + private readonly ITestOutputHelper _output; + public WaitForConditionMaxAttemptsTest(ITestOutputHelper output) => _output = output; + + /// + /// Validates that when the strategy's max-attempts limit is reached + /// without isDone being satisfied, the operation throws + /// WaitForConditionException with the correct + /// AttemptsExhausted count, and the FAILED checkpoint records the + /// exception type. 
The workflow catches the exception and returns the + /// count, so the workflow itself is expected to SUCCEED. + /// + [Fact] + public async Task WaitForCondition_MaxAttemptsExhausted_ThrowsWithCount() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitForConditionMaxAttemptsFunction"), + "wfcmax", _output); + + var (invocation, executionName) = await deployment.InvokeAsync("""{"orderId": "wfc-max"}"""); + var rawResponse = Encoding.UTF8.GetString(invocation.Payload.ToArray()); + _output.WriteLine($"Response: {rawResponse}"); + + var executionArn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(executionArn); + + // Three attempts separated by ~1s delays give ~2s of timer time on top + // of execution overhead; the 120s budget is deliberately generous. + var finalStatus = await deployment.PollForCompletionAsync(executionArn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", finalStatus, ignoreCase: true); + + // The workflow handled the WaitForConditionException itself and + // returned a result carrying AttemptsExhausted. Check that payload as + // reported by the GetExecution response. + var executionInfo = await deployment.GetExecutionAsync(executionArn!); + var workflowResult = executionInfo.Result; + Assert.False(string.IsNullOrEmpty(workflowResult), + "workflow result payload should be present"); + + using var resultJson = JsonDocument.Parse(workflowResult!); + Assert.Equal("exhausted", resultJson.RootElement.GetProperty("Status").GetString()); + // The attempts count equals the strategy's maxAttempts, i.e. 3. + Assert.Equal(3, resultJson.RootElement.GetProperty("AttemptsExhausted").GetInt32()); + + // Although the workflow recovers, the operation itself must have been + // checkpointed as FAILED with the WaitForConditionException type. + var executionHistory = await deployment.WaitForHistoryAsync( + executionArn!, + snapshot => (snapshot.Events?.Any(evt => evt.StepFailedDetails != null && evt.Name == "exhausting_poll") ?? 
false), + TimeSpan.FromSeconds(60)); + var historyEvents = executionHistory.Events ?? new List(); + + var failedPoll = historyEvents.FirstOrDefault(evt => evt.StepFailedDetails != null && evt.Name == "exhausting_poll"); + Assert.NotNull(failedPoll); + var failedErrorType = failedPoll!.StepFailedDetails.Error?.Payload?.ErrorType ?? string.Empty; + Assert.Contains("WaitForConditionException", failedErrorType); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionReplayDeterminismTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionReplayDeterminismTest.cs new file mode 100644 index 000000000..809a2e6bc --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionReplayDeterminismTest.cs @@ -0,0 +1,90 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitForConditionReplayDeterminismTest +{ + private readonly ITestOutputHelper _output; + public WaitForConditionReplayDeterminismTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end replay-determinism check for a step + wait-for-condition + + /// step workflow. The wait-for-condition triggers multiple suspend/resume + /// cycles (one per polling iteration), so the surrounding steps are + /// replayed multiple times. Verifies that: + /// 1. The leading step is replayed (not re-executed) across all + /// iterations — its checkpointed GUID flows through to the trailing + /// step regardless of how many polling iterations happen. + /// 2. The wait-for-condition operation is checkpointed exactly once + /// (one StepStarted), with one terminal SUCCEED carrying the final + /// counter state. + /// 3. Multiple invocations were recorded (proves real replay happened). 
+ /// + [Fact] + public async Task WaitForCondition_ReplayPreservesIdentityAndState() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitForConditionReplayDeterminismFunction"), + "wfcrep", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "wfc-replay"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // 3 polls with ~2s delay = ~4s of timer + 2 step invocations. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // History is eventually consistent; poll iterations alone can reach 3 successes, so also require after_poll. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 3 + && (h.Events?.Any(e => e.StepSucceededDetails != null && e.Name == "after_poll") ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // Each named step / polling op started exactly once. The leading and + // trailing steps each have one StepStarted; the polling op also has + // one (sub-iterations replay from RETRY/READY/PENDING and skip START). + Assert.Single(events, e => e.EventType == EventType.StepStarted && e.Name == "before_poll"); + Assert.Single(events, e => e.EventType == EventType.StepStarted && e.Name == "after_poll"); + Assert.Single(events, e => e.EventType == EventType.StepStarted && e.Name == "determinism_poll"); + + // Plain steps SUCCEED exactly once (replay returns cached values). + // The polling op surfaces one StepSucceeded per iteration (RETRYs + + // terminal SUCCEED), so we just require >= 1 there. 
+ var stepSucceededEvents = events.Where(e => e.StepSucceededDetails != null).ToList(); + Assert.Single(stepSucceededEvents, e => e.Name == "before_poll"); + Assert.Single(stepSucceededEvents, e => e.Name == "after_poll"); + Assert.Contains(stepSucceededEvents, e => e.Name == "determinism_poll"); + + // Verify the trailing step received the GUID from the leading step + // verbatim, AND the final counter — proves the cached step value and + // the WaitForCondition's terminal payload both round-tripped through + // replay. + var beforeEvent = stepSucceededEvents.First(e => e.Name == "before_poll"); + var afterEvent = stepSucceededEvents.First(e => e.Name == "after_poll"); + var generatedGuid = beforeEvent.StepSucceededDetails.Result?.Payload?.Trim('"'); + var echoedResult = afterEvent.StepSucceededDetails.Result?.Payload?.Trim('"'); + Assert.NotNull(generatedGuid); + Assert.NotNull(echoedResult); + Assert.True(Guid.TryParse(generatedGuid, out _), + $"before_poll should produce a valid GUID, got: {generatedGuid}"); + Assert.Equal($"echo:{generatedGuid}:3", echoedResult); + + // The wait-for-condition truly drove suspend/resume — one invocation + // per poll iteration plus one for the final continuation. With 3 + // polls we expect at least 3 InvocationCompleted events. 
+ var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 3, + $"Expected at least 3 InvocationCompleted events (one per poll iteration), got {invocations.Count}"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionUserCheckThrowsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionUserCheckThrowsTest.cs new file mode 100644 index 000000000..7da2ba87f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForConditionUserCheckThrowsTest.cs @@ -0,0 +1,71 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitForConditionUserCheckThrowsTest +{ + private readonly ITestOutputHelper _output; + public WaitForConditionUserCheckThrowsTest(ITestOutputHelper output) => _output = output; + + /// + /// Validates the user-check-throws path: when the check function throws + /// on a polling iteration, the operation checkpoints FAIL with the + /// original exception type and the SDK surfaces a + /// StepException carrying that ErrorType. Mirrors the unit test + /// WaitForConditionOperationTests.CheckThrows_CheckpointsFailAndThrows. 
+ /// + [Fact] + public async Task WaitForCondition_UserCheckThrows_SurfacesAsStepException() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitForConditionUserCheckThrowsFunction"), + "wfcthrow", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "wfc-throw"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // Attempt 1 succeeds (returns state+1=1), strategy schedules ~1s + // delay, then attempt 2 throws. ~1s of timer + execution overhead. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The workflow caught the StepException. Verify it captured the + // expected error type via the workflow's returned payload. + var execution = await deployment.GetExecutionAsync(arn!); + var resultPayload = execution.Result; + Assert.False(string.IsNullOrEmpty(resultPayload), + "workflow result payload should be present"); + + using var doc = JsonDocument.Parse(resultPayload!); + Assert.Equal("caught_step_exception", doc.RootElement.GetProperty("Status").GetString()); + Assert.Equal("System.InvalidOperationException", + doc.RootElement.GetProperty("ErrorType").GetString()); + Assert.Contains("intentional check failure", + doc.RootElement.GetProperty("ErrorMessage").GetString() ?? string.Empty); + + // Verify the polling op itself was checkpointed as FAILED with the + // original exception type (NOT WaitForConditionException — that's + // reserved for max-attempts exhaustion). + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.StepFailedDetails != null && e.Name == "throwing_poll") ?? 
false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + var stepFailed = events.FirstOrDefault(e => e.StepFailedDetails != null && e.Name == "throwing_poll"); + Assert.NotNull(stepFailed); + Assert.Equal("System.InvalidOperationException", + stepFailed!.StepFailedDetails.Error?.Payload?.ErrorType); + Assert.Contains("intentional check failure", + stepFailed.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitOnlyTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitOnlyTest.cs new file mode 100644 index 000000000..a8ab9b22b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitOnlyTest.cs @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitOnlyTest +{ + private readonly ITestOutputHelper _output; + public WaitOnlyTest(ITestOutputHelper output) => _output = output; + + [Fact] + public async Task WaitOnly_NoSteps() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitOnlyFunction"), + "waitonly", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "wait-only"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + var history = await 
deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.WaitSucceededDetails != null) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // The wait was checkpointed with the configured duration and later succeeded. + var waitStarted = events.FirstOrDefault(e => e.WaitStartedDetails != null && e.Name == "only_wait"); + Assert.NotNull(waitStarted); + Assert.Equal(5, waitStarted!.WaitStartedDetails.Duration); + + var waitSucceeded = events.FirstOrDefault(e => e.WaitSucceededDetails != null && e.Name == "only_wait"); + Assert.NotNull(waitSucceeded); + + // No step events: this workflow body contains only a wait. + Assert.DoesNotContain(events, e => e.StepStartedDetails != null); + + // The wait genuinely caused a suspend/resume, not an in-process delay: + // expect at least 2 invocations recorded (initial + resume after timer fires). + var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected at least 2 InvocationCompleted events (initial + post-wait resume), got {invocations.Count}"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json new file mode 100644 index 000000000..b6de9b357 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json @@ -0,0 +1,6 @@ +{ + "$schema": "https://xunit.net/schema/current/xunit.runner.schema.json", + "parallelizeTestCollections": false, + "parallelizeAssembly": false, + "maxParallelThreads": 1 +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj new file mode 100644 index 000000000..6f9abfe62 --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj @@ -0,0 +1,35 @@ + + + + + + $(DefaultPackageTargets) + Amazon.Lambda.DurableExecution.Tests + Amazon.Lambda.DurableExecution.Tests + true + ..\..\..\buildtools\public.snk + true + enable + enable + $(NoWarn);CS1591 + true + + + + + + + + + + + + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs new file mode 100644 index 000000000..c70dc75fb --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs @@ -0,0 +1,486 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class CallbackOperationTests +{ + /// Reproduces the Id that OperationIdGenerator emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + private static TestLambdaContext CreateLambdaContext() => +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. + new() { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? 
initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + /// + /// Wires a recorder so that the next CALLBACK START flush stamps the given + /// callback ID into the supplied ExecutionState — modeling the durable-execution + /// service's NewExecutionState response that allocates the ID. + /// + private static void WireServiceCallbackIdAllocation( + RecordingBatcher recorder, ExecutionState state, string callbackId) + { + recorder.OnFlush = ops => + { + foreach (var op in ops) + { + if (op.Type == OperationTypes.Callback && op.Action == "START") + { + state.AddOperations(new[] + { + new Operation + { + Id = op.Id, + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = op.Name, + CallbackDetails = new CallbackDetails { CallbackId = callbackId } + } + }); + } + } + }; + } + + [Fact] + public async Task CreateCallbackAsync_FreshExecution_FlushesStartAndReturnsCallbackId() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-abc-123"); + + var callback = await context.CreateCallbackAsync(name: "approval"); + + Assert.Equal("cb-abc-123", callback.CallbackId); + Assert.False(tm.IsTerminated); + + await recorder.Batcher.DrainAsync(); + + // CreateCallbackAsync sync-flushes a single START checkpoint. 
+ var single = Assert.Single(recorder.Flushed); + Assert.Equal(OperationTypes.Callback, single.Type); + Assert.Equal("START", single.Action); + Assert.Equal(OperationSubTypes.Callback, single.SubType); + Assert.Equal("approval", single.Name); + Assert.Equal(IdAt(1), single.Id); + } + + [Fact] + public async Task CreateCallbackAsync_FreshExecution_NoConfig_DoesNotEmitCallbackOptions() + { + var (context, recorder, _, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + await context.CreateCallbackAsync(name: "no_options"); + + await recorder.Batcher.DrainAsync(); + + var single = Assert.Single(recorder.Flushed); + Assert.Null(single.CallbackOptions); + } + + [Fact] + public async Task CreateCallbackAsync_FreshExecution_WithConfig_EmitsCallbackOptions() + { + var (context, recorder, _, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + await context.CreateCallbackAsync( + name: "with_options", + config: new CallbackConfig + { + Timeout = TimeSpan.FromHours(1), + HeartbeatTimeout = TimeSpan.FromMinutes(5) + }); + + await recorder.Batcher.DrainAsync(); + + var single = Assert.Single(recorder.Flushed); + Assert.NotNull(single.CallbackOptions); + Assert.Equal(3600, single.CallbackOptions.TimeoutSeconds); + Assert.Equal(300, single.CallbackOptions.HeartbeatTimeoutSeconds); + } + + [Fact] + public async Task CreateCallbackAsync_FreshExecution_OnlyTimeout_EmitsOnlyTimeout() + { + var (context, recorder, _, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + await context.CreateCallbackAsync( + config: new CallbackConfig { Timeout = TimeSpan.FromSeconds(45) }); + + await recorder.Batcher.DrainAsync(); + + var single = Assert.Single(recorder.Flushed); + Assert.NotNull(single.CallbackOptions); + Assert.Equal(45, single.CallbackOptions.TimeoutSeconds); + // HeartbeatTimeout was not set → property remains at its default + // (the AWS SDK Marshaller will not serialize 
the field). + Assert.True( + single.CallbackOptions.HeartbeatTimeoutSeconds == null + || single.CallbackOptions.HeartbeatTimeoutSeconds == 0); + } + + [Fact] + public async Task CreateCallbackAsync_ServiceMissingCallbackId_ThrowsNonDeterministic() + { + // Service doesn't stamp a CallbackId — RecordingBatcher's OnFlush left unset. + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "broken")); + Assert.Contains("CallbackId", ex.Message); + } + + [Fact] + public async Task GetResultAsync_FreshExecution_SuspendsExecution() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var callback = await context.CreateCallbackAsync(name: "approval"); + + // GetResultAsync should signal termination and return a never-completing task. + var resultTask = callback.GetResultAsync(); + await Task.Delay(10); + + Assert.True(tm.IsTerminated); + Assert.False(resultTask.IsCompleted); + } + + [Fact] + public async Task ReplayStarted_DoesNotReFlushStart_AndSuspendsOnGetResult() + { + // STARTED on replay = service has stamped CallbackId but no terminal yet. + var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = "cb-replay-1" } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + Assert.Equal("cb-replay-1", callback.CallbackId); + Assert.False(tm.IsTerminated); + + var resultTask = callback.GetResultAsync(); + await Task.Delay(10); + + Assert.True(tm.IsTerminated); + Assert.False(resultTask.IsCompleted); + + // No new checkpoints — replay path doesn't re-flush START. 
+ await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ReplaySucceeded_GetResultDeserializes_NoSuspension() + { + var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Succeeded, + Name = "approval", + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-done-1", + Result = "\"approved\"" + } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + var result = await callback.GetResultAsync(); + + Assert.Equal("cb-done-1", callback.CallbackId); + Assert.Equal("approved", result); + Assert.False(tm.IsTerminated); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ReplaySucceeded_NullResultReturnsDefault() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Succeeded, + Name = "no_payload", + CallbackDetails = new CallbackDetails { CallbackId = "cb-1" } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "no_payload"); + var result = await callback.GetResultAsync(); + Assert.Null(result); + } + + [Fact] + public async Task ReplayFailed_GetResultThrowsCallbackFailedException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Failed, + Name = "approval", + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-fail-1", + Error = new ErrorObject + { + ErrorType = "ExternalSystemError", + ErrorMessage = "rejected by reviewer", + ErrorData = "{\"reviewer\":\"jane\"}" + } + } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + + var ex = 
await Assert.ThrowsAsync(() => callback.GetResultAsync()); + Assert.IsAssignableFrom(ex); + Assert.Equal("rejected by reviewer", ex.Message); + Assert.Equal("cb-fail-1", ex.CallbackId); + Assert.Equal("ExternalSystemError", ex.ErrorType); + Assert.Equal("{\"reviewer\":\"jane\"}", ex.ErrorData); + } + + [Fact] + public async Task ReplayTimedOut_GetResultThrowsCallbackTimeoutException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.TimedOut, + Name = "approval", + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-to-1", + Error = new ErrorObject + { + ErrorMessage = "callback timed out after 24h" + } + } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + + var ex = await Assert.ThrowsAsync(() => callback.GetResultAsync()); + Assert.IsAssignableFrom(ex); + Assert.Equal("callback timed out after 24h", ex.Message); + Assert.Equal("cb-to-1", ex.CallbackId); + } + + [Fact] + public async Task ReplayTimedOut_NoErrorDetails_DefaultMessage() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.TimedOut, + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = "cb-1" } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + var ex = await Assert.ThrowsAsync(() => callback.GetResultAsync()); + Assert.Equal("Callback timed out", ex.Message); + } + + [Fact] + public async Task ReplayUnknownStatus_ThrowsNonDeterministic() + { + // Replay must throw on unexpected statuses (CANCELLED, garbage, etc.) + // rather than silently degrading to a suspend. Mirrors WaitOperation + // and ChildContextOperation's `default:` arms. 
+ var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = "CANCELLED", + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = "cb-1" } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "approval")); + Assert.Contains("unexpected status", ex.Message); + Assert.Contains("CANCELLED", ex.Message); + } + + [Fact] + public async Task ReplayMissingCallbackId_ThrowsNonDeterministic() + { + // Replay path expects the CallbackId to be present. If it's absent, surface + // a clear non-deterministic error rather than letting users see a NRE later. + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = "broken", + CallbackDetails = new CallbackDetails { CallbackId = null } + } + } + }); + + await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "broken")); + } + + [Fact] + public async Task ReplayDeterministic_CallbackIdStableAcrossReplays() + { + // Round-trip: STARTED checkpoint with CallbackId X must yield the same X + // on replay so external systems' references remain valid. + const string id = "stable-cb-id-12345"; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = id } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + Assert.Equal(id, callback.CallbackId); + } + + [Fact] + public async Task ReplayTypeMismatch_ThrowsNonDeterministic() + { + // What was a CALLBACK on a previous invocation is now arriving as something + // else — code drift detection. 
ExecutionState.ValidateReplayConsistency + // is the gate. + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "approval", + StepDetails = new StepDetails { Result = "\"ok\"" } + } + } + }); + + await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "approval")); + } + + [Fact] + public async Task CreateCallbackAsync_CallbackIdAccessBeforeStart_Throws() + { + // Direct construction of the CallbackOperation without going through + // ExecuteAsync — guard against bugs that try to read CallbackId early. + var op = new CallbackOperation( + "op-id", "name", parentId: null, null, new DefaultLambdaJsonSerializer(), + new ExecutionState(), new TerminationManager(), "arn", batcher: null); + + Assert.Throws(() => _ = ((ICallback)op).CallbackId); + await Task.CompletedTask; + } + + [Fact] + public async Task CreateCallbackAsync_NoSerializer_Throws() + { + // No ILambdaSerializer registered on the LambdaContext — surface a clear + // error instead of letting users see a NRE later. + var state = new ExecutionState(); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = new TestLambdaContext(); // no Serializer set + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var ex = await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "no-serializer")); + Assert.Contains("ILambdaSerializer", ex.Message); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs new file mode 100644 index 000000000..effeb5804 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs @@ -0,0 +1,216 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class CheckpointBatcherTests +{ + private static SdkOperationUpdate Update(string id) => new() + { + Id = id, + Type = "STEP", + Action = "SUCCEED" + }; + + [Fact] + public async Task EnqueueAsync_AwaitsUntilBatchFlushes() + { + var flushedTokens = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + flushedTokens.Add(token); + return Task.FromResult("token-1"); + }); + + await batcher.EnqueueAsync(Update("0-step")); + + Assert.Equal(new string?[] { "token-0" }, flushedTokens); + Assert.Equal("token-1", batcher.CheckpointToken); + + await batcher.DrainAsync(); + } + + [Fact] + public async Task MultipleEnqueueAsync_BatchedWithinWindow() + { + var batches = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + batches.Add(ops.Count); + return Task.FromResult(token); + }, + new CheckpointBatcherConfig { FlushInterval = TimeSpan.FromMilliseconds(50) }); + + // Fire several enqueues concurrently and await all — they should + // coalesce into a single batch since FlushInterval > 0. 
+ var tasks = Enumerable.Range(0, 5) + .Select(i => batcher.EnqueueAsync(Update($"{i}-step"))) + .ToArray(); + + await Task.WhenAll(tasks); + await batcher.DrainAsync(); + + Assert.Single(batches); + Assert.Equal(5, batches[0]); + } + + [Fact] + public async Task EnqueueAsync_OverflowOps_SplitsBatches() + { + var batches = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + batches.Add(ops.Count); + return Task.FromResult(token); + }, + new CheckpointBatcherConfig + { + MaxBatchOperations = 3, + FlushInterval = TimeSpan.FromMilliseconds(100) + }); + + var tasks = Enumerable.Range(0, 7) + .Select(i => batcher.EnqueueAsync(Update($"{i}-step"))) + .ToArray(); + + await Task.WhenAll(tasks); + await batcher.DrainAsync(); + + // 7 items, max 3 per batch → 3, 3, 1 (or some permutation summing to 7 + // with no batch over 3). + Assert.Equal(7, batches.Sum()); + Assert.All(batches, count => Assert.True(count <= 3)); + Assert.True(batches.Count >= 3); + } + + [Fact] + public async Task FlushAsync_Throws_PropagatesToAllAwaiters() + { + var failure = new InvalidOperationException("service unavailable"); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => Task.FromException(failure), + new CheckpointBatcherConfig { FlushInterval = TimeSpan.FromMilliseconds(50) }); + + var tasks = Enumerable.Range(0, 3) + .Select(i => batcher.EnqueueAsync(Update($"{i}-step"))) + .ToArray(); + + // Each awaiter should see the same exception. + foreach (var t in tasks) + { + var ex = await Assert.ThrowsAsync(() => t); + Assert.Equal("service unavailable", ex.Message); + } + } + + [Fact] + public async Task EnqueueAsync_AfterTerminalError_FailsFast() + { + var failure = new InvalidOperationException("kaboom"); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => Task.FromException(failure)); + + // First enqueue trips the terminal error. 
+ await Assert.ThrowsAsync(() => batcher.EnqueueAsync(Update("0-step"))); + + // Subsequent enqueue should fail fast with the same exception. + var second = await Assert.ThrowsAsync(() => batcher.EnqueueAsync(Update("1-step"))); + Assert.Equal("kaboom", second.Message); + } + + [Fact] + public async Task DrainAsync_FlushesRemainingItems() + { + var totalFlushed = 0; + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + Interlocked.Add(ref totalFlushed, ops.Count); + return Task.FromResult(token); + }); + + // Fire enqueues without awaiting them individually. + var tasks = Enumerable.Range(0, 4) + .Select(i => batcher.EnqueueAsync(Update($"{i}-step"))) + .ToArray(); + + await batcher.DrainAsync(); + await Task.WhenAll(tasks); + + Assert.Equal(4, totalFlushed); + } + + [Fact] + public async Task DrainAsync_AfterTerminalError_Throws() + { + var failure = new InvalidOperationException("nope"); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => Task.FromException(failure)); + + // Trip the terminal error. + await Assert.ThrowsAsync(() => batcher.EnqueueAsync(Update("0-step"))); + + // Drain should rethrow. 
+ await Assert.ThrowsAsync(() => batcher.DrainAsync()); + } + + [Fact] + public async Task EnqueueAsync_AfterDispose_Throws() + { + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => Task.FromResult(token)); + + await batcher.DisposeAsync(); + + await Assert.ThrowsAnyAsync(() => batcher.EnqueueAsync(Update("0-step"))); + } + + [Fact] + public async Task CheckpointToken_UpdatesAfterEachFlush() + { + var counter = 0; + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + var next = $"token-{Interlocked.Increment(ref counter)}"; + return Task.FromResult(next); + }); + + await batcher.EnqueueAsync(Update("0-step")); + Assert.Equal("token-1", batcher.CheckpointToken); + + await batcher.EnqueueAsync(Update("1-step")); + Assert.Equal("token-2", batcher.CheckpointToken); + + await batcher.DrainAsync(); + } + + [Fact] + public async Task ConcurrentEnqueueAsync_AllComplete() + { + var totalFlushed = 0; + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + Interlocked.Add(ref totalFlushed, ops.Count); + return Task.FromResult(token); + }, + new CheckpointBatcherConfig { FlushInterval = TimeSpan.FromMilliseconds(20) }); + + var tasks = Enumerable.Range(0, 100) + .Select(i => Task.Run(() => batcher.EnqueueAsync(Update($"{i}-step")))) + .ToArray(); + + await Task.WhenAll(tasks); + await batcher.DrainAsync(); + + Assert.Equal(100, totalFlushed); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs new file mode 100644 index 000000000..3aa182248 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs @@ -0,0 +1,525 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ChildContextOperationTests +{ + /// Reproduces the Id that OperationIdGenerator emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// The hashed ID of the n-th child operation under parentOpId. + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. 
+ var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + [Fact] + public async Task RunInChildContextAsync_FreshExecution_RunsFuncAndCheckpoints() + { + var (context, recorder, tm, _) = CreateContext(); + + var executed = false; + var result = await context.RunInChildContextAsync( + async (childCtx) => + { + executed = true; + return await childCtx.StepAsync(async (_) => { await Task.CompletedTask; return "inner"; }, name: "inner_step"); + }, + name: "phase"); + + Assert.True(executed); + Assert.Equal("inner", result); + Assert.False(tm.IsTerminated); + + // CONTEXT START → STEP START (fire-and-forget, but flushed before drain) + // → STEP SUCCEED → CONTEXT SUCCEED + await recorder.Batcher.DrainAsync(); + + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Equal(new[] + { + "CONTEXT:START", + "STEP:START", + "STEP:SUCCEED", + "CONTEXT:SUCCEED" + }, actions); + + var contextSucceed = recorder.Flushed.Single(o => o.Type == "CONTEXT" && o.Action == "SUCCEED"); + Assert.Equal(IdAt(1), contextSucceed.Id); + Assert.Equal("phase", contextSucceed.Name); + Assert.Equal("\"inner\"", contextSucceed.Payload); + } + + [Fact] + public async Task RunInChildContextAsync_FreshExecution_ChildOperationIdsDeterministic() + { + var (context, recorder, _, _) = CreateContext(); + + await context.RunInChildContextAsync( + async (childCtx) => + { + await childCtx.StepAsync(async (_) => { await Task.CompletedTask; return "a"; }, name: "first"); + await childCtx.StepAsync(async (_) => { await Task.CompletedTask; return "b"; }, name: "second"); + return 0; + }, + name: "phase"); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var firstChildOpId = ChildIdAt(parentOpId, 
1); + var secondChildOpId = ChildIdAt(parentOpId, 2); + + var stepStarts = recorder.Flushed.Where(o => o.Type == "STEP" && o.Action == "START").ToArray(); + Assert.Equal(2, stepStarts.Length); + Assert.Equal(firstChildOpId, stepStarts[0].Id); + Assert.Equal(secondChildOpId, stepStarts[1].Id); + } + + [Fact] + public async Task RunInChildContextAsync_ReplaySucceeded_ReturnsCachedAndDoesNotRun() + { + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = "phase", + ContextDetails = new ContextDetails { Result = "\"cached\"" } + } + } + }); + + var executed = false; + var result = await context.RunInChildContextAsync( + async (childCtx) => + { + executed = true; + await Task.CompletedTask; + return "fresh"; + }, + name: "phase"); + + Assert.False(executed); + Assert.Equal("cached", result); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayFailed_ThrowsChildContextException() + { + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "phase", + SubType = "WaitForCallback", + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "child went wrong", + ErrorData = "{\"detail\":\"x\"}", + StackTrace = new[] { "at A.B()", "at C.D()" } + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "should not run"; }, + name: "phase")); + + Assert.Equal("child went wrong", ex.Message); + Assert.Equal("System.InvalidOperationException", ex.ErrorType); + Assert.Equal("{\"detail\":\"x\"}", ex.ErrorData); + 
Assert.Equal("WaitForCallback", ex.SubType); + Assert.NotNull(ex.OriginalStackTrace); + Assert.Equal(2, ex.OriginalStackTrace!.Count); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayFailed_AppliesErrorMapping() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "phase", + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "boom" + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "phase", + config: new ChildContextConfig + { + // Mapper sees the ChildContextException and remaps to a + // domain-specific exception, preserving the original via + // InnerException. + ErrorMapping = e => new InvalidOperationException("mapped", e) + })); + + Assert.Equal("mapped", ex.Message); + Assert.IsType(ex.InnerException); + } + + [Fact] + public async Task RunInChildContextAsync_FuncThrows_CheckpointsFailAndThrows() + { + var (context, recorder, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; throw new InvalidOperationException("inner boom"); }, + name: "phase")); + + Assert.Equal("inner boom", ex.Message); + Assert.Equal("System.InvalidOperationException", ex.ErrorType); + // Fresh-path failures populate OriginalStackTrace alongside ErrorType so + // ErrorMapping callbacks see the same shape on both fresh and replay paths. 
+ Assert.NotNull(ex.OriginalStackTrace); + Assert.NotEmpty(ex.OriginalStackTrace!); + + await recorder.Batcher.DrainAsync(); + var contextActions = recorder.Flushed + .Where(o => o.Type == "CONTEXT") + .Select(o => o.Action.ToString()) + .ToArray(); + Assert.Equal(new[] { "START", "FAIL" }, contextActions); + } + + [Fact] + public async Task RunInChildContextAsync_InnerNonDeterminism_BubblesUpWithoutCheckpointingFail() + { + // A child context whose inner step's checkpoint type doesn't match the + // user code (replay mismatch) must NOT be wrapped/checkpointed as + // CONTEXT FAIL — that would freeze the corruption into history. + var parentOpId = IdAt(1); + var innerOpId = ChildIdAt(parentOpId, 1); + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + Name = "phase" + }, + new() + { + Id = innerOpId, + Type = OperationTypes.Wait, // wrong type — code calls StepAsync + Status = OperationStatuses.Succeeded, + Name = "inner_step" + } + } + }); + + await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (childCtx) => + { + return await childCtx.StepAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "inner_step"); + }, + name: "phase")); + + await recorder.Batcher.DrainAsync(); + Assert.DoesNotContain(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "FAIL"); + } + + [Fact] + public async Task RunInChildContextAsync_FuncThrows_AppliesErrorMapping() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; throw new TimeoutException("inner timeout"); }, + name: "phase", + config: new ChildContextConfig + { + ErrorMapping = e => new InvalidOperationException("mapped", e) + })); + + Assert.Equal("mapped", ex.Message); + Assert.IsType(ex.InnerException); + } + + 
[Fact] + public async Task RunInChildContextAsync_ChildSuspendsOnWait_TerminatesWithWaitScheduled() + { + var (context, recorder, tm, _) = CreateContext(); + + // Suspending child: the inner Wait flushes WAIT START sync, then + // returns a never-completing Task via TerminationManager.SuspendAndAwait. + // The outer ChildContextOperation awaits that and never reaches + // CONTEXT SUCCEED. DurableExecutionHandler.RunAsync's WhenAny race + // wins on the termination signal; the test below short-circuits via + // the same TerminationManager.IsTerminated check. + var task = context.RunInChildContextAsync( + async (childCtx) => + { + await childCtx.WaitAsync(TimeSpan.FromSeconds(5), name: "wait_inside"); + return "should not return"; + }, + name: "phase"); + + await Task.Delay(50); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + // CONTEXT START + WAIT START have flushed; no SUCCEED/FAIL since the + // child is suspended. + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Contains("CONTEXT:START", actions); + Assert.Contains("WAIT:START", actions); + Assert.DoesNotContain("CONTEXT:SUCCEED", actions); + Assert.DoesNotContain("CONTEXT:FAIL", actions); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayStarted_ReExecutesFuncWithInnerCacheReplay() + { + var parentOpId = IdAt(1); + var innerStepOpId = ChildIdAt(parentOpId, 1); + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + Name = "phase" + }, + new() + { + Id = innerStepOpId, + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "inner_step", + StepDetails = new StepDetails { Result = "\"cached_inner\"" } + } + } + }); + + var innerExecuted = false; + var result = await context.RunInChildContextAsync( + async (childCtx) => + { + return await childCtx.StepAsync( + 
async (_) => { innerExecuted = true; await Task.CompletedTask; return "fresh_inner"; }, + name: "inner_step"); + }, + name: "phase"); + + // The user func re-runs (replay propagation), but its inner step + // replays the cached value without invoking the inner code. + Assert.False(innerExecuted); + Assert.Equal("cached_inner", result); + + await recorder.Batcher.DrainAsync(); + + // Critical: do NOT re-checkpoint CONTEXT START on replay. The original + // STARTED checkpoint is still authoritative. + Assert.DoesNotContain(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "START"); + + // The CONTEXT SUCCEED happens only this time, since the user func + // returned successfully. + Assert.Contains(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "SUCCEED"); + } + + [Fact] + public async Task RunInChildContextAsync_VoidOverload_RunsAndCheckpoints() + { + var (context, recorder, _, _) = CreateContext(); + + var executed = false; + await context.RunInChildContextAsync( + async (childCtx) => + { + await childCtx.StepAsync( + async (_) => { executed = true; await Task.CompletedTask; }, + name: "inner_void"); + }, + name: "phase"); + + Assert.True(executed); + + await recorder.Batcher.DrainAsync(); + + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Equal(new[] + { + "CONTEXT:START", + "STEP:START", + "STEP:SUCCEED", + "CONTEXT:SUCCEED" + }, actions); + + // Void overload returns a null object, which the registered + // ILambdaSerializer serializes as the literal "null" payload. 
+ var contextSucceed = recorder.Flushed.Single(o => o.Type == "CONTEXT" && o.Action == "SUCCEED"); + Assert.Equal("null", contextSucceed.Payload); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayTypeMismatch_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, // wrong type — should be CONTEXT + Status = OperationStatuses.Succeeded, + Name = "phase", + StepDetails = new StepDetails { Result = "\"x\"" } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "phase")); + + Assert.Contains("expected type 'CONTEXT'", ex.Message); + Assert.Contains("found 'STEP'", ex.Message); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayNameMismatch_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = "old_name", + ContextDetails = new ContextDetails { Result = "\"x\"" } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "new_name")); + + Assert.Contains("expected name 'new_name'", ex.Message); + Assert.Contains("found 'old_name'", ex.Message); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayUnknownStatus_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = "BOGUS", + Name = "phase" + } + } + }); + + await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "phase")); + } + + 
[Fact] + public async Task RunInChildContextAsync_SubTypeAndName_PropagateToCheckpoint() + { + var (context, recorder, _, _) = CreateContext(); + + await context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "ok"; }, + name: "phase", + config: new ChildContextConfig { SubType = "WaitForCallback" }); + + await recorder.Batcher.DrainAsync(); + + var contextOps = recorder.Flushed.Where(o => o.Type == "CONTEXT").ToArray(); + Assert.Equal(2, contextOps.Length); + foreach (var op in contextOps) + { + Assert.Equal("WaitForCallback", op.SubType); + Assert.Equal("phase", op.Name); + } + } + +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ConfigTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ConfigTests.cs new file mode 100644 index 000000000..95417b953 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ConfigTests.cs @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ConfigTests +{ + [Fact] + public void InvokeConfig_Defaults() + { + var config = new InvokeConfig(); + Assert.Null(config.TenantId); + } + + [Fact] + public void InvokeConfig_RoundTripsProperties() + { + var config = new InvokeConfig + { + TenantId = "tenant-42" + }; + + Assert.Equal("tenant-42", config.TenantId); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs new file mode 100644 index 000000000..20411dbab --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs @@ -0,0 +1,987 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.DurableExecution.Tests.Internal; +using Amazon.Lambda.TestUtilities; +using Microsoft.Extensions.Logging; +using Xunit; +using LogLevel = Microsoft.Extensions.Logging.LogLevel; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class DurableContextTests +{ + /// Reproduces the Id that OperationIdGenerator emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + private static TestLambdaContext CreateLambdaContext() => +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. + new() { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + + private static DurableContext CreateContext( + InitialExecutionState? initialState = null, + TerminationManager? terminationManager = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = terminationManager ?? 
new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + + return new DurableContext(state, tm, idGen, "arn:aws:lambda:us-east-1:123:durable-execution:test", lambdaContext); + } + + #region StepAsync Tests + + [Fact] + public async Task StepAsync_NewExecution_RunsFunction() + { + var context = CreateContext(); + var executed = false; + + var result = await context.StepAsync(async (_) => + { + executed = true; + await Task.CompletedTask; + return 42; + }, name: "my_step"); + + Assert.True(executed); + Assert.Equal(42, result); + } + + [Fact] + public async Task StepAsync_Replay_ReturnsCachedResult() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "\"cached_value\"" } + } + } + }); + + var executed = false; + var result = await context.StepAsync(async (_) => + { + executed = true; + await Task.CompletedTask; + return "fresh_value"; + }, name: "cached_step"); + + Assert.False(executed); + Assert.Equal("cached_value", result); + } + + [Fact] + public async Task StepAsync_ReplayFailed_ThrowsStepException() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Failed, + StepDetails = new StepDetails + { + Error = new ErrorObject + { + ErrorType = "System.TimeoutException", + ErrorMessage = "timed out", + ErrorData = "{\"detail\":\"x\"}", + StackTrace = new[] { "at A.B()", "at C.D()" } + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.StepAsync(async (_) => { await Task.CompletedTask; return "x"; }, name: "bad_step")); + + Assert.Equal("System.TimeoutException", ex.ErrorType); + Assert.Equal("timed out", ex.Message); + Assert.Equal("{\"detail\":\"x\"}", ex.ErrorData); + 
Assert.NotNull(ex.OriginalStackTrace); + Assert.Equal(2, ex.OriginalStackTrace!.Count); + } + + [Fact] + public async Task StepAsync_Throws_FailsWithStepException() + { + var context = CreateContext(); + var attempts = 0; + + await Assert.ThrowsAsync(() => + context.StepAsync(async (_) => + { + attempts++; + await Task.CompletedTask; + throw new InvalidOperationException("boom"); + }, name: "fail_step")); + + // No RetryStrategy is configured on this call, so the step runs exactly once. + Assert.Equal(1, attempts); + } + + [Fact] + public async Task StepAsync_WithStepContext_ReceivesMetadata() + { + var context = CreateContext(); + string? receivedOpId = null; + int receivedAttempt = 0; + Microsoft.Extensions.Logging.ILogger? receivedLogger = null; + + await context.StepAsync(async (step) => + { + receivedOpId = step.OperationId; + receivedAttempt = step.AttemptNumber; + receivedLogger = step.Logger; + await Task.CompletedTask; + return "done"; + }, name: "meta_step"); + + Assert.Equal(IdAt(1), receivedOpId); + Assert.Equal(1, receivedAttempt); + Assert.NotNull(receivedLogger); + } + + [Fact] + public async Task StepAsync_VoidOverload_Works() + { + var context = CreateContext(); + var executed = false; + + await context.StepAsync(async (_) => + { + executed = true; + await Task.CompletedTask; + }, name: "void_step"); + + Assert.True(executed); + } + + [Fact] + public async Task StepAsync_MultipleSteps_DeterministicIds() + { + var context = CreateContext(); + + var r1 = await context.StepAsync(async (_) => { await Task.CompletedTask; return "a"; }, name: "first"); + var r2 = await context.StepAsync(async (_) => { await Task.CompletedTask; return "b"; }, name: "second"); + var r3 = await context.StepAsync(async (_) => { await Task.CompletedTask; return "c"; }); + + Assert.Equal("a", r1); + Assert.Equal("b", r2); + Assert.Equal("c", r3); + } + + [Fact] + public async Task StepAsync_ComplexType_SerializesCorrectly() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new 
List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "{\"Name\":\"Alice\",\"Age\":30}" } + } + } + }); + + var result = await context.StepAsync( + async (_) => { await Task.CompletedTask; return new TestPerson { Name = "Bob", Age = 25 }; }, + name: "fetch"); + + Assert.Equal("Alice", result.Name); + Assert.Equal(30, result.Age); + } + + [Fact] + public async Task StepAsync_NoSerializerOnContext_ThrowsInvalidOperation() + { + // The serializer comes from ILambdaContext.Serializer — without one, + // we can't checkpoint anything. The error message points users at the + // bootstrap registration point. + var state = new ExecutionState(); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = new TestLambdaContext(); // no Serializer set + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var ex = await Assert.ThrowsAsync(() => + context.StepAsync(async (_) => { await Task.CompletedTask; return "x"; }, name: "no_serializer")); + + Assert.Contains("ILambdaSerializer", ex.Message); + } + + [Fact] + public void Logger_Default_IsReplayAwareLogger() + { + var context = CreateContext(); + Assert.NotNull(context.Logger); + Assert.IsType(context.Logger); + } + + [Fact] + public void ConfigureLogger_NullArg_Throws() + { + var context = CreateContext(); + Assert.Throws(() => context.ConfigureLogger(null!)); + } + + [Fact] + public void ConfigureLogger_WithCustomLogger_ReachesUserLogger() + { + var context = CreateContext(); + var custom = new RecordingLogger(); + context.ConfigureLogger(new LoggerConfig { CustomLogger = custom }); + + // Default state has no checkpoint → starts in Execution mode, so + // logs flow through immediately. 
+ context.Logger.LogInformation("hi"); + + Assert.Single(custom.Records); + Assert.Equal(LogLevel.Information, custom.Records[0].Level); + } + + [Fact] + public void ConfigureLogger_ModeAwareFalse_LogsDuringReplay() + { + // Seed a checkpoint so the context starts in Replay mode. + var custom = new RecordingLogger(); + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(99), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded + } + } + }); + + context.ConfigureLogger(new LoggerConfig { CustomLogger = custom, ModeAware = true }); + context.Logger.LogInformation("replay-default"); + Assert.Empty(custom.Records); + + context.ConfigureLogger(new LoggerConfig { ModeAware = false }); + context.Logger.LogInformation("replay-disabled"); + Assert.Single(custom.Records); + Assert.Contains("replay-disabled", custom.Records[0].Message); + } + + [Fact] + public void ExecutionContext_ExposesArn() + { + var context = CreateContext(); + Assert.Equal("arn:aws:lambda:us-east-1:123:durable-execution:test", context.ExecutionContext.DurableExecutionArn); + } + + [Fact] + public void LambdaContext_IsExposed() + { + var context = CreateContext(); + Assert.NotNull(context.LambdaContext); + } + + [Fact] + public async Task StepAsync_Replay_NullResult_ReturnsDefault() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = null } + } + } + }); + + var result = await context.StepAsync( + async (_) => { await Task.CompletedTask; return "fresh"; }, + name: "no_result"); + + Assert.Null(result); + } + + [Fact] + public async Task StepAsync_CancelledToken_ThrowsOperationCanceled() + { + var context = CreateContext(); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + await Assert.ThrowsAnyAsync(() => + context.StepAsync( + async 
(_) => + { + cts.Token.ThrowIfCancellationRequested(); + await Task.CompletedTask; + return "unreachable"; + }, + name: "cancelled_step", + cancellationToken: cts.Token)); + } + + #endregion + + #region WaitAsync Tests + + [Fact] + public async Task WaitAsync_SubSecond_ThrowsArgumentOutOfRange() + { + var context = CreateContext(); + + await Assert.ThrowsAsync(() => + context.WaitAsync(TimeSpan.FromMilliseconds(500))); + } + + [Fact] + public async Task WaitAsync_AboveOneYear_ThrowsArgumentOutOfRange() + { + var context = CreateContext(); + + await Assert.ThrowsAsync(() => + context.WaitAsync(TimeSpan.FromSeconds(31_622_401))); + } + + [Fact] + public async Task WaitAsync_NewExecution_SignalsTermination() + { + var tm = new TerminationManager(); + var context = CreateContext(terminationManager: tm); + + // WaitAsync should signal termination and return a never-completing task + var waitTask = context.WaitAsync(TimeSpan.FromSeconds(30), name: "my_wait"); + + // Give it a moment to execute + await Task.Delay(10); + + Assert.True(tm.IsTerminated); + Assert.False(waitTask.IsCompleted); + } + + [Fact] + public async Task WaitAsync_Elapsed_ContinuesImmediately() + { + var pastExpirationMs = DateTimeOffset.UtcNow.AddSeconds(-10).ToUnixTimeMilliseconds(); + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = OperationStatuses.Pending, + WaitDetails = new WaitDetails { ScheduledEndTimestamp = pastExpirationMs } + } + } + }); + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "cooldown"); + // If we got here, the wait was correctly skipped + } + + [Fact] + public async Task WaitAsync_StartedButNotExpired_ResuspendsWithoutNewCheckpoint() + { + var futureExpirationMs = DateTimeOffset.UtcNow.AddSeconds(300).ToUnixTimeMilliseconds(); + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + 
Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = OperationStatuses.Pending, + WaitDetails = new WaitDetails { ScheduledEndTimestamp = futureExpirationMs } + } + } + }); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var waitTask = context.WaitAsync(TimeSpan.FromSeconds(30), name: "pending_wait"); + + await Task.Delay(10); + + Assert.True(tm.IsTerminated); + Assert.False(waitTask.IsCompleted); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task WaitAsync_AlreadySucceeded_ContinuesImmediately() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = OperationStatuses.Succeeded + } + } + }); + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "done_wait"); + // Completed without blocking + } + + [Fact] + public async Task WaitAsync_UnknownStatus_ThrowsNonDeterministicException() + { + // Unrecognized status on a replayed wait checkpoint must surface as + // NonDeterministicExecutionException — silently re-emitting WAIT START + // would either fail at the service or duplicate work. 
+ var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = "TOTALLY_BOGUS_STATUS" + } + } + }); + + await Assert.ThrowsAsync(() => + context.WaitAsync(TimeSpan.FromSeconds(30), name: "mystery_wait")); + } + + #endregion + + #region End-to-end: Step + Wait + Step + + [Fact] + public async Task EndToEnd_StepWaitStep_FirstInvocation_SuspendsOnWait() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var result = await DurableExecutionHandler.RunAsync( + state, tm, + async () => + { + await context.StepAsync(async (_) => { await Task.CompletedTask; return "fetched"; }, name: "fetch"); + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "delay"); + var final = await context.StepAsync(async (_) => { await Task.CompletedTask; return "processed"; }, name: "process"); + return final; + }); + + Assert.Equal(InvocationStatus.Pending, result.Status); + } + + [Fact] + public async Task EndToEnd_StepWaitStep_SecondInvocation_Completes() + { + var pastExpirationMs = DateTimeOffset.UtcNow.AddSeconds(-5).ToUnixTimeMilliseconds(); + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "\"fetched\"" } + }, + new() + { + Id = IdAt(2), + Type = OperationTypes.Wait, + Status = OperationStatuses.Pending, + WaitDetails = new WaitDetails { ScheduledEndTimestamp = pastExpirationMs } + } + } + }); + + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new 
DurableContext(state, tm, idGen, "arn:test", lambdaContext); + var processExecuted = false; + + var result = await DurableExecutionHandler.RunAsync( + state, tm, + async () => + { + var fetched = await context.StepAsync(async (_) => { await Task.CompletedTask; return "fresh_fetch"; }, name: "fetch"); + Assert.Equal("fetched", fetched); // cached from replay + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "delay"); + // wait is elapsed, continues + + var final = await context.StepAsync(async (_) => + { + processExecuted = true; + await Task.CompletedTask; + return "processed"; + }, name: "process"); + return final; + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal("processed", result.Result); + Assert.True(processExecuted); + } + + #endregion + + #region Non-Determinism Detection Tests + + [Fact] + public async Task StepAsync_ReplayTypeMismatch_ThrowsNonDeterministicException() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = OperationStatuses.Succeeded + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var ex = await Assert.ThrowsAsync(async () => + await context.StepAsync( + async (_) => { await Task.CompletedTask; return "should not run"; }, + name: "my_op")); + + Assert.Contains("expected type 'STEP'", ex.Message); + Assert.Contains("found 'WAIT'", ex.Message); + } + + [Fact] + public async Task WaitAsync_ReplayTypeMismatch_ThrowsNonDeterministicException() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails 
{ Result = "\"hello\"" } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var ex = await Assert.ThrowsAsync(async () => + await context.WaitAsync(TimeSpan.FromSeconds(10), name: "my_op")); + + Assert.Contains("expected type 'WAIT'", ex.Message); + Assert.Contains("found 'STEP'", ex.Message); + } + + [Fact] + public async Task StepAsync_ReplayNameMismatch_ThrowsNonDeterministicException() + { + // Simulate a scenario where the operation was stored with a different name + // than what the current code passes (e.g., service returned stale data). + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "old_name", + StepDetails = new StepDetails { Result = "\"old_result\"" } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var ex = await Assert.ThrowsAsync(async () => + await context.StepAsync( + async (_) => { await Task.CompletedTask; return "new"; }, + name: "my_step")); + + Assert.Contains("expected name 'my_step'", ex.Message); + Assert.Contains("found 'old_name'", ex.Message); + } + + [Fact] + public async Task StepAsync_NoReplay_SkipsValidation() + { + var context = CreateContext(); + + var result = await context.StepAsync( + async (_) => { await Task.CompletedTask; return "ok"; }, + name: "anything"); + + Assert.Equal("ok", result); + } + + #endregion + + private class TestPerson + { + public string? 
Name { get; set; } + public int Age { get; set; } + } + + #region StepAsync Retry Tests + + [Fact] + public async Task StepAsync_FailsWithRetryStrategy_CheckpointsRetryAndSuspends() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var stepTask = context.StepAsync( + async (_) => { await Task.CompletedTask; throw new InvalidOperationException("transient"); }, + name: "flaky_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(5), + jitter: JitterStrategy.None) + }); + + await Task.Delay(50); + + Assert.True(tm.IsTerminated); + Assert.False(stepTask.IsCompleted); + + // Fresh attempt 1 emits a fire-and-forget START (telemetry under + // AtLeastOncePerRetry), then a RETRY when the user code throws and + // the retry strategy decides to retry. 
+ var checkpoints = recorder.Flushed; + Assert.Equal(2, checkpoints.Count); + Assert.Equal("START", checkpoints[0].Action); + Assert.Equal("RETRY", checkpoints[1].Action); + Assert.Equal(IdAt(1), checkpoints[1].Id); + Assert.Equal(5, checkpoints[1].StepOptions.NextAttemptDelaySeconds); + } + + [Fact] + public async Task StepAsync_FailsNoRetryStrategy_CheckpointsFail() + { + var context = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.StepAsync( + async (_) => { await Task.CompletedTask; throw new InvalidOperationException("permanent"); }, + name: "fail_step")); + + Assert.Equal("permanent", ex.Message); + } + + [Fact] + public async Task StepAsync_RetryExhausted_CheckpointsFail() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Pending, + StepDetails = new StepDetails + { + Attempt = 2, + NextAttemptTimestamp = DateTimeOffset.UtcNow.AddSeconds(-10).ToUnixTimeMilliseconds() + } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + // Attempt 3 (last one) — should fail after this + var ex = await Assert.ThrowsAsync(() => + context.StepAsync( + async (_) => { await Task.CompletedTask; throw new InvalidOperationException("still failing"); }, + name: "exhaust_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 3, jitter: JitterStrategy.None) + })); + + Assert.Equal("still failing", ex.Message); + + // Fresh attempt 3 emits a fire-and-forget START (telemetry under + // AtLeastOncePerRetry), then a FAIL after the retry strategy gives up. 
+ var checkpoints = recorder.Flushed; + Assert.Equal(2, checkpoints.Count); + Assert.Equal("START", checkpoints[0].Action); + Assert.Equal("FAIL", checkpoints[1].Action); + } + + [Fact] + public async Task StepAsync_PendingWithFutureTimestamp_Suspends() + { + var futureMs = DateTimeOffset.UtcNow.AddSeconds(300).ToUnixTimeMilliseconds(); + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Pending, + StepDetails = new StepDetails + { + Attempt = 1, + NextAttemptTimestamp = futureMs + } + } + } + }); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var stepTask = context.StepAsync( + async (_) => { await Task.CompletedTask; return "should not run"; }, + name: "pending_step", + config: new StepConfig { RetryStrategy = RetryStrategy.Default }); + + await Task.Delay(50); + + Assert.True(tm.IsTerminated); + Assert.False(stepTask.IsCompleted); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task StepAsync_PendingWithPastTimestamp_ReExecutes() + { + var pastMs = DateTimeOffset.UtcNow.AddSeconds(-10).ToUnixTimeMilliseconds(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Pending, + StepDetails = new StepDetails + { + Attempt = 1, + NextAttemptTimestamp = pastMs + } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var result = await context.StepAsync( + async 
(ctx) => + { + await Task.CompletedTask; + Assert.Equal(2, ctx.AttemptNumber); + return "retry success"; + }, + name: "retry_step", + config: new StepConfig { RetryStrategy = RetryStrategy.Default }); + + Assert.Equal("retry success", result); + } + + [Fact] + public async Task StepAsync_ReadyReplay_AdvancesAttemptAndExecutes() + { + // READY = service has post-PENDING re-invoked us; the retry timer + // already fired so no timestamp check is needed. Just advance the + // attempt counter and run. + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Ready, + StepDetails = new StepDetails { Attempt = 2 } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var executed = false; + var result = await context.StepAsync( + async (ctx) => + { + executed = true; + Assert.Equal(3, ctx.AttemptNumber); + await Task.CompletedTask; + return "ok"; + }, + name: "ready_step", + config: new StepConfig { RetryStrategy = RetryStrategy.Default }); + + Assert.True(executed); + Assert.Equal("ok", result); + Assert.False(tm.IsTerminated); + Assert.False(state.IsReplaying); + } + + [Fact] + public async Task StepAsync_AtMostOnce_FlushesStartBeforeExecution() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + IReadOnlyList? 
flushedAtFuncEntry = null; + + var result = await context.StepAsync( + async (_) => + { + flushedAtFuncEntry = recorder.Flushed.Select(o => o.Action.ToString()).ToArray(); + await Task.CompletedTask; + return "done"; + }, + name: "amo_step", + config: new StepConfig { Semantics = StepSemantics.AtMostOncePerRetry }); + + Assert.Equal("done", result); + + // START must be flushed before user func runs (AtMostOnce invariant). + Assert.NotNull(flushedAtFuncEntry); + Assert.Equal(new[] { "START" }, flushedAtFuncEntry); + + // After step returns, SUCCEED has also been flushed. + var actions = recorder.Flushed.Select(o => o.Action.ToString()).ToArray(); + Assert.Equal(new[] { "START", "SUCCEED" }, actions); + } + + [Fact] + public async Task StepAsync_AtMostOnce_StartedReplay_TriggersRetryHandler() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Started + } + } + }); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var executed = false; + var stepTask = context.StepAsync( + async (_) => { executed = true; await Task.CompletedTask; return "should not run"; }, + name: "amo_replay", + config: new StepConfig + { + Semantics = StepSemantics.AtMostOncePerRetry, + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 3, jitter: JitterStrategy.None) + }); + + await Task.Delay(50); + + Assert.False(executed); + Assert.True(tm.IsTerminated); + Assert.False(stepTask.IsCompleted); + + var checkpoints = recorder.Flushed; + Assert.Single(checkpoints); + Assert.Equal("RETRY", checkpoints[0].Action); + } + + #endregion +} diff --git 
a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableExecutionHandlerTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableExecutionHandlerTests.cs new file mode 100644 index 000000000..76062a682 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableExecutionHandlerTests.cs @@ -0,0 +1,140 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class DurableExecutionHandlerTests // Tests which InvocationStatus DurableExecutionHandler.RunAsync reports for the user delegate's outcome versus a TerminationManager signal. +{ + [Fact] // Delegate returns a value: expects Status Succeeded, Result "hello", and a null Exception. + public async Task RunAsync_UserCodeCompletes_ReturnsSucceeded() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); // null: no initial state is supplied + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + await Task.Delay(1); + return "hello"; + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal("hello", result.Result); + Assert.Null(result.Exception); + } + + [Fact] // Delegate throws: expects Status Failed, the exception's message in result.Message, and the exception on result.Exception. + public async Task RunAsync_UserCodeThrows_ReturnsFailed() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + await Task.Delay(1); + throw new InvalidOperationException("something broke"); + }); + + Assert.Equal(InvocationStatus.Failed, result.Status); + Assert.Equal("something broke", result.Message); + Assert.IsType(result.Exception); + } + + [Fact] // Terminate(WaitScheduled, ...) is called and the delegate never finishes: expects Status Pending, the termination message, and a null Exception. + public async Task RunAsync_TerminationWins_ReturnsPending() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + // Simulate: user code hits a wait, signals 
termination, then blocks forever + termination.Terminate(TerminationReason.WaitScheduled, "waiting 30s"); + await new TaskCompletionSource().Task; // blocks forever + return "unreachable"; + }); + + Assert.Equal(InvocationStatus.Pending, result.Status); + Assert.Equal("waiting 30s", result.Message); + Assert.Null(result.Exception); + } + + [Fact] // Terminate is called with an exception attached: expects Status Failed (not Pending) and a non-null typed Exception. + public async Task RunAsync_TerminationWithException_ReturnsFailed() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + termination.Terminate( + TerminationReason.CheckpointFailed, + "checkpoint error", + new InvalidOperationException("service unavailable")); + await new TaskCompletionSource().Task; // never completes, so only the termination signal can end RunAsync + return "unreachable"; + }); + + Assert.Equal(InvocationStatus.Failed, result.Status); + Assert.IsType(result.Exception); + } + + [Fact] // Terminate is never called here: expects Succeeded with Result 42. NOTE(review): the async lambda contains no await — confirm that is intentional. + public async Task RunAsync_FastUserCode_BeatsTermination() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + // User code completes before termination is called + return 42; + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal(42, result.Result); + } + + [Fact] // Success path with an int-returning delegate: expects Succeeded with Result 100. + public async Task RunAsync_IntResult_WorksWithValueTypes() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + await Task.CompletedTask; + return 100; + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal(100, result.Result); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableFunctionTests.cs 
b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableFunctionTests.cs new file mode 100644 index 000000000..8078b0242 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableFunctionTests.cs @@ -0,0 +1,783 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Net; +using System.Text.Json; +using Amazon.Lambda; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Amazon.Runtime; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class DurableFunctionTests +{ + /// Reproduces the Id that emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + private static TestLambdaContext CreateLambdaContext() => +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. 
+ new() { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + + private readonly IAmazonLambda _mockClient = new MockLambdaClient(); + + [Fact] + public async Task WrapAsync_FreshExecution_StepThenWait_ReturnsPending() + { + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:order-123", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-123\"}" } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + MyWorkflow, + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Pending, output.Status); + } + + [Fact] + public async Task WrapAsync_ReplayWithElapsedWait_ReturnsSucceeded() + { + var pastExpirationMs = DateTimeOffset.UtcNow.AddSeconds(-5).ToUnixTimeMilliseconds(); + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:order-123", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-123\"}" } + }, + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "{\"IsValid\":true}" } + }, + new() + { + Id = IdAt(2), + Type = OperationTypes.Wait, + Status = OperationStatuses.Pending, + WaitDetails = new WaitDetails { ScheduledEndTimestamp = pastExpirationMs } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + MyWorkflow, + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + 
Assert.NotNull(output.Result); + var result = JsonSerializer.Deserialize(output.Result!); + Assert.Equal("approved", result!.Status); + } + + [Fact] + public async Task WrapAsync_WorkflowThrows_ReturnsFailed() + { + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:fail-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"bad-order\"}" } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + async (evt, ctx) => throw new InvalidOperationException("workflow error"), + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Failed, output.Status); + Assert.NotNull(output.Error); + Assert.Equal("workflow error", output.Error!.ErrorMessage); + Assert.Contains("InvalidOperationException", output.Error.ErrorType!); + } + + [Fact] + public async Task WrapAsync_VoidWorkflow_ReturnSucceeded() + { + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:void-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + } + } + } + }; + + var executed = false; + var output = await DurableFunction.WrapAsync( + async (evt, ctx) => + { + await ctx.StepAsync(async (_) => { await Task.CompletedTask; executed = true; }, name: "do_work"); + }, + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + Assert.True(executed); + } + + [Fact] + public async Task WrapAsync_CheckpointsAreSentToService() + { + var mockClient = 
new MockLambdaClient(); + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:checkpoint-test", + CheckpointToken = "initial-token", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + MyWorkflow, + input, + CreateLambdaContext(), + mockClient); + + Assert.Equal(InvocationStatus.Pending, output.Status); + + // Each StepAsync emits a fire-and-forget START before user code runs + // (telemetry under AtLeastOncePerRetry). With FlushInterval = 0 the + // worker may flush the START on its own before SUCCEED arrives, so the + // exact batching of START vs SUCCEED is timing-dependent. Assert on + // the flat sequence of updates instead. + var allUpdates = mockClient.CheckpointCalls + .SelectMany(c => c.Updates) + .ToList(); + + // Expect: step START, step SUCCEED, wait START (in that order). + Assert.Equal(3, allUpdates.Count); + + Assert.Equal("STEP", allUpdates[0].Type); + Assert.Equal("START", allUpdates[0].Action); + Assert.Equal("validate", allUpdates[0].Name); + + Assert.Equal("STEP", allUpdates[1].Type); + Assert.Equal("SUCCEED", allUpdates[1].Action); + Assert.Equal("validate", allUpdates[1].Name); + Assert.NotNull(allUpdates[1].Payload); + + Assert.Equal("WAIT", allUpdates[2].Type); + Assert.Equal("START", allUpdates[2].Action); + Assert.Equal("delay", allUpdates[2].Name); + Assert.NotNull(allUpdates[2].WaitOptions); + Assert.Equal(30, allUpdates[2].WaitOptions.WaitSeconds); + + // The first call sends the initial checkpoint token. 
+ Assert.Equal("arn:aws:lambda:us-east-1:123:durable-execution:checkpoint-test", mockClient.CheckpointCalls[0].DurableExecutionArn); + Assert.Equal("initial-token", mockClient.CheckpointCalls[0].CheckpointToken); + } + + [Fact] + public async Task WrapAsync_UserPayload_BindsCamelCaseToPascalCaseProperty() + { + // The wire payload uses camelCase ("orderId"), the user POCO uses PascalCase (OrderId). + // ExtractUserPayload must do case-insensitive binding so workflows can read input.OrderId. + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:case-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"abc-123\"}" } + } + } + } + }; + + string? observedOrderId = null; + var output = await DurableFunction.WrapAsync( + async (evt, ctx) => + { + observedOrderId = evt.OrderId; + await Task.CompletedTask; + return new OrderResult { Status = "ok", OrderId = evt.OrderId }; + }, + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + Assert.Equal("abc-123", observedOrderId); + } + + [Fact] + public async Task WrapAsync_NoExecutionOp_ThrowsMalformedEnvelope() + { + // No EXECUTION operation in the envelope — ExtractUserPayload must throw a typed + // DurableExecutionException so the malformed envelope surfaces as a clear error + // instead of leaking default!/null into user code as a NullReferenceException. 
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:no-exec", + InitialExecutionState = new InitialExecutionState + { + Operations = new List() + } + }; + + var ex = await Assert.ThrowsAsync(() => + DurableFunction.WrapAsync( + async (evt, ctx) => + { + await Task.CompletedTask; + return new OrderResult { Status = "ok" }; + }, + input, + CreateLambdaContext(), + _mockClient)); + + Assert.Contains("malformed", ex.Message, StringComparison.OrdinalIgnoreCase); + Assert.Contains("EXECUTION", ex.Message); + } + + [Fact] + public async Task WrapAsync_PaginatedInitialState_HydratesAllPages() + { + // The service can return execution state across multiple pages — the first + // page comes inline on the invocation envelope (InitialExecutionState) and + // subsequent pages must be fetched via GetDurableExecutionState. Verify the + // pagination loop in WrapAsyncCore (DurableFunction.cs:160-167) walks every + // page so the workflow sees the full operation history on replay. + var arn = "arn:aws:lambda:us-east-1:123:durable-execution:paginated"; + + // Page 0 (in InitialExecutionState): EXECUTION op + step1 SUCCEEDED. + // Page 1 (fetched with marker "marker-1"): step2 SUCCEEDED, points to marker-2. + // Page 2 (fetched with marker "marker-2"): step3 SUCCEEDED, no NextMarker — loop exits. 
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = arn, + CheckpointToken = "ckpt-0", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + }, + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "\"page-0-result\"" } + } + }, + NextMarker = "marker-1" + } + }; + + var mockClient = new MockLambdaClient + { + GetExecutionStateHandler = req => req.Marker switch + { + "marker-1" => new Amazon.Lambda.Model.GetDurableExecutionStateResponse + { + Operations = new List + { + new() + { + Id = IdAt(2), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new Amazon.Lambda.Model.StepDetails { Result = "\"page-1-result\"" } + } + }, + NextMarker = "marker-2" + }, + "marker-2" => new Amazon.Lambda.Model.GetDurableExecutionStateResponse + { + Operations = new List + { + new() + { + Id = IdAt(3), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new Amazon.Lambda.Model.StepDetails { Result = "\"page-2-result\"" } + } + } + // NextMarker omitted -> loop terminates. + }, + _ => throw new InvalidOperationException($"Unexpected marker: {req.Marker}") + } + }; + + var observed = new List(); + var output = await DurableFunction.WrapAsync( + async (evt, ctx) => + { + // All three steps must replay the cached results from the paginated state + // without re-executing — if the loop missed a page, the corresponding step + // would run fresh and append a different value to `observed`. 
+ observed.Add(await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return "fresh"; }, name: "step1")); + observed.Add(await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return "fresh"; }, name: "step2")); + observed.Add(await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return "fresh"; }, name: "step3")); + return new OrderResult { Status = "ok", OrderId = evt.OrderId }; + }, + input, + CreateLambdaContext(), + mockClient); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + + // Two GetDurableExecutionState calls — one per fetched page (page 0 was inline). + Assert.Equal(2, mockClient.GetExecutionStateCalls.Count); + Assert.Equal("marker-1", mockClient.GetExecutionStateCalls[0].Marker); + Assert.Equal(arn, mockClient.GetExecutionStateCalls[0].DurableExecutionArn); + Assert.Equal("ckpt-0", mockClient.GetExecutionStateCalls[0].CheckpointToken); + Assert.Equal("marker-2", mockClient.GetExecutionStateCalls[1].Marker); + + // The workflow saw replayed results from ALL three pages — none re-executed. + Assert.Equal(new[] { "page-0-result", "page-1-result", "page-2-result" }, observed); + + // No checkpoints were written: every step replayed from cache. + Assert.Empty(mockClient.CheckpointCalls); + } + + [Fact] + public async Task WrapAsync_NullInitialExecutionState_ThrowsMalformedEnvelope() + { + // No initial execution state at all — same malformed-envelope branch in ExtractUserPayload. 
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:null-state" + }; + + var ex = await Assert.ThrowsAsync(() => + DurableFunction.WrapAsync( + async (evt, ctx) => + { + await Task.CompletedTask; + return new OrderResult { Status = "ok" }; + }, + input, + CreateLambdaContext(), + _mockClient)); + + Assert.Contains("malformed", ex.Message, StringComparison.OrdinalIgnoreCase); + } + + // ────────────────────────────────────────────────────────────────────── + // IsTerminalCheckpointError classification (mirrors CheckpointError in + // aws-durable-execution-sdk-python): + // 4xx (except 429) → terminal (Failed envelope) + // 429 / 5xx / no status → transient (escapes to host for Lambda retry) + // Carve-out: InvalidParameterValueException "Invalid Checkpoint Token" → transient + // + // Driven through CheckpointDurableExecution: a workflow that succeeds a single Step + // forces the batcher to flush, which is wrapped by the try/catch in WrapAsyncCore. 
+ // ────────────────────────────────────────────────────────────────────── + + public static IEnumerable TerminalCheckpointErrorCases() => new[] + { + new object[] { MakeServiceException("ResourceNotFoundException", HttpStatusCode.NotFound, "ARN not found") }, + new object[] { MakeServiceException("AccessDeniedException", HttpStatusCode.Forbidden, "denied") }, + new object[] { MakeServiceException("KMSAccessDeniedException", HttpStatusCode.BadRequest, "kms denied") }, + new object[] { MakeServiceException("ValidationException", HttpStatusCode.BadRequest, "bad input") }, + new object[] { MakeServiceException("InvalidParameterValueException", HttpStatusCode.BadRequest, "Some other parameter") }, + }; + + [Theory] + [MemberData(nameof(TerminalCheckpointErrorCases))] + public async Task WrapAsync_CheckpointThrowsTerminal_ReturnsFailed(AmazonServiceException ex) + { + // LambdaDurableServiceClient now wraps SDK exceptions in DurableExecutionException + // so user logs carry context (which call, which ARN). The outer message includes + // the inner SDK message; the classifier matches on the wrapper's InnerException. 
+ var input = MakeCheckpointInput(); + var mockClient = new MockLambdaClient { CheckpointThrows = ex }; + + var output = await DurableFunction.WrapAsync( + SingleStepWorkflow, input, CreateLambdaContext(), mockClient); + + Assert.Equal(InvocationStatus.Failed, output.Status); + Assert.NotNull(output.Error); + Assert.Contains(ex.Message, output.Error!.ErrorMessage); + Assert.Contains("Failed to checkpoint", output.Error.ErrorMessage); + } + + public static IEnumerable TransientCheckpointErrorCases() => new[] + { + // 5xx + new object[] { MakeServiceException("InternalServerError", HttpStatusCode.InternalServerError, "boom") }, + new object[] { MakeServiceException("ServiceUnavailable", HttpStatusCode.ServiceUnavailable, "down") }, + // 429 + new object[] { MakeServiceException("TooManyRequestsException", (HttpStatusCode)429, "throttled") }, + // No status (network / SDK-internal). HttpStatusCode default (0) → classifier treats < 400 as transient. + new object[] { MakeServiceException("RequestTimeout", 0, "timeout") }, + // Carve-out: stale checkpoint token is transient. + new object[] { MakeServiceException("InvalidParameterValueException", HttpStatusCode.BadRequest, "Invalid Checkpoint Token: stale") }, + }; + + [Theory] + [MemberData(nameof(TransientCheckpointErrorCases))] + public async Task WrapAsync_CheckpointThrowsTransient_PropagatesToHost(AmazonServiceException ex) + { + // Transient SDK errors escape the IsTerminalCheckpointError catch and propagate + // to the host as DurableExecutionException wrapping the original SDK exception + // — Lambda's normal retry semantics fire on the wrapper. The original SDK + // exception is preserved as InnerException so callers can still introspect + // the original status code / error code. 
+ var input = MakeCheckpointInput(); + var mockClient = new MockLambdaClient { CheckpointThrows = ex }; + + var thrown = await Assert.ThrowsAsync(() => + DurableFunction.WrapAsync( + SingleStepWorkflow, input, CreateLambdaContext(), mockClient)); + + Assert.Same(ex, thrown.InnerException); + } + + [Fact] + public async Task WrapAsync_HydrationThrows_AlwaysPropagatesToHost() + { + // State hydration is OUTSIDE the IsTerminalCheckpointError try/catch — every + // GetExecutionStateAsync failure escapes for Lambda retry. Use a 4xx that + // *would* be terminal if it came from a checkpoint flush to prove the path + // isn't classified. + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:hydrate-fail", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + } + }, + NextMarker = "page-1" // force the hydration loop to run + } + }; + var ex = MakeServiceException("ResourceNotFoundException", HttpStatusCode.NotFound, "ARN gone"); + var mockClient = new MockLambdaClient { GetExecutionStateThrows = ex }; + + // Hydration errors are wrapped in DurableExecutionException by + // LambdaDurableServiceClient.GetExecutionStateAsync but are NOT caught by the + // IsTerminalCheckpointError filter, so they escape to the host. 
+ var thrown = await Assert.ThrowsAsync(() => + DurableFunction.WrapAsync( + MyWorkflow, input, CreateLambdaContext(), mockClient)); + + Assert.Same(ex, thrown.InnerException); + Assert.Contains("Failed to fetch execution state", thrown.Message); + } + + private static AmazonServiceException MakeServiceException(string code, HttpStatusCode status, string message) + { + return new AmazonServiceException(message, innerException: null, ErrorType.Unknown, code, requestId: "req-1", statusCode: status); + } + + private static DurableExecutionInvocationInput MakeCheckpointInput() => new() + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:checkpoint-fail", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + } + } + } + }; + + private static async Task SingleStepWorkflow(OrderEvent input, IDurableContext context) + { + // One step succeed → forces a checkpoint flush, which the mock fails. + await context.StepAsync(async (_) => { await Task.CompletedTask; return "ok"; }, name: "s1"); + return new OrderResult { Status = "done" }; + } + + [Fact] + public async Task WrapAsync_CreateCallbackThenWait_AllocatesCallbackIdAndSuspends() + { + // End-to-end through the real LambdaDurableServiceClient: the mock + // client returns NewExecutionState carrying a CallbackId on the + // CALLBACK START checkpoint response, and the SDK plumbs it through. 
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:cb-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"OrderId\":\"o-1\"}" } + } + } + } + }; + + var capturedCallbackId = (string?)null; + var mockClient = new MockLambdaClient + { + CheckpointHandler = req => + { + // Echo back any CALLBACK START as a STARTED op with a service-allocated id. + var newOps = new List(); + foreach (var u in req.Updates) + { + if (u.Type == OperationTypes.Callback && u.Action == "START") + { + newOps.Add(new Amazon.Lambda.Model.Operation + { + Id = u.Id, + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = u.Name, + CallbackDetails = new Amazon.Lambda.Model.CallbackDetails + { + CallbackId = "servicealloccbid" + } + }); + } + } + return new Amazon.Lambda.Model.CheckpointDurableExecutionResponse + { + NewExecutionState = newOps.Count == 0 + ? null + : new Amazon.Lambda.Model.CheckpointUpdatedExecutionState { Operations = newOps } + }; + } + }; + + var output = await DurableFunction.WrapAsync( + async (e, ctx) => + { + var cb = await ctx.CreateCallbackAsync(name: "approval"); + capturedCallbackId = cb.CallbackId; + var status = await cb.GetResultAsync(); + return new OrderResult { Status = status, OrderId = e.OrderId }; + }, + input, + CreateLambdaContext(), + mockClient); + + Assert.Equal(InvocationStatus.Pending, output.Status); + Assert.Equal("servicealloccbid", capturedCallbackId); + } + + [Fact] + public async Task WrapAsync_ReplayCallbackSucceeded_ReturnsResultAfterSuspend() + { + // Second invocation: the callback's checkpoint is now SUCCEEDED; + // the workflow returns the deserialized result. 
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:cb-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"OrderId\":\"o-1\"}" } + }, + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Succeeded, + Name = "approval", + CallbackDetails = new CallbackDetails + { + CallbackId = "servicealloccbid", + Result = "\"approved\"" + } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + async (e, ctx) => + { + var cb = await ctx.CreateCallbackAsync(name: "approval"); + var status = await cb.GetResultAsync(); + return new OrderResult { Status = status, OrderId = e.OrderId }; + }, + input, + CreateLambdaContext(), + new MockLambdaClient()); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + Assert.NotNull(output.Result); + var result = JsonSerializer.Deserialize(output.Result!); + Assert.Equal("approved", result!.Status); + } + + [Fact] + public async Task WrapAsync_ReplayDeterminism_CallbackIdStableAcrossInvocations() + { + // First invocation allocates a callback ID via the mock; in a real run + // that ID would be persisted in the service's checkpoint state and + // returned to the second invocation via InitialExecutionState. Verify + // the same ID survives that round-trip (we model "round-trip" by + // replaying with a STARTED checkpoint that carries the same ID). 
+ const string id = "stablecbidreplay"; + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"OrderId\":\"o-1\"}" } + }, + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = id } + } + } + } + }; + + string? observed = null; + var output = await DurableFunction.WrapAsync( + async (e, ctx) => + { + var cb = await ctx.CreateCallbackAsync(name: "approval"); + observed = cb.CallbackId; + var status = await cb.GetResultAsync(); + return new OrderResult { Status = status, OrderId = e.OrderId }; + }, + input, + CreateLambdaContext(), + new MockLambdaClient()); + + Assert.Equal(InvocationStatus.Pending, output.Status); + Assert.Equal(id, observed); + } + + private static async Task MyWorkflow(OrderEvent input, IDurableContext context) + { + var validation = await context.StepAsync( + async (_) => { await Task.CompletedTask; return new ValidationResult { IsValid = true }; }, + name: "validate"); + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "delay"); + + return new OrderResult { Status = "approved", OrderId = input.OrderId }; + } + + private class OrderEvent + { + public string? OrderId { get; set; } + } + + private class OrderResult + { + public string? Status { get; set; } + public string? 
OrderId { get; set; } + } + + private class ValidationResult + { + public bool IsValid { get; set; } + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/EnumsTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/EnumsTests.cs new file mode 100644 index 000000000..36b9b3a70 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/EnumsTests.cs @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class EnumsTests +{ + [Fact] + public void InvocationStatus_HasExpectedValues() + { + Assert.Equal(0, (int)InvocationStatus.Succeeded); + Assert.Equal(1, (int)InvocationStatus.Failed); + Assert.Equal(2, (int)InvocationStatus.Pending); + } + + [Fact] + public void OperationTypes_HasExpectedConstants() + { + Assert.Equal("STEP", OperationTypes.Step); + Assert.Equal("WAIT", OperationTypes.Wait); + Assert.Equal("CALLBACK", OperationTypes.Callback); + Assert.Equal("CHAINED_INVOKE", OperationTypes.ChainedInvoke); + Assert.Equal("CONTEXT", OperationTypes.Context); + Assert.Equal("EXECUTION", OperationTypes.Execution); + } + + [Fact] + public void OperationStatuses_HasExpectedConstants() + { + Assert.Equal("STARTED", OperationStatuses.Started); + Assert.Equal("SUCCEEDED", OperationStatuses.Succeeded); + Assert.Equal("FAILED", OperationStatuses.Failed); + Assert.Equal("PENDING", OperationStatuses.Pending); + Assert.Equal("CANCELLED", OperationStatuses.Cancelled); + Assert.Equal("READY", OperationStatuses.Ready); + Assert.Equal("STOPPED", OperationStatuses.Stopped); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExceptionsTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExceptionsTests.cs new file mode 100644 index 000000000..f89a72cb1 --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExceptionsTests.cs @@ -0,0 +1,267 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ExceptionsTests +{ + [Fact] + public void DurableExecutionException_IsBaseException() + { + var ex = new DurableExecutionException("test error"); + Assert.IsAssignableFrom(ex); + Assert.Equal("test error", ex.Message); + } + + [Fact] + public void DurableExecutionException_WrapsInnerException() + { + var inner = new InvalidOperationException("inner"); + var ex = new DurableExecutionException("outer", inner); + Assert.Same(inner, ex.InnerException); + } + + [Fact] + public void DurableExecutionException_ParameterlessCtor() + { + var ex = new DurableExecutionException(); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public void StepException_ParameterlessCtor() + { + var ex = new StepException(); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public void StepException_MessageOnlyCtor() + { + var ex = new StepException("step blew up"); + Assert.Equal("step blew up", ex.Message); + } + + [Fact] + public void StepException_WithInnerException() + { + var inner = new InvalidOperationException("inner"); + var ex = new StepException("wrapped", inner); + Assert.Same(inner, ex.InnerException); + } + + [Fact] + public void StepException_HasErrorProperties() + { + var ex = new StepException("step failed") + { + ErrorType = "System.TimeoutException", + ErrorData = "operation timed out", + OriginalStackTrace = new[] { "at Foo.Bar()", "at Baz.Qux()" } + }; + + Assert.IsAssignableFrom(ex); + Assert.Equal("System.TimeoutException", ex.ErrorType); + Assert.Equal("operation timed out", ex.ErrorData); + Assert.Equal(2, ex.OriginalStackTrace!.Count); + } + + [Fact] + public void CallbackException_BaseClassCtors() + { + var empty = new CallbackException(); + 
Assert.IsAssignableFrom(empty); + + var withMsg = new CallbackException("cb error"); + Assert.Equal("cb error", withMsg.Message); + + var inner = new InvalidOperationException("inner"); + var wrapping = new CallbackException("outer", inner); + Assert.Same(inner, wrapping.InnerException); + } + + [Fact] + public void CallbackException_InitProperties() + { + var ex = new CallbackException("rejected") + { + CallbackId = "cb-1", + ErrorType = "ExternalSystemError", + ErrorData = "{\"reviewer\":\"jane\"}", + OriginalStackTrace = new[] { "at A.B()" } + }; + + Assert.Equal("cb-1", ex.CallbackId); + Assert.Equal("ExternalSystemError", ex.ErrorType); + Assert.Equal("{\"reviewer\":\"jane\"}", ex.ErrorData); + Assert.Single(ex.OriginalStackTrace!); + } + + [Fact] + public void CallbackFailedException_IsCallbackException() + { + var ex = new CallbackFailedException("rejected") { CallbackId = "cb-1" }; + Assert.IsAssignableFrom(ex); + Assert.IsAssignableFrom(ex); + Assert.Equal("rejected", ex.Message); + Assert.Equal("cb-1", ex.CallbackId); + } + + [Fact] + public void CallbackFailedException_AllCtors() + { + Assert.NotNull(new CallbackFailedException()); + Assert.Equal("m", new CallbackFailedException("m").Message); + var inner = new Exception("inner"); + Assert.Same(inner, new CallbackFailedException("m", inner).InnerException); + } + + [Fact] + public void CallbackTimeoutException_IsCallbackException() + { + var ex = new CallbackTimeoutException("timed out") { CallbackId = "cb-1" }; + Assert.IsAssignableFrom(ex); + Assert.Equal("timed out", ex.Message); + } + + [Fact] + public void CallbackTimeoutException_AllCtors() + { + Assert.NotNull(new CallbackTimeoutException()); + Assert.Equal("m", new CallbackTimeoutException("m").Message); + var inner = new Exception("inner"); + Assert.Same(inner, new CallbackTimeoutException("m", inner).InnerException); + } + + [Fact] + public void CallbackSubmitterException_IsCallbackException() + { + var inner = new StepException("submitter 
failed"); + var ex = new CallbackSubmitterException("submitter failed", inner); + Assert.IsAssignableFrom(ex); + Assert.Same(inner, ex.InnerException); + } + + [Fact] + public void CallbackSubmitterException_AllCtors() + { + Assert.NotNull(new CallbackSubmitterException()); + Assert.Equal("m", new CallbackSubmitterException("m").Message); + } + + #region InvokeException tree + + [Fact] + public void InvokeException_IsDurableExecutionException() + { + var ex = new InvokeException("invoke failed"); + Assert.IsAssignableFrom(ex); + Assert.Equal("invoke failed", ex.Message); + } + + [Fact] + public void InvokeException_ParameterlessCtor() + { + var ex = new InvokeException(); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public void InvokeException_WrapsInnerException() + { + var inner = new InvalidOperationException("inner"); + var ex = new InvokeException("outer", inner); + Assert.Same(inner, ex.InnerException); + } + + [Fact] + public void InvokeException_HasInvokeProperties() + { + var ex = new InvokeException("boom") + { + FunctionName = "arn:aws:lambda:us-east-1:123:function:fn:prod", + ErrorType = "System.TimeoutException", + ErrorData = "{\"detail\":\"x\"}", + OriginalStackTrace = new[] { "at A.B()" } + }; + + Assert.Equal("arn:aws:lambda:us-east-1:123:function:fn:prod", ex.FunctionName); + Assert.Equal("System.TimeoutException", ex.ErrorType); + Assert.Equal("{\"detail\":\"x\"}", ex.ErrorData); + Assert.Single(ex.OriginalStackTrace!); + } + + [Fact] + public void InvokeFailedException_IsInvokeException() + { + var ex = new InvokeFailedException("boom") { FunctionName = "fn:prod" }; + Assert.IsAssignableFrom(ex); + Assert.IsAssignableFrom(ex); + Assert.Equal("boom", ex.Message); + Assert.Equal("fn:prod", ex.FunctionName); + } + + [Fact] + public void InvokeFailedException_AllCtorOverloads() + { + var inner = new InvalidOperationException("inner"); + Assert.IsAssignableFrom(new InvokeFailedException()); + Assert.Equal("m", new 
InvokeFailedException("m").Message); + Assert.Same(inner, new InvokeFailedException("m", inner).InnerException); + } + + [Fact] + public void InvokeTimedOutException_IsInvokeException() + { + var ex = new InvokeTimedOutException("timed out"); + Assert.IsAssignableFrom(ex); + Assert.IsAssignableFrom(ex); + Assert.Equal("timed out", ex.Message); + } + + [Fact] + public void InvokeTimedOutException_AllCtorOverloads() + { + var inner = new TimeoutException("inner"); + Assert.IsAssignableFrom(new InvokeTimedOutException()); + Assert.Equal("m", new InvokeTimedOutException("m").Message); + Assert.Same(inner, new InvokeTimedOutException("m", inner).InnerException); + } + + [Fact] + public void InvokeStoppedException_IsInvokeException() + { + var ex = new InvokeStoppedException("stopped"); + Assert.IsAssignableFrom(ex); + Assert.IsAssignableFrom(ex); + Assert.Equal("stopped", ex.Message); + } + + [Fact] + public void InvokeStoppedException_AllCtorOverloads() + { + var inner = new InvalidOperationException("inner"); + Assert.IsAssignableFrom(new InvokeStoppedException()); + Assert.Equal("m", new InvokeStoppedException("m").Message); + Assert.Same(inner, new InvokeStoppedException("m", inner).InnerException); + } + + [Fact] + public void InvokeException_SubclassesCaughtByBase() + { + // Verifies the documented pattern-matching contract: catch + // (InvokeException) catches all three subclasses. 
+ Exception failed = new InvokeFailedException("fail"); + Exception timedOut = new InvokeTimedOutException("timeout"); + Exception stopped = new InvokeStoppedException("stop"); + + Assert.True(failed is InvokeException); + Assert.True(timedOut is InvokeException); + Assert.True(stopped is InvokeException); + } + + #endregion +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExecutionStateTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExecutionStateTests.cs new file mode 100644 index 000000000..a55c9912e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExecutionStateTests.cs @@ -0,0 +1,260 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ExecutionStateTests +{ + private const string ExecutionInputId = "exec-input"; + + private static Operation ExecutionInputOp(string id = ExecutionInputId) => new() + { + Id = id, + Type = OperationTypes.Execution, + Status = OperationStatuses.Started + }; + + private static Operation StepOp(string id, string status, string? 
name = null) => new() + { + Id = id, + Type = OperationTypes.Step, + Status = status, + Name = name, + StepDetails = new StepDetails { Result = "true" } + }; + + [Fact] + public void LoadFromCheckpoint_NullState_NotReplaying() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + + Assert.False(state.IsReplaying); + Assert.Equal(0, state.CheckpointedOperationCount); + } + + [Fact] + public void LoadFromCheckpoint_EmptyOperations_NotReplaying() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState { Operations = new List() }); + + Assert.False(state.IsReplaying); + Assert.Equal(0, state.CheckpointedOperationCount); + } + + [Fact] + public void LoadFromCheckpoint_OnlyExecutionInputOp_NotReplaying() + { + // The service sends one EXECUTION-type op carrying the input payload + // even on the first invocation. That op is bookkeeping, not user + // history — it must not put us into replay mode. + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List { ExecutionInputOp() } + }); + + Assert.False(state.IsReplaying); + Assert.Equal(1, state.CheckpointedOperationCount); + } + + [Fact] + public void LoadFromCheckpoint_WithReplayableOperations_IsReplaying() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + StepOp("0-fetch_user", OperationStatuses.Succeeded) + } + }); + + Assert.True(state.IsReplaying); + Assert.Equal(2, state.CheckpointedOperationCount); + } + + [Fact] + public void TrackReplay_FlipsOutOfReplay_OnceAllCompletedOpsVisited() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + StepOp("0", OperationStatuses.Succeeded), + StepOp("1", OperationStatuses.Succeeded), + } + }); + Assert.True(state.IsReplaying); + + state.TrackReplay("0"); + 
Assert.True(state.IsReplaying); // 1-of-2 completed ops visited + + state.TrackReplay("1"); + Assert.False(state.IsReplaying); // all completed ops visited → fresh + } + + [Fact] + public void TrackReplay_PendingOpDoesNotBlockTransition() + { + // A PENDING op (e.g. retry timer waiting) is not "completed" in the + // checkpoint sense — once the workflow has visited every terminally- + // completed op the SDK treats subsequent code as fresh. Terminal set + // is {SUCCEEDED, FAILED, CANCELLED, STOPPED, TIMED_OUT}. + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + StepOp("0", OperationStatuses.Succeeded), + StepOp("1", OperationStatuses.Pending), + } + }); + Assert.True(state.IsReplaying); + + state.TrackReplay("0"); + Assert.False(state.IsReplaying); + } + + [Fact] + public void TrackReplay_IsIdempotent() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + StepOp("0", OperationStatuses.Succeeded), + } + }); + + state.TrackReplay("0"); + Assert.False(state.IsReplaying); + + // Second call is a no-op. 
+ state.TrackReplay("0"); + Assert.False(state.IsReplaying); + } + + [Fact] + public void TrackReplay_NoOpWhenNotReplaying() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + Assert.False(state.IsReplaying); + + state.TrackReplay("anything"); + Assert.False(state.IsReplaying); + } + + [Fact] + public void GetOperation_ReturnsCheckpointedRecord() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + StepOp("0-validate", OperationStatuses.Succeeded) + } + }); + + var op = state.GetOperation("0-validate"); + Assert.NotNull(op); + Assert.Equal(OperationStatuses.Succeeded, op!.Status); + } + + [Fact] + public void GetOperation_ReturnsNull_WhenNotFound() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + + var op = state.GetOperation("0-nonexistent"); + Assert.Null(op); + } + + [Fact] + public void HasOperation_ReturnsTrueForExisting() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List { StepOp("0-step_a", OperationStatuses.Succeeded) } + }); + + Assert.True(state.HasOperation("0-step_a")); + Assert.False(state.HasOperation("1-step_b")); + } + + [Fact] + public void TrackReplay_TerminalSet_IncludesTimedOut() + { + // TIMED_OUT is a terminal state (matches Python/JS/Java reference SDKs). + // A timed-out chained-invoke that has been visited must allow the + // replay-mode flag to flip; otherwise IsReplaying would stay stuck on + // for the rest of the invocation and downstream replay-aware features + // (e.g., ReplayAwareLogger) would mis-fire. 
+ var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + new() + { + Id = "0-invoke", + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.TimedOut + } + } + }); + Assert.True(state.IsReplaying); + + state.TrackReplay("0-invoke"); + Assert.False(state.IsReplaying); + } + + [Fact] + public void GetOperation_ReturnsLatestRecord_WhenIdAppearsMultipleTimes() + { + // Wire format: when the service replays an envelope it includes the + // most recent record per ID. We key by ID alone and rely on the service + // to provide the authoritative record. + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "0-payment", + Type = OperationTypes.Step, + Status = OperationStatuses.Started + }, + new() + { + Id = "0-payment", + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "\"paid\"" } + } + } + }); + + var op = state.GetOperation("0-payment"); + Assert.NotNull(op); + Assert.Equal(OperationStatuses.Succeeded, op!.Status); + Assert.Equal("\"paid\"", op.StepDetails?.Result); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Internal/LambdaCoreLoggerTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Internal/LambdaCoreLoggerTests.cs new file mode 100644 index 000000000..0bd7b50d6 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Internal/LambdaCoreLoggerTests.cs @@ -0,0 +1,268 @@ +using System.Reflection; +using Amazon.Lambda.DurableExecution.Internal; +using Microsoft.Extensions.Logging; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests.Internal; + +/// +/// Asserts that LambdaCoreLogger preserves the original message template and +/// named placeholder arguments when forwarding to Amazon.Lambda.Core.LambdaLogger. 
+/// This is the contract that lets the Lambda runtime's JSON formatter emit +/// {OrderId}-style fields as top-level structured attributes. +/// +public class LambdaCoreLoggerTests : IDisposable +{ + private readonly Action? _originalLevelAction; + private readonly Action? _originalLevelAndExAction; + // The capturing delegates are invoked from concurrent tasks in the AsyncLocal + // test — guard list mutation with a lock. + private readonly object _captureLock = new(); + private readonly List<(string Level, string Template, object[] Args, Exception? Exception)> _captured = new(); + + public LambdaCoreLoggerTests() + { + _originalLevelAction = SwapLevelAction((level, template, args) => + { + lock (_captureLock) _captured.Add((level, template, args, null)); + }); + + _originalLevelAndExAction = SwapLevelAndExceptionAction((level, ex, template, args) => + { + lock (_captureLock) _captured.Add((level, template, args, ex)); + }); + } + + public void Dispose() + { + if (_originalLevelAction != null) SwapLevelAction(_originalLevelAction); + if (_originalLevelAndExAction != null) SwapLevelAndExceptionAction(_originalLevelAndExAction); + } + + [Fact] + public void Log_NamedPlaceholders_ForwardsTemplateAndArgs() + { + var logger = new LambdaCoreLogger(); + + logger.LogInformation("User {OrderId} bought {Count}", "abc-123", 7); + + var entry = Assert.Single(_captured); + Assert.Equal("Information", entry.Level); + Assert.Equal("User {OrderId} bought {Count}", entry.Template); + Assert.Equal(new object[] { "abc-123", 7 }, entry.Args); + Assert.Null(entry.Exception); + } + + [Fact] + public void Log_NamedPlaceholdersWithException_ForwardsTemplateAndArgs() + { + var logger = new LambdaCoreLogger(); + var ex = new InvalidOperationException("boom"); + + logger.LogError(ex, "Failed for {OrderId}", "abc-123"); + + var entry = Assert.Single(_captured); + Assert.Equal("Error", entry.Level); + Assert.Equal("Failed for {OrderId}", entry.Template); + Assert.Equal(new object[] { 
"abc-123" }, entry.Args); + Assert.Same(ex, entry.Exception); + } + + [Fact] + public void Log_PlainMessage_ForwardsAsLiteralWithEmptyArgs() + { + var logger = new LambdaCoreLogger(); + + logger.LogWarning("nothing structured here"); + + var entry = Assert.Single(_captured); + Assert.Equal("Warning", entry.Level); + Assert.Equal("nothing structured here", entry.Template); + Assert.Empty(entry.Args); + } + + [Fact] + public void Log_NonKvpState_FallsBackToFormatter() + { + var logger = new LambdaCoreLogger(); + + // Direct ILogger.Log call with a custom TState that is NOT + // FormattedLogValues. The formatter must be used to render the message. + ((ILogger)logger).Log( + LogLevel.Information, + new EventId(0), + state: 42, + exception: null, + formatter: (s, _) => $"value={s}"); + + var entry = Assert.Single(_captured); + Assert.Equal("Information", entry.Level); + Assert.Equal("value=42", entry.Template); + Assert.Empty(entry.Args); + } + + [Fact] + public void IsEnabled_None_ReturnsFalse() + { + var logger = new LambdaCoreLogger(); + Assert.False(logger.IsEnabled(LogLevel.None)); + Assert.True(logger.IsEnabled(LogLevel.Trace)); + } + + [Fact] + public void Log_WithKvpScope_AppendsScopeKeysToTemplateAndArgs() + { + var logger = new LambdaCoreLogger(); + + using (logger.BeginScope(new Dictionary + { + ["operationId"] = "op-1", + ["attempt"] = 2, + })) + { + logger.LogInformation("step done {Result}", "ok"); + } + + var entry = Assert.Single(_captured); + // The template's own placeholders come first; scope keys are appended. 
+ Assert.Equal("step done {Result} {operationId} {attempt}", entry.Template); + Assert.Equal(new object[] { "ok", "op-1", 2 }, entry.Args); + } + + [Fact] + public void Log_WithNestedKvpScopes_InnerWinsAndOrderInnerToOuter() + { + var logger = new LambdaCoreLogger(); + + using (logger.BeginScope(new Dictionary + { + ["durableExecutionArn"] = "arn-outer", + ["awsRequestId"] = "req-1", + })) + using (logger.BeginScope(new Dictionary + { + ["operationId"] = "op-1", + ["awsRequestId"] = "req-INNER-WINS", // overrides outer + })) + { + logger.LogInformation("hello {Name}", "world"); + } + + var entry = Assert.Single(_captured); + // Inner scope keys appear before outer; the inner awsRequestId wins. + Assert.Equal( + "hello {Name} {operationId} {awsRequestId} {durableExecutionArn}", + entry.Template); + Assert.Equal( + new object[] { "world", "op-1", "req-INNER-WINS", "arn-outer" }, + entry.Args); + } + + [Fact] + public void Log_MessageArgWinsOverScopeKeyWithSameName() + { + var logger = new LambdaCoreLogger(); + + using (logger.BeginScope(new Dictionary + { + ["OrderId"] = "from-scope", + })) + { + logger.LogInformation("processing {OrderId}", "from-message"); + } + + var entry = Assert.Single(_captured); + // Scope key OrderId is dropped because the explicit message arg already + // claimed it; the runtime formatter sees only the explicit value. 
+ Assert.Equal("processing {OrderId}", entry.Template); + Assert.Equal(new object[] { "from-message" }, entry.Args); + } + + [Fact] + public void BeginScope_PopsOnDispose_NoLeakAcrossLogCalls() + { + var logger = new LambdaCoreLogger(); + + using (logger.BeginScope(new Dictionary { ["scoped"] = "yes" })) + { + logger.LogInformation("inside"); + } + logger.LogInformation("outside"); + + Assert.Equal(2, _captured.Count); + Assert.Equal("inside {scoped}", _captured[0].Template); + Assert.Equal(new object[] { "yes" }, _captured[0].Args); + // After the using-block, the scope is popped; the second log carries no + // appended scope keys. + Assert.Equal("outside", _captured[1].Template); + Assert.Empty(_captured[1].Args); + } + + [Fact] + public async Task BeginScope_IsAsyncLocal_DoesNotLeakAcrossTasks() + { + var logger = new LambdaCoreLogger(); + var sibling1Captured = new List<(string Template, object[] Args)>(); + var sibling2Captured = new List<(string Template, object[] Args)>(); + + var inflight = new TaskCompletionSource(); + + async Task Sibling(string id, List<(string, object[])> sink) + { + using (logger.BeginScope(new Dictionary { ["taskId"] = id })) + { + // Yield to give the other task a chance to run with its own scope. + await Task.Yield(); + logger.LogInformation("emit"); + inflight.TrySetResult(); + } + } + + // NOTE(review): the per-task sink lists are never written to and `inflight` is never awaited. + // We just inspect the shared _captured list, since the AsyncLocal scope chain is + // what we care about — order doesn't matter. 
+ await Task.WhenAll(Sibling("A", sibling1Captured), Sibling("B", sibling2Captured)); + + Assert.Equal(2, _captured.Count); + var taskIds = _captured.Select(c => c.Args.Single()).OrderBy(v => v).ToArray(); + Assert.Equal(new object[] { "A", "B" }, taskIds); + Assert.All(_captured, c => Assert.Equal("emit {taskId}", c.Template)); + } + + [Fact] + public void BeginScope_NonKvpScope_Ignored() + { + var logger = new LambdaCoreLogger(); + + using (logger.BeginScope("just-a-string")) + { + logger.LogInformation("hello"); + } + + var entry = Assert.Single(_captured); + // String scopes don't carry keys; nothing to append. + Assert.Equal("hello", entry.Template); + Assert.Empty(entry.Args); + } + + private static Action? SwapLevelAction(Action replacement) + { + var field = typeof(Amazon.Lambda.Core.LambdaLogger).GetField( + "_loggingWithLevelAction", + BindingFlags.NonPublic | BindingFlags.Static)!; + var original = (Action?)field.GetValue(null); + field.SetValue(null, replacement); + return original; + } + + private static Action? 
SwapLevelAndExceptionAction( + Action replacement) + { + var field = typeof(Amazon.Lambda.Core.LambdaLogger).GetField( + "_loggingWithLevelAndExceptionAction", + BindingFlags.NonPublic | BindingFlags.Static)!; + var original = (Action?)field.GetValue(null); + field.SetValue(null, replacement); + return original; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Internal/ReplayAwareLoggerTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Internal/ReplayAwareLoggerTests.cs new file mode 100644 index 000000000..dc92ccf79 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Internal/ReplayAwareLoggerTests.cs @@ -0,0 +1,153 @@ +using Amazon.Lambda.DurableExecution.Internal; +using Microsoft.Extensions.Logging; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests.Internal; + +public class ReplayAwareLoggerTests +{ + private const string SeedOpId = "seed"; + + private static ExecutionState ReplayState() + { + // Seed one completed user-replayable op so IsReplaying starts true. + // The op is NOT yet visited via TrackReplay, so we stay in replay. 
+ var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() { Id = SeedOpId, Type = OperationTypes.Step, Status = OperationStatuses.Succeeded } + } + }); + Assert.True(state.IsReplaying); + return state; + } + + [Fact] + public void Log_DuringReplay_Suppressed() + { + var inner = new RecordingLogger(); + var logger = new ReplayAwareLogger(inner, ReplayState(), modeAware: true); + + logger.LogInformation("hello"); + + Assert.Empty(inner.Records); + } + + [Fact] + public void Log_DuringExecution_Passthrough() + { + var state = ReplayState(); + state.TrackReplay(SeedOpId); + var inner = new RecordingLogger(); + var logger = new ReplayAwareLogger(inner, state, modeAware: true); + + logger.LogInformation("hello"); + + Assert.Single(inner.Records); + Assert.Equal(LogLevel.Information, inner.Records[0].Level); + } + + [Fact] + public void Log_ModeAwareFalse_AlwaysLogs() + { + var inner = new RecordingLogger(); + var logger = new ReplayAwareLogger(inner, ReplayState(), modeAware: false); + + logger.LogWarning("still here"); + + Assert.Single(inner.Records); + } + + [Fact] + public void IsEnabled_DuringReplay_ReturnsFalse() + { + var inner = new RecordingLogger { ForcedEnabled = true }; + var logger = new ReplayAwareLogger(inner, ReplayState(), modeAware: true); + + Assert.False(logger.IsEnabled(LogLevel.Information)); + } + + [Fact] + public void IsEnabled_DuringExecution_DelegatesToInner() + { + var state = ReplayState(); + state.TrackReplay(SeedOpId); + var inner = new RecordingLogger { ForcedEnabled = false }; + var logger = new ReplayAwareLogger(inner, state, modeAware: true); + + Assert.False(logger.IsEnabled(LogLevel.Information)); + + inner.ForcedEnabled = true; + Assert.True(logger.IsEnabled(LogLevel.Information)); + } + + [Fact] + public void BeginScope_AlwaysDelegates() + { + var inner = new RecordingLogger(); + var logger = new ReplayAwareLogger(inner, ReplayState(), modeAware: true); + + // 
Even during replay, scopes must pass through to keep the scope stack + // balanced. + using (logger.BeginScope("scope-during-replay")) + { + Assert.Equal(1, inner.OpenScopes); + } + Assert.Equal(0, inner.OpenScopes); + } + + [Fact] + public void Log_TransitionsFromReplayToExecution() + { + // Mirror Python's test_logger_replay_then_new_logging: while the state + // is replaying the logger drops messages, but the moment TrackReplay + // visits the last checkpointed op IsReplaying flips and the next log + // line lands. + var state = ReplayState(); + var inner = new RecordingLogger(); + var logger = new ReplayAwareLogger(inner, state, modeAware: true); + + logger.LogInformation("during replay"); + Assert.Empty(inner.Records); + + state.TrackReplay(SeedOpId); + logger.LogInformation("after transition"); + + Assert.Single(inner.Records); + Assert.Contains("after transition", inner.Records[0].Message); + } +} + +internal sealed class RecordingLogger : ILogger +{ + public List<(LogLevel Level, string Message)> Records { get; } = new(); + public int OpenScopes { get; private set; } + public bool ForcedEnabled { get; set; } = true; + + public IDisposable BeginScope(TState state) where TState : notnull + { + OpenScopes++; + return new ScopeToken(this); + } + + public bool IsEnabled(LogLevel logLevel) => ForcedEnabled; + + public void Log( + LogLevel logLevel, + EventId eventId, + TState state, + Exception? 
exception, + Func formatter) + { + Records.Add((logLevel, formatter(state, exception))); + } + + private sealed class ScopeToken : IDisposable + { + private readonly RecordingLogger _owner; + public ScopeToken(RecordingLogger owner) => _owner = owner; + public void Dispose() => _owner.OpenScopes--; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs new file mode 100644 index 000000000..eb8b7a757 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs @@ -0,0 +1,577 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class InvokeOperationTests +{ + /// Reproduces the Id that OperationIdGenerator emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + private const string FunctionArn = "arn:aws:lambda:us-east-1:123456789012:function:downstream:prod"; + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. 
+ var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + #region Argument validation + + [Fact] + public async Task InvokeAsync_NullFunctionName_ThrowsArgumentNullException() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(functionName: null!, payload: "x")); + } + + [Fact] + public async Task InvokeAsync_EmptyFunctionName_ThrowsArgumentException() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(functionName: "", payload: "x")); + } + + [Fact] + public async Task InvokeAsync_WhitespaceFunctionName_ThrowsArgumentException() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(functionName: " ", payload: "x")); + } + + [Fact] + public async Task InvokeAsync_PreservesUnqualifiedArn_AndPassesItThrough() + { + // The SDK does NOT regex-validate qualified ARNs. The service enforces + // that rule. We verify the value is propagated unmodified to the + // ChainedInvokeOptions.FunctionName so that service-side rejection + // surfaces with the user's exact input. 
+ var (context, recorder, tm, _) = CreateContext(); + + var task = context.InvokeAsync( + "arn:aws:lambda:us-east-1:123456789012:function:no-version", + payload: "x", + name: "noversion"); + + await Task.Delay(20); + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + var start = recorder.Flushed.Single(o => o.Action == "START"); + Assert.Equal("arn:aws:lambda:us-east-1:123456789012:function:no-version", + start.ChainedInvokeOptions.FunctionName); + } + + #endregion + + #region Fresh execution + + [Fact] + public async Task InvokeAsync_FreshExecution_CheckpointsStartAndSuspends() + { + var (context, recorder, tm, _) = CreateContext(); + + var task = context.InvokeAsync( + FunctionArn, + new RequestPayload { Amount = 100, Currency = "USD" }, + name: "process_payment", + config: new InvokeConfig { TenantId = "tenant-A" }); + + // Service-side suspend mechanics: TerminationManager fires before the + // user task completes; the task itself never resolves on the fresh path. + await Task.Delay(20); + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + await recorder.Batcher.DrainAsync(); + + var start = recorder.Flushed.Single(); + Assert.Equal("CHAINED_INVOKE", start.Type); + Assert.Equal("START", start.Action); + Assert.Equal("ChainedInvoke", start.SubType); + Assert.Equal("process_payment", start.Name); + Assert.Equal(IdAt(1), start.Id); + + // Payload is JSON-serialized via the registered ILambdaSerializer. + Assert.Contains("\"Amount\":100", start.Payload); + Assert.Contains("\"Currency\":\"USD\"", start.Payload); + + // ChainedInvokeOptions carries function name + tenant id. 
+ Assert.NotNull(start.ChainedInvokeOptions); + Assert.Equal(FunctionArn, start.ChainedInvokeOptions.FunctionName); + Assert.Equal("tenant-A", start.ChainedInvokeOptions.TenantId); + } + + [Fact] + public async Task InvokeAsync_FreshExecution_NoTenantId_OmitsTenantId() + { + var (context, recorder, tm, _) = CreateContext(); + + var task = context.InvokeAsync(FunctionArn, "payload", name: "no_tenant"); + + await Task.Delay(20); + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + await recorder.Batcher.DrainAsync(); + + var start = recorder.Flushed.Single(); + Assert.NotNull(start.ChainedInvokeOptions); + Assert.Equal(FunctionArn, start.ChainedInvokeOptions.FunctionName); + // null tenant means the SDK didn't set the field; the AWS SDK model's + // IsSet property is what callers actually inspect, but the easy + // deterministic assertion is that the property is null. + Assert.Null(start.ChainedInvokeOptions.TenantId); + } + + [Fact] + public async Task InvokeAsync_FreshExecution_StartIsSyncFlushed() + { + // Critical correctness invariant: START must be flushed BEFORE we + // suspend. A queued-but-unflushed START is "the service doesn't know + // about the chained invocation," so the parent suspends forever. + var (context, recorder, tm, _) = CreateContext(); + + var task = context.InvokeAsync(FunctionArn, "x", name: "sync_flush"); + await Task.Delay(20); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + // No DrainAsync — the START must already be flushed at the moment + // suspension is signaled. This mirrors WaitOperation_NewExecution_SignalsTermination's + // contract: TerminationManager firing implies the matching START is durable. 
+ Assert.Single(recorder.Flushed); + Assert.Equal("START", recorder.Flushed[0].Action); + } + + [Fact] + public async Task InvokeAsync_TerminationReason_IsInvokePending() + { + var (context, _, tm, _) = CreateContext(); + + _ = context.InvokeAsync(FunctionArn, "x", name: "reason_check"); + var termination = await tm.TerminationTask; + + Assert.Equal(TerminationReason.InvokePending, termination.Reason); + } + + [Fact] + public async Task InvokeAsync_NoSerializerRegistered_ThrowsInvalidOperationException() + { + // If the user constructs a Lambda runtime without a serializer (or in + // tests, neglects to set TestLambdaContext.Serializer), InvokeAsync + // surfaces a helpful error rather than NREing inside InvokeOperation. + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = new TestLambdaContext(); // no serializer! + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "no_serializer")); + } + + #endregion + + #region Replay — terminal status mapping + + [Fact] + public async Task InvokeAsync_ReplaySucceeded_ReturnsCachedResultWithoutRescheduling() + { + var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Succeeded, + Name = "cached", + ChainedInvokeDetails = new ChainedInvokeDetails + { + Result = "{\"OrderId\":\"abc\",\"Total\":42}" + } + } + } + }); + + var result = await context.InvokeAsync( + FunctionArn, "x", name: "cached"); + + Assert.False(tm.IsTerminated); + Assert.Equal("abc", result.OrderId); + Assert.Equal(42, result.Total); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async 
Task InvokeAsync_ReplayFailed_ThrowsInvokeFailedException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Failed, + Name = "boom", + ChainedInvokeDetails = new ChainedInvokeDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "downstream exploded", + ErrorData = "{\"detail\":\"x\"}", + StackTrace = new[] { "at A.B()", "at C.D()" } + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "boom")); + + Assert.Equal("downstream exploded", ex.Message); + Assert.Equal(FunctionArn, ex.FunctionName); + Assert.Equal("System.InvalidOperationException", ex.ErrorType); + Assert.Equal("{\"detail\":\"x\"}", ex.ErrorData); + Assert.NotNull(ex.OriginalStackTrace); + Assert.Equal(2, ex.OriginalStackTrace!.Count); + + // Subclass relationship — `catch (InvokeException)` catches all three. 
+ Assert.IsAssignableFrom(ex); + } + + [Fact] + public async Task InvokeAsync_ReplayTimedOut_ThrowsInvokeTimedOutException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.TimedOut, + Name = "slow", + ChainedInvokeDetails = new ChainedInvokeDetails + { + Error = new ErrorObject + { + ErrorMessage = "execution timed out after 60s" + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "slow")); + + Assert.Equal("execution timed out after 60s", ex.Message); + Assert.Equal(FunctionArn, ex.FunctionName); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public async Task InvokeAsync_ReplayStopped_ThrowsInvokeStoppedException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Stopped, + Name = "stopped" + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "stopped")); + + // No recorded ErrorMessage → fallback default. + Assert.Equal("Chained invoke was stopped.", ex.Message); + Assert.Equal(FunctionArn, ex.FunctionName); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public async Task InvokeAsync_ReplayStarted_ResuspendsWithoutRecheckpoint() + { + // Service hasn't reached terminal yet. The original START is still + // authoritative; do not re-emit, just suspend. 
+ var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Started, + Name = "still_running" + } + } + }); + + var task = context.InvokeAsync(FunctionArn, "x", name: "still_running"); + await Task.Delay(20); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + // Crucially: no checkpoint was emitted. Original START is authoritative. + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task InvokeAsync_ReplayPending_ResuspendsWithoutRecheckpoint() + { + var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Pending, + Name = "pending" + } + } + }); + + var task = context.InvokeAsync(FunctionArn, "x", name: "pending"); + await Task.Delay(20); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task InvokeAsync_ReplayUnknownStatus_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = "TOTALLY_BOGUS", + Name = "mystery" + } + } + }); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "mystery")); + } + + [Fact] + public async Task InvokeAsync_ReplayTypeMismatch_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, // wrong type + Status = OperationStatuses.Succeeded, + Name = "kept_consistent", + StepDetails = new StepDetails { Result = "\"x\"" } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: 
"kept_consistent")); + + Assert.Contains("expected type 'CHAINED_INVOKE'", ex.Message); + Assert.Contains("found 'STEP'", ex.Message); + } + + #endregion + + #region Serialization + + [Fact] + public async Task InvokeAsync_DeserializesResultViaRegisteredSerializer() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Succeeded, + Name = "json_result", + ChainedInvokeDetails = new ChainedInvokeDetails + { + Result = "{\"OrderId\":\"o-7\",\"Total\":1024}" + } + } + } + }); + + var result = await context.InvokeAsync( + FunctionArn, + new RequestPayload { Amount = 1, Currency = "USD" }, + name: "json_result"); + + Assert.Equal("o-7", result.OrderId); + Assert.Equal(1024, result.Total); + } + + #endregion + + #region End-to-end suspension / resume parity + + [Fact] + public async Task EndToEnd_StepInvokeStep_FirstInvocation_SuspendsOnInvoke() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 + var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var batcher = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, batcher.Batcher); + + var result = await DurableExecutionHandler.RunAsync( + state, tm, + async () => + { + await context.StepAsync(async (_) => { await Task.CompletedTask; return "validated"; }, name: "validate"); + var paymentId = await context.InvokeAsync( + FunctionArn, "validated", name: "process_payment"); + return await context.StepAsync(async (_) => { await Task.CompletedTask; return paymentId + "-done"; }, name: "finalize"); + }); + + Assert.Equal(InvocationStatus.Pending, result.Status); + + await batcher.Batcher.DrainAsync(); + 
Assert.Contains(batcher.Flushed, o => o.Type == "CHAINED_INVOKE" && o.Action == "START"); + Assert.DoesNotContain(batcher.Flushed, o => o.Type == "STEP" && o.Name == "finalize"); + } + + [Fact] + public async Task EndToEnd_StepInvokeStep_SecondInvocation_ResumesAndCompletes() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "validate", + StepDetails = new StepDetails { Result = "\"validated\"" } + }, + new() + { + Id = IdAt(2), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Succeeded, + Name = "process_payment", + ChainedInvokeDetails = new ChainedInvokeDetails { Result = "\"pmt-42\"" } + } + } + }); + + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 + var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + var finalizeRan = false; + + var result = await DurableExecutionHandler.RunAsync( + state, tm, + async () => + { + var validated = await context.StepAsync(async (_) => { await Task.CompletedTask; return "fresh-validated"; }, name: "validate"); + Assert.Equal("validated", validated); // cached + + var paymentId = await context.InvokeAsync( + FunctionArn, validated, name: "process_payment"); + Assert.Equal("pmt-42", paymentId); // cached + + return await context.StepAsync(async (_) => + { + finalizeRan = true; + await Task.CompletedTask; + return paymentId + "-done"; + }, name: "finalize"); + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal("pmt-42-done", result.Result); + Assert.True(finalizeRan); + } + + #endregion + + #region Test-only types + + private class RequestPayload + { + public int Amount { get; 
set; } + public string? Currency { get; set; } + } + + private class ResponsePayload + { + public string? OrderId { get; set; } + public long Total { get; set; } + } + + #endregion +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs new file mode 100644 index 000000000..ab649f150 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs @@ -0,0 +1,407 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Services; +using Amazon.Lambda.Model; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class LambdaDurableServiceClientTests +{ + [Fact] + public async Task CheckpointAsync_EmptyOperations_NoApiCallReturnsToken() + { + var mockClient = new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + var token = await client.CheckpointAsync( + "arn:aws:lambda:us-east-1:123:durable-execution:e1", + "input-token", + Array.Empty()); + + Assert.Equal("input-token", token); + Assert.Empty(mockClient.CheckpointCalls); + } + + [Fact] + public async Task CheckpointAsync_NullCheckpointToken_SendsEmptyString() + { + var mockClient = new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + await client.CheckpointAsync( + "arn:aws:lambda:us-east-1:123:durable-execution:e1", + checkpointToken: null, + new[] + { + new OperationUpdate + { + Id = "0-step", + Type = "STEP", + Action = "SUCCEED", + SubType = "Step", + Name = "do_thing", + Payload = "\"ok\"" + } + }); + + var call = Assert.Single(mockClient.CheckpointCalls); + Assert.Equal("", call.CheckpointToken); + } + + [Fact] + public async Task CheckpointAsync_StepWithError_PropagatesError() + { + var mockClient 
= new MockLambdaClient();
+        var client = new LambdaDurableServiceClient(mockClient);
+
+        await client.CheckpointAsync(
+            "arn:aws:lambda:us-east-1:123:durable-execution:e1",
+            "tok",
+            new[]
+            {
+                new OperationUpdate
+                {
+                    Id = "0-bad",
+                    Type = "STEP",
+                    Action = "FAIL",
+                    SubType = "Step",
+                    Name = "bad",
+                    Error = new SdkErrorObject
+                    {
+                        ErrorType = "System.TimeoutException",
+                        ErrorMessage = "timed out",
+                        ErrorData = "{\"detail\":\"x\"}",
+                        StackTrace = new List { "at A.B()", "at C.D()" }
+                    }
+                }
+            });
+
+        var call = Assert.Single(mockClient.CheckpointCalls);
+        var update = Assert.Single(call.Updates);
+        Assert.Equal("STEP", update.Type);
+        Assert.Equal("FAIL", update.Action);
+        Assert.NotNull(update.Error);
+        Assert.Equal("System.TimeoutException", update.Error.ErrorType);
+        Assert.Equal("timed out", update.Error.ErrorMessage);
+        Assert.Equal("{\"detail\":\"x\"}", update.Error.ErrorData);
+        Assert.Equal(2, update.Error.StackTrace.Count);
+    }
+
+    [Fact]
+    public async Task CheckpointAsync_WaitWithOptions_PropagatesWaitOptions()
+    {
+        var mockClient = new MockLambdaClient();
+        var client = new LambdaDurableServiceClient(mockClient);
+
+        await client.CheckpointAsync(
+            "arn",
+            "tok",
+            new[]
+            {
+                new OperationUpdate
+                {
+                    Id = "0-wait",
+                    Type = "WAIT",
+                    Action = "START",
+                    SubType = "Wait",
+                    Name = "delay",
+                    WaitOptions = new WaitOptions { WaitSeconds = 45 }
+                }
+            });
+
+        var update = Assert.Single(Assert.Single(mockClient.CheckpointCalls).Updates); // exactly one call carrying one update
+        Assert.NotNull(update.WaitOptions);
+        Assert.Equal(45, update.WaitOptions.WaitSeconds);
+    }
+
+    [Fact]
+    public async Task CheckpointAsync_ParentIdAndPayload_ArePropagated()
+    {
+        var mockClient = new MockLambdaClient();
+        var client = new LambdaDurableServiceClient(mockClient);
+
+        await client.CheckpointAsync(
+            "arn",
+            "tok",
+            new[]
+            {
+                new OperationUpdate
+                {
+                    Id = "child-1",
+                    ParentId = "parent-0",
+                    Type = "STEP",
+                    Action = "SUCCEED",
+                    SubType = "Step",
+                    Payload = "{\"a\":1}"
+                }
+            });
+
+        var update = Assert.Single(Assert.Single(mockClient.CheckpointCalls).Updates); // exactly one call carrying one update
+        Assert.Equal("parent-0", update.ParentId);
+        Assert.Equal("{\"a\":1}", update.Payload);
+    }
+
+    [Fact]
+    public async Task CheckpointAsync_MultipleUpdates_AllForwarded()
+    {
+        var mockClient = new MockLambdaClient();
+        var client = new LambdaDurableServiceClient(mockClient);
+
+        await client.CheckpointAsync(
+            "arn",
+            "tok",
+            new[]
+            {
+                new OperationUpdate
+                {
+                    Id = "0-step",
+                    Type = "STEP",
+                    Action = "SUCCEED",
+                    SubType = "Step",
+                    Name = "validate"
+                },
+                new OperationUpdate
+                {
+                    Id = "1-wait",
+                    Type = "WAIT",
+                    Action = "START",
+                    SubType = "Wait",
+                    Name = "delay",
+                    WaitOptions = new WaitOptions { WaitSeconds = 30 }
+                }
+            });
+
+        var call = Assert.Single(mockClient.CheckpointCalls);
+        Assert.Equal(2, call.Updates.Count);
+        Assert.Equal("STEP", call.Updates[0].Type);
+        Assert.Equal("WAIT", call.Updates[1].Type);
+    }
+
+    [Fact]
+    public async Task GetExecutionStateAsync_CopiesContextDetailsResultAndError()
+    {
+        var mockClient = new MockLambdaClient
+        {
+            GetExecutionStateHandler = _ => new GetDurableExecutionStateResponse
+            {
+                Operations = new List
+                {
+                    new Amazon.Lambda.Model.Operation
+                    {
+                        Id = "ctx-1",
+                        Type = "CONTEXT",
+                        Status = "SUCCEEDED",
+                        Name = "phase",
+                        ContextDetails = new Amazon.Lambda.Model.ContextDetails
+                        {
+                            Result = "\"ok\""
+                        }
+                    },
+                    new Amazon.Lambda.Model.Operation
+                    {
+                        Id = "ctx-2",
+                        Type = "CONTEXT",
+                        Status = "FAILED",
+                        Name = "phase2",
+                        ContextDetails = new Amazon.Lambda.Model.ContextDetails
+                        {
+                            Error = new SdkErrorObject
+                            {
+                                ErrorType = "System.InvalidOperationException",
+                                ErrorMessage = "boom",
+                                ErrorData = "{\"detail\":\"x\"}",
+                                StackTrace = new List { "at A.B()", "at C.D()" }
+                            }
+                        }
+                    }
+                }
+            }
+        };
+        var client = new LambdaDurableServiceClient(mockClient);
+
+        var (operations, _) = await client.GetExecutionStateAsync("arn", "tok", "marker");
+
+        Assert.Equal(2, operations.Count);
+
+        Assert.NotNull(operations[0].ContextDetails);
+        Assert.Equal("\"ok\"",
operations[0].ContextDetails!.Result); + Assert.Null(operations[0].ContextDetails!.Error); + + Assert.NotNull(operations[1].ContextDetails); + Assert.NotNull(operations[1].ContextDetails!.Error); + Assert.Equal("System.InvalidOperationException", operations[1].ContextDetails!.Error!.ErrorType); + Assert.Equal("boom", operations[1].ContextDetails!.Error!.ErrorMessage); + Assert.Equal("{\"detail\":\"x\"}", operations[1].ContextDetails!.Error!.ErrorData); + Assert.Equal(new[] { "at A.B()", "at C.D()" }, operations[1].ContextDetails!.Error!.StackTrace); + } + + [Fact] + public async Task GetExecutionStateAsync_CopiesStepDetailsErrorStackTraceAndErrorData() + { + // Round-trip safety: the SDK returns ErrorObject with all four fields, + // and Internal.Operation must preserve them so StepException can surface + // OriginalStackTrace / ErrorData on replay. + var mockClient = new MockLambdaClient + { + GetExecutionStateHandler = _ => new GetDurableExecutionStateResponse + { + Operations = new List + { + new Amazon.Lambda.Model.Operation + { + Id = "step-1", + Type = "STEP", + Status = "FAILED", + Name = "charge", + StepDetails = new Amazon.Lambda.Model.StepDetails + { + Error = new SdkErrorObject + { + ErrorType = "System.TimeoutException", + ErrorMessage = "timed out", + ErrorData = "{\"detail\":\"y\"}", + StackTrace = new List { "at E.F()", "at G.H()" } + } + } + } + } + } + }; + var client = new LambdaDurableServiceClient(mockClient); + + var (operations, _) = await client.GetExecutionStateAsync("arn", "tok", "marker"); + + var op = Assert.Single(operations); + Assert.NotNull(op.StepDetails); + Assert.NotNull(op.StepDetails!.Error); + Assert.Equal("System.TimeoutException", op.StepDetails!.Error!.ErrorType); + Assert.Equal("timed out", op.StepDetails!.Error!.ErrorMessage); + Assert.Equal("{\"detail\":\"y\"}", op.StepDetails!.Error!.ErrorData); + Assert.Equal(new[] { "at E.F()", "at G.H()" }, op.StepDetails!.Error!.StackTrace); + } + + [Fact] + public async Task 
GetExecutionStateAsync_MapFromSdkOperation_RoundTripsAllErrorFields() + { + // Pre-existing bug guard: MapFromSdkOperation used to drop ErrorData + // and StackTrace from the SDK error object, so the durable exception + // builders (StepException, ChildContextException, and the + // InvokeException tree) always saw nulls for those fields on + // real-service replay. This test pins down the fix for all three + // operation types that carry an error. + var stack = new List { "at Frame.One()", "at Frame.Two()" }; + + var mockClient = new MockLambdaClient + { + GetExecutionStateHandler = _ => new GetDurableExecutionStateResponse + { + Operations = new List + { + new Amazon.Lambda.Model.Operation + { + Id = "step-1", + Type = "STEP", + Status = "FAILED", + StepDetails = new Amazon.Lambda.Model.StepDetails + { + Error = new SdkErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "step blew up", + ErrorData = "{\"detail\":\"step\"}", + StackTrace = stack + } + } + }, + new Amazon.Lambda.Model.Operation + { + Id = "ctx-1", + Type = "CONTEXT", + Status = "FAILED", + ContextDetails = new Amazon.Lambda.Model.ContextDetails + { + Error = new SdkErrorObject + { + ErrorType = "System.ArgumentException", + ErrorMessage = "ctx blew up", + ErrorData = "{\"detail\":\"ctx\"}", + StackTrace = stack + } + } + }, + new Amazon.Lambda.Model.Operation + { + Id = "inv-1", + Type = "CHAINED_INVOKE", + Status = "FAILED", + ChainedInvokeDetails = new Amazon.Lambda.Model.ChainedInvokeDetails + { + Error = new SdkErrorObject + { + ErrorType = "System.TimeoutException", + ErrorMessage = "invoke blew up", + ErrorData = "{\"detail\":\"invoke\"}", + StackTrace = stack + } + } + } + } + } + }; + var client = new LambdaDurableServiceClient(mockClient); + + var (operations, _) = await client.GetExecutionStateAsync("arn", "tok", "marker"); + + Assert.Equal(3, operations.Count); + + // STEP — all four fields propagate. 
+ var stepError = operations[0].StepDetails!.Error!; + Assert.Equal("System.InvalidOperationException", stepError.ErrorType); + Assert.Equal("step blew up", stepError.ErrorMessage); + Assert.Equal("{\"detail\":\"step\"}", stepError.ErrorData); + Assert.NotNull(stepError.StackTrace); + Assert.Equal(new[] { "at Frame.One()", "at Frame.Two()" }, stepError.StackTrace!); + + // CHILD CONTEXT — all four fields propagate. + var ctxError = operations[1].ContextDetails!.Error!; + Assert.Equal("System.ArgumentException", ctxError.ErrorType); + Assert.Equal("ctx blew up", ctxError.ErrorMessage); + Assert.Equal("{\"detail\":\"ctx\"}", ctxError.ErrorData); + Assert.NotNull(ctxError.StackTrace); + Assert.Equal(new[] { "at Frame.One()", "at Frame.Two()" }, ctxError.StackTrace!); + + // CHAINED_INVOKE — all four fields propagate. + var invError = operations[2].ChainedInvokeDetails!.Error!; + Assert.Equal("System.TimeoutException", invError.ErrorType); + Assert.Equal("invoke blew up", invError.ErrorMessage); + Assert.Equal("{\"detail\":\"invoke\"}", invError.ErrorData); + Assert.NotNull(invError.StackTrace); + Assert.Equal(new[] { "at Frame.One()", "at Frame.Two()" }, invError.StackTrace!); + } + + [Fact] + public async Task CheckpointAsync_ReturnsNewToken() + { + var mockClient = new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + var newToken = await client.CheckpointAsync( + "arn", + "old-token", + new[] + { + new OperationUpdate + { + Id = "0-x", + Type = "STEP", + Action = "SUCCEED" + } + }); + + // MockLambdaClient returns "token-1", "token-2", etc. 
+ Assert.Equal("token-1", newToken); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs new file mode 100644 index 000000000..0e796e7a0 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs @@ -0,0 +1,777 @@ +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class MapOperationTests +{ + /// <summary>Reproduces the Id that <see cref="OperationIdGenerator"/> emits for the n-th root-level operation.</summary> + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// <summary>The hashed ID of the n-th child operation under <paramref name="parentOpId"/>.</summary> + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental.
+ var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + // ────────────────────────────────────────────────────────────────────── + // Public surface — basic happy paths + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_FreshExecution_AllItemsSucceed() + { + var (context, recorder, tm, _) = CreateContext(); + + var items = new[] { 10, 20, 30 }; + + var result = await context.MapAsync( + items, + async (ctx, item, index, all) => { await Task.Yield(); return item * 2; }, + name: "double_all"); + + Assert.False(tm.IsTerminated); + Assert.Equal(3, result.TotalCount); + Assert.Equal(3, result.SuccessCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(0, result.StartedCount); + Assert.False(result.HasFailure); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 20, 40, 60 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + + // Parent CONTEXT START + 3 item CONTEXT STARTs + 3 item CONTEXT SUCCEEDs + Parent CONTEXT SUCCEED + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(8, contextActions.Length); + Assert.Equal("Map:START", contextActions[0]); + Assert.Equal("Map:SUCCEED", contextActions[^1]); + } + + [Fact] + public async Task MapAsync_PassesItemIndexAndFullList_ToCallback() + { + var (context, _, _, _) = CreateContext(); + + var items = new[] { "a", "b", "c" }; + + var result = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await Task.Yield(); + // Confirm the callback sees the item, its index, and the whole list. 
+ Assert.Same(items, all); + Assert.Equal(items[index], item); + return $"{index}:{item}:{all.Count}"; + }); + + Assert.Equal(new[] { "0:a:3", "1:b:3", "2:c:3" }, result.GetResults()); + } + + [Fact] + public async Task MapAsync_PreservesIndexOrder_EvenWhenItemsCompleteOutOfOrder() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 40, 10, 20 }, + async (ctx, delay, index, all) => { await Task.Delay(delay); return index + 1; }); + + Assert.Equal(new[] { 1, 2, 3 }, result.GetResults()); + for (var i = 0; i < result.All.Count; i++) + { + Assert.Equal(i, result.All[i].Index); + } + } + + [Fact] + public async Task MapAsync_ItemOperationIds_AreDeterministic() + { + var (context, recorder, _, _) = CreateContext(); + + await context.MapAsync( + new[] { "a", "b" }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var firstItemId = ChildIdAt(parentOpId, 1); + var secondItemId = ChildIdAt(parentOpId, 2); + + var itemStarts = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "MapItem" && o.Action == "START") + .ToArray(); + Assert.Equal(2, itemStarts.Length); + Assert.Contains(itemStarts, o => o.Id == firstItemId); + Assert.Contains(itemStarts, o => o.Id == secondItemId); + } + + [Fact] + public async Task MapAsync_DefaultNaming_UsesIndexAsName() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + + Assert.Equal("0", result.All[0].Name); + Assert.Equal("1", result.All[1].Name); + } + + [Fact] + public async Task MapAsync_ItemNamer_PropagatesNameToCheckpointAndItem() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { "order-1", "order-2" }, + async (ctx, item, index, all) => { await Task.Yield(); return item.Length; }, + name: 
"process_orders", + config: new MapConfig { ItemNamer = (item, index) => $"Order-{item}" }); + + Assert.Equal("Order-order-1", result.All[0].Name); + Assert.Equal("Order-order-2", result.All[1].Name); + + await recorder.Batcher.DrainAsync(); + + var itemSucceeds = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "MapItem" && o.Action == "SUCCEED") + .ToArray(); + Assert.Contains(itemSucceeds, o => o.Name == "Order-order-1"); + Assert.Contains(itemSucceeds, o => o.Name == "Order-order-2"); + } + + [Fact] + public async Task MapAsync_EmptyCollection_ReturnsEmptyResultWithAllCompleted() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.MapAsync( + Array.Empty(), + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + + Assert.Equal(0, result.TotalCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Even the empty case still flushes parent START + parent SUCCEED. + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(new[] { "Map:START", "Map:SUCCEED" }, contextActions); + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — Map's permissive default vs fail-fast opt-in + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_AllCompletedDefault_PartialFailureDoesNotThrow() + { + // Map's default CompletionConfig is AllCompleted() (permissive), unlike + // Parallel's AllSuccessful(). A single item failure is captured rather + // than thrown. 
+ var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item == 2) throw new InvalidOperationException("oops"); + return item; + }); + + Assert.True(result.HasFailure); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.FailureCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 1, 3 }, result.GetResults()); + + var errors = result.GetErrors(); + Assert.Single(errors); + Assert.Contains("oops", errors[0].Message); + } + + [Fact] + public async Task MapAsync_AllSuccessfulOptIn_OneFailureThrowsMapException() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item == 2) throw new InvalidOperationException("item boom"); + return item; + }, + config: new MapConfig { CompletionConfig = CompletionConfig.AllSuccessful() })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + Assert.NotNull(ex.Result); + var typed = Assert.IsAssignableFrom>(ex.Result); + Assert.Equal(1, typed.FailureCount); + Assert.Equal(2, typed.SuccessCount); + } + + [Fact] + public async Task MapAsync_ThrowIfError_ThrowsUnderPermissiveDefault() + { + // The permissive default does not auto-throw; ThrowIfError is the + // explicit strict-success check. 
+ var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item == 2) throw new InvalidOperationException("boom"); + return item; + }); + + Assert.True(result.HasFailure); + var thrown = Assert.ThrowsAny(() => result.ThrowIfError()); + Assert.Contains("boom", thrown.Message); + } + + [Fact] + public async Task MapAsync_ToleratedFailureCount_ExceededThrows() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item != 3) throw new InvalidOperationException($"fail-{item}"); + return item; + }, + config: new MapConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — first/min-successful short-circuit + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_FirstSuccessful_ResolvesAfterFirstSuccess() + { + var (context, _, _, _) = CreateContext(); + + // MaxConcurrency = 1 so dispatch order is deterministic: item 0 fires + // first and succeeds; items 1 and 2 are never dispatched and remain + // BatchItemStatus.Started. 
+ var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }, + config: new MapConfig + { + MaxConcurrency = 1, + CompletionConfig = CompletionConfig.FirstSuccessful() + }); + + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(1, result.SuccessCount); + Assert.Equal(2, result.StartedCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(3, result.TotalCount); + + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Started, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + } + + // ────────────────────────────────────────────────────────────────────── + // MaxConcurrency + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_MaxConcurrency_LimitsInFlight() + { + var (context, _, _, _) = CreateContext(); + + var inFlight = 0; + var maxObserved = 0; + var lockObj = new object(); + + var result = await context.MapAsync( + new[] { 1, 2, 3, 4, 5 }, + async (ctx, item, index, all) => + { + lock (lockObj) + { + inFlight++; + if (inFlight > maxObserved) maxObserved = inFlight; + } + await Task.Delay(20); + lock (lockObj) inFlight--; + return item; + }, + config: new MapConfig { MaxConcurrency = 2 }); + + Assert.Equal(5, result.SuccessCount); + Assert.True(maxObserved <= 2, $"Observed concurrency {maxObserved} exceeded MaxConcurrency = 2"); + } + + [Fact] + public async Task MapAsync_MaxConcurrencyAtLeastItemCount_RunsWithoutSemaphore() + { + // MaxConcurrency >= item count exercises the no-semaphore optimization + // path; behavior must be identical (all items still run). 
+ var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }, + config: new MapConfig { MaxConcurrency = 10 }); + + Assert.Equal(3, result.SuccessCount); + Assert.Equal(new[] { 1, 2, 3 }, result.GetResults()); + } + + [Fact] + public void MapConfig_MaxConcurrency_OutOfRange_Throws() + { + var config = new MapConfig(); + Assert.Throws(() => config.MaxConcurrency = 0); + Assert.Throws(() => config.MaxConcurrency = -1); + config.MaxConcurrency = 1; + config.MaxConcurrency = null; + } + + [Fact] + public void MapConfig_DefaultCompletionConfig_IsAllCompleted() + { + // Guards the intentional divergence from ParallelConfig (AllSuccessful). + var config = new MapConfig(); + // AllCompleted() == empty CompletionConfig (no failure thresholds). + Assert.Null(config.CompletionConfig.ToleratedFailureCount); + Assert.Null(config.CompletionConfig.MinSuccessful); + Assert.Null(config.CompletionConfig.ToleratedFailurePercentage); + } + + // ────────────────────────────────────────────────────────────────────── + // NestingType + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_NestingTypeFlat_SuppressesPerItemContextOps() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item * 10; }, + name: "doubler", + config: new MapConfig { NestingType = NestingType.Flat }); + + Assert.Equal(new[] { 10, 20, 30 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Parent Map CONTEXT ops still emitted; no per-item CONTEXT ops under Flat. 
+ var parentActions = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "Map") + .Select(o => $"{o.Action}").ToArray(); + Assert.Equal(new[] { "START", "SUCCEED" }, parentActions); + + Assert.Empty(recorder.Flushed.Where(o => + o.Type == "CONTEXT" && o.SubType == "MapItem")); + } + + [Fact] + public async Task MapAsync_NestingTypeFlat_InnerOpsReparentToMapOp() + { + var (context, recorder, _, _) = CreateContext(); + + await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => + await ctx.StepAsync(async (_) => { await Task.Yield(); return item * 10; }), + name: "doubler", + config: new MapConfig { NestingType = NestingType.Flat }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var item0Id = ChildIdAt(parentOpId, 1); + var item1Id = ChildIdAt(parentOpId, 2); + var step0Id = ChildIdAt(item0Id, 1); + var step1Id = ChildIdAt(item1Id, 1); + + // A step emits both START and SUCCEED under the same Id; scope to START + // so we assert on exactly one record per step. + var steps = recorder.Flushed + .Where(o => o.Type == "STEP" && $"{o.Action}" == "START").ToArray(); + var step0 = Assert.Single(steps, o => o.Id == step0Id); + var step1 = Assert.Single(steps, o => o.Id == step1Id); + + // Inner steps re-parent to the MAP op (nearest non-virtual ancestor). 
+ Assert.Equal(parentOpId, step0.ParentId); + Assert.Equal(parentOpId, step1.ParentId); + } + + [Fact] + public async Task MapAsync_NestingTypeFlat_ReplaySucceeded_RebuildsFromInlinePayload() + { + var parentOpId = IdAt(1); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED","Result":"10"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","Result":"20"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "doubler", + ContextDetails = new ContextDetails { Result = summaryJson } + } + } + }); + + var executed = false; + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => { executed = true; await Task.Yield(); return item * 999; }, + name: "doubler", + config: new MapConfig { NestingType = NestingType.Flat }); + + Assert.False(executed); + Assert.Equal(new[] { 10, 20 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + // ────────────────────────────────────────────────────────────────────── + // Argument validation + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_NullItems_Throws() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.MapAsync( + null!, + async (ctx, item, index, all) => { await Task.Yield(); return item; })); + } + + [Fact] + public async Task MapAsync_NullFunc_Throws() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.MapAsync(new[] { 1 }, (Func, Task>)null!)); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay + // 
────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_ReplaySucceeded_RebuildsResultFromCheckpoints() + { + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + var i1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "double_all", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "0", + ContextDetails = new ContextDetails { Result = "100" } + }, + new() + { + Id = i1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "1", + ContextDetails = new ContextDetails { Result = "200" } + } + } + }); + + var calls = 0; + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => { calls++; await Task.Yield(); return 999; }, + name: "double_all"); + + // Cached results returned without re-executing the callback. 
+ Assert.Equal(0, calls); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task MapAsync_ReplayMixedStatus_PreservesStartedShortCircuited() + { + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + var i1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"}, + {"Index":2,"Name":"2","Status":"STARTED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "m", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "0", + ContextDetails = new ContextDetails { Result = "10" } + }, + new() + { + Id = i1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "1", + ContextDetails = new ContextDetails { Result = "20" } + } + // Item 2 has no checkpoint at all — it was never dispatched. 
+ } + }); + + var calls = 0; + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { calls++; await Task.Yield(); return 999; }, + name: "m"); + + Assert.Equal(0, calls); + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.StartedCount); + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Succeeded, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + Assert.Equal(new[] { 10, 20 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task MapAsync_ReplayFailed_RebuildsResultAndThrows() + { + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Units":[ + {"Index":0,"Name":"0","Status":"FAILED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.Map, + Name = "m", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.MapItem, + Name = "0", + ContextDetails = new ContextDetails + { + Error = new ErrorObject { ErrorMessage = "stored failure", ErrorType = "System.InvalidOperationException" } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1 }, + async (ctx, item, index, all) => { await Task.Yield(); return 999; }, + name: "m")); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + var typed = Assert.IsAssignableFrom>(ex.Result); + Assert.Equal(1, typed.FailureCount); + } + + [Fact] + public async Task 
MapAsync_ReplayWithDriftedItemName_ThrowsNonDeterministic() + { + // A checkpointed item name that differs from the current ItemNamer output + // indicates the item set was reordered/renamed between deployments. + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"alpha","Status":"SUCCEEDED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "m", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "alpha", + ContextDetails = new ContextDetails { Result = "10" } + } + } + }); + + await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1 }, + async (ctx, item, index, all) => { await Task.Yield(); return 999; }, + name: "m", + // Namer now yields "renamed" instead of the checkpointed "alpha". + config: new MapConfig { ItemNamer = (item, index) => "renamed" })); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay determinism + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_TwoFreshRuns_ProduceIdenticalItemOperationIds() + { + // Item operation IDs are derived from the parent op ID + index, so two + // independent fresh runs of the same workflow shape must emit the same + // child IDs (the foundation of replay correctness). 
+ async Task<string[]> IdsFromRun() + { + var (context, recorder, _, _) = CreateContext(); + await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + await recorder.Batcher.DrainAsync(); + return recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "MapItem" && o.Action == "START") + .Select(o => o.Id) + .OrderBy(id => id) + .ToArray(); + } + + var first = await IdsFromRun(); + var second = await IdsFromRun(); + + Assert.Equal(3, first.Length); + Assert.Equal(first, second); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MockLambdaClient.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MockLambdaClient.cs new file mode 100644 index 000000000..9739b2907 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MockLambdaClient.cs @@ -0,0 +1,85 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda; +using Amazon.Lambda.Model; +using Amazon.Runtime; + +namespace Amazon.Lambda.DurableExecution.Tests; + +/// +/// A mock that subclasses AmazonLambdaClient and overrides CheckpointDurableExecutionAsync and GetDurableExecutionStateAsync +/// to avoid real API calls. Records the requests passed to both for test assertions. +/// +internal class MockLambdaClient : AmazonLambdaClient +{ + public List CheckpointCalls { get; } = new(); + public List GetExecutionStateCalls { get; } = new(); + + /// + /// Optional handler for GetDurableExecutionStateAsync calls. Tests + /// that exercise the paginated-state path can set this to control the response + /// for each page. + /// + public Func? GetExecutionStateHandler { get; set; } + + private int _tokenCounter; + + public MockLambdaClient() : base("fake-access-key", "fake-secret-key", Amazon.RegionEndpoint.USEast1) { } + + /// + /// Optional exception thrown by CheckpointDurableExecutionAsync. 
Tests + /// that exercise checkpoint-error classification can set this to inject a specific + /// SDK exception on the orchestration-path drain. + /// + public Exception? CheckpointThrows { get; set; } + + /// + /// Optional exception thrown by GetDurableExecutionStateAsync. Tests + /// that exercise hydration-error classification can set this to inject a specific + /// SDK exception on the initial state-fetch path. + /// + public Exception? GetExecutionStateThrows { get; set; } + + /// + /// Optional handler that produces a + /// CheckpointDurableExecutionResponse per request. Tests modeling the durable-execution service's + /// NewExecutionState response (e.g. stamping a CallbackId onto a + /// freshly-started CALLBACK op) wire this up. When null, a default + /// response is produced with only the auto-incremented checkpoint token. + /// + public Func? CheckpointHandler { get; set; } + + public override Task CheckpointDurableExecutionAsync( + CheckpointDurableExecutionRequest request, + CancellationToken cancellationToken = default) + { + CheckpointCalls.Add(request); + if (CheckpointThrows != null) throw CheckpointThrows; + if (CheckpointHandler != null) + { + var resp = CheckpointHandler(request); + // Auto-fill token if the test left it blank. 
+ if (string.IsNullOrEmpty(resp.CheckpointToken)) + resp.CheckpointToken = $"token-{++_tokenCounter}"; + return Task.FromResult(resp); + } + return Task.FromResult(new CheckpointDurableExecutionResponse + { + CheckpointToken = $"token-{++_tokenCounter}" + }); + } + + public override Task GetDurableExecutionStateAsync( + GetDurableExecutionStateRequest request, + CancellationToken cancellationToken = default) + { + GetExecutionStateCalls.Add(request); + if (GetExecutionStateThrows != null) throw GetExecutionStateThrows; + if (GetExecutionStateHandler != null) + { + return Task.FromResult(GetExecutionStateHandler(request)); + } + return Task.FromResult(new GetDurableExecutionStateResponse()); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ModelsTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ModelsTests.cs new file mode 100644 index 000000000..4c9aaeba4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ModelsTests.cs @@ -0,0 +1,295 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ModelsTests +{ + [Fact] + public void Operation_PropertiesAssignable() + { + var op = new Operation + { + Id = "op-1", + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "fetch_user", + StepDetails = new StepDetails { Result = "{\"name\":\"Alice\"}" } + }; + + Assert.Equal("op-1", op.Id); + Assert.Equal(OperationTypes.Step, op.Type); + Assert.Equal(OperationStatuses.Succeeded, op.Status); + Assert.Equal("fetch_user", op.Name); + Assert.Equal("{\"name\":\"Alice\"}", op.StepDetails?.Result); + } + + [Fact] + public void Operation_WaitWithScheduledEndTimestamp() + { + var op = new Operation + { + Id = "op-2", + Type = OperationTypes.Wait, + Status = OperationStatuses.Pending, + Name = "cooldown", + WaitDetails = new WaitDetails + { + ScheduledEndTimestamp = 1767268830000L // 2026-01-01T12:00:30Z in ms + } + }; + + Assert.Equal(OperationTypes.Wait, op.Type); + Assert.Equal(1767268830000L, op.WaitDetails?.ScheduledEndTimestamp); + } + + [Fact] + public void ErrorObject_FromException() + { + var ex = new InvalidOperationException("something went wrong"); + var error = ErrorObject.FromException(ex); + + Assert.Equal("System.InvalidOperationException", error.ErrorType); + Assert.Equal("something went wrong", error.ErrorMessage); + } + + [Fact] + public void ErrorObject_FromException_UnwrapsStepException() + { + // A failing user step gets wrapped as StepException carrying the original + // ErrorType. Recording the wrapper's type would lose the user-facing + // exception identity across a chained-invoke boundary, so FromException + // pulls the original error fields through. 
+ var ex = new StepException("intentional child failure") + { + ErrorType = "System.InvalidOperationException", + ErrorData = "{\"hint\":\"data\"}", + OriginalStackTrace = new[] { "at User.Workflow.Body()" } + }; + + var error = ErrorObject.FromException(ex); + + Assert.Equal("System.InvalidOperationException", error.ErrorType); + Assert.Equal("intentional child failure", error.ErrorMessage); + Assert.Equal("{\"hint\":\"data\"}", error.ErrorData); + Assert.Equal(new[] { "at User.Workflow.Body()" }, error.StackTrace); + } + + [Fact] + public void ErrorObject_FromException_UnwrapsChildContextException() + { + var ex = new ChildContextException("child failed") + { + ErrorType = "System.ArgumentException", + ErrorData = "{\"k\":\"v\"}", + OriginalStackTrace = new[] { "at Inner()" } + }; + + var error = ErrorObject.FromException(ex); + + Assert.Equal("System.ArgumentException", error.ErrorType); + Assert.Equal("child failed", error.ErrorMessage); + Assert.Equal("{\"k\":\"v\"}", error.ErrorData); + } + + [Fact] + public void ErrorObject_FromException_UnwrapsInvokeException() + { + var ex = new InvokeFailedException("downstream failed") + { + FunctionName = "arn:aws:lambda:...:function:downstream", + ErrorType = "System.TimeoutException", + ErrorData = "{\"region\":\"us-east-1\"}", + OriginalStackTrace = new[] { "at Downstream.Run()" } + }; + + var error = ErrorObject.FromException(ex); + + Assert.Equal("System.TimeoutException", error.ErrorType); + Assert.Equal("downstream failed", error.ErrorMessage); + Assert.Equal("{\"region\":\"us-east-1\"}", error.ErrorData); + } + + [Fact] + public void ErrorObject_FromException_UnwrapsCallbackException() + { + var ex = new CallbackFailedException("callback failed") + { + CallbackId = "cb-123", + ErrorType = "Acme.Errors.PaymentDeclined", + ErrorData = "{\"code\":42}", + OriginalStackTrace = new[] { "at External.Reject()" } + }; + + var error = ErrorObject.FromException(ex); + + Assert.Equal("Acme.Errors.PaymentDeclined", 
error.ErrorType); + Assert.Equal("callback failed", error.ErrorMessage); + Assert.Equal("{\"code\":42}", error.ErrorData); + } + + [Fact] + public void ErrorObject_FromException_UnwrapsStepException_WithNullErrorType() + { + // StepException without an explicit ErrorType (e.g., constructed by code + // that didn't set the init-only property) records null rather than + // falling back to the wrapper's type — the wrapper type is never useful. + var ex = new StepException("no type set"); + + var error = ErrorObject.FromException(ex); + + Assert.Null(error.ErrorType); + Assert.Equal("no type set", error.ErrorMessage); + } + + [Fact] + public void ErrorObject_RoundTripSerialization() + { + var error = new ErrorObject + { + ErrorType = "System.TimeoutException", + ErrorMessage = "timed out", + StackTrace = new[] { "at Foo.Bar()", "at Baz.Qux()" }, + ErrorData = "{\"key\":\"value\"}" + }; + + var json = JsonSerializer.Serialize(error); + var deserialized = JsonSerializer.Deserialize(json)!; + + Assert.Equal("System.TimeoutException", deserialized.ErrorType); + Assert.Equal("timed out", deserialized.ErrorMessage); + Assert.Equal(2, deserialized.StackTrace!.Count); + Assert.Equal("{\"key\":\"value\"}", deserialized.ErrorData); + } + + [Fact] + public void DurableExecutionInvocationInput_Deserialization() + { + var json = """ + { + "DurableExecutionArn": "arn:aws:lambda:us-east-1:123:durable-execution:abc", + "CheckpointToken": "token-1", + "InitialExecutionState": { + "Operations": [ + { + "Id": "exec-1", + "Type": "EXECUTION", + "Status": "STARTED", + "ExecutionDetails": { + "InputPayload": "{\"orderId\":\"order-123\",\"amount\":99.99}" + } + }, + { + "Id": "op-1", + "Type": "STEP", + "Status": "SUCCEEDED", + "Name": "validate", + "StepDetails": { + "Result": "true" + } + } + ] + } + } + """; + + var input = JsonSerializer.Deserialize(json)!; + + Assert.Equal("arn:aws:lambda:us-east-1:123:durable-execution:abc", input.DurableExecutionArn); + Assert.Equal("token-1", 
input.CheckpointToken); + Assert.NotNull(input.InitialExecutionState); + Assert.Equal(2, input.InitialExecutionState!.Operations!.Count); + + var stepOp = input.InitialExecutionState.Operations![1]; + Assert.Equal("op-1", stepOp.Id); + Assert.Equal(OperationTypes.Step, stepOp.Type); + Assert.Equal("true", stepOp.StepDetails?.Result); + + // The EXECUTION operation carries the user payload in ExecutionDetails.InputPayload. + var execOp = input.InitialExecutionState.Operations[0]; + Assert.Equal(OperationTypes.Execution, execOp.Type); + var payload = JsonSerializer.Deserialize(execOp.ExecutionDetails!.InputPayload!); + Assert.Equal("order-123", payload!.OrderId); + Assert.Equal(99.99m, payload.Amount); + } + + [Fact] + public void DurableExecutionInvocationInput_NoExecutionOp_HasNullPayload() + { + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:test" + }; + + // No InitialExecutionState means no EXECUTION operation and thus no user payload + Assert.Null(input.InitialExecutionState); + } + + [Fact] + public void DurableExecutionInvocationOutput_Succeeded() + { + var output = new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Succeeded, + Result = "{\"status\":\"approved\"}" + }; + + var json = JsonSerializer.Serialize(output); + var deserialized = JsonSerializer.Deserialize(json)!; + + Assert.Equal(InvocationStatus.Succeeded, deserialized.Status); + Assert.Equal("{\"status\":\"approved\"}", deserialized.Result); + } + + [Fact] + public void DurableExecutionInvocationOutput_Failed() + { + var output = new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Failed, + Error = new ErrorObject + { + ErrorMessage = "step failed", + ErrorType = "StepException" + } + }; + + var json = JsonSerializer.Serialize(output); + var deserialized = JsonSerializer.Deserialize(json)!; + + Assert.Equal(InvocationStatus.Failed, deserialized.Status); + Assert.NotNull(deserialized.Error); + Assert.Equal("step failed", 
deserialized.Error!.ErrorMessage); + Assert.Equal("StepException", deserialized.Error.ErrorType); + } + + [Fact] + public void DurableExecutionInvocationOutput_Pending() + { + var output = new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Pending + }; + + var json = JsonSerializer.Serialize(output); + var deserialized = JsonSerializer.Deserialize(json)!; + + Assert.Equal(InvocationStatus.Pending, deserialized.Status); + Assert.Null(deserialized.Result); + Assert.Null(deserialized.Error); + } + + private class TestOrderEvent + { + [System.Text.Json.Serialization.JsonPropertyName("orderId")] + public string? OrderId { get; set; } + + [System.Text.Json.Serialization.JsonPropertyName("amount")] + public decimal Amount { get; set; } + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/OperationIdGeneratorTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/OperationIdGeneratorTests.cs new file mode 100644 index 000000000..2c4d4ce90 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/OperationIdGeneratorTests.cs @@ -0,0 +1,126 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class OperationIdGeneratorTests +{ + private static string Sha256Hex(string input) + { + using var sha = SHA256.Create(); + var bytes = sha.ComputeHash(Encoding.UTF8.GetBytes(input)); + var sb = new StringBuilder(bytes.Length * 2); + foreach (var b in bytes) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + [Fact] + public void NextId_ProducesSha256OfPositionString_StartingAtOne() + { + var gen = new OperationIdGenerator(); + Assert.Equal(Sha256Hex("1"), gen.NextId()); + Assert.Equal(Sha256Hex("2"), gen.NextId()); + Assert.Equal(Sha256Hex("3"), gen.NextId()); + } + + [Fact] + public void HashOperationId_IsStable() + { + Assert.Equal(Sha256Hex("hello"), OperationIdGenerator.HashOperationId("hello")); + Assert.Equal(Sha256Hex("1"), OperationIdGenerator.HashOperationId("1")); + } + + [Fact] + public void ChildGenerator_PrefixesPositionWithParentHash() + { + var gen = new OperationIdGenerator(); + var parentId = gen.NextId(); + var child = gen.CreateChild(parentId); + + Assert.Equal(Sha256Hex(parentId + "-1"), child.NextId()); + Assert.Equal(Sha256Hex(parentId + "-2"), child.NextId()); + } + + [Fact] + public void ChildGenerator_ParentIdProperty() + { + var gen = new OperationIdGenerator(); + Assert.Null(gen.ParentId); + + var child = new OperationIdGenerator("op-5"); + Assert.Equal("op-5", child.ParentId); + } + + [Fact] + public void MultipleChildren_IndependentCounters() + { + var child1 = new OperationIdGenerator("parent-1"); + var child2 = new OperationIdGenerator("parent-2"); + + Assert.Equal(Sha256Hex("parent-1-1"), child1.NextId()); + Assert.Equal(Sha256Hex("parent-2-1"), child2.NextId()); + Assert.Equal(Sha256Hex("parent-1-2"), child1.NextId()); + Assert.Equal(Sha256Hex("parent-2-2"), child2.NextId()); + } + + [Fact] + public void 
Deterministic_SameSequenceOnReplay() + { + var gen1 = new OperationIdGenerator(); + var ids1 = new[] { gen1.NextId(), gen1.NextId(), gen1.NextId() }; + + var gen2 = new OperationIdGenerator(); + var ids2 = new[] { gen2.NextId(), gen2.NextId(), gen2.NextId() }; + + Assert.Equal(ids1, ids2); + } + + [Fact] + public void Reset_RewindsCounter() + { + var gen = new OperationIdGenerator(); + gen.NextId(); + gen.NextId(); + gen.Reset(); + Assert.Equal(Sha256Hex("1"), gen.NextId()); + } + + [Fact] + public async Task NextId_ConcurrentCallers_ProduceUniqueIds() + { + // Without Interlocked.Increment, two threads racing on ++_counter can + // both observe the same pre-increment value and emit duplicate IDs, + // silently breaking replay determinism. Drive enough contention to + // catch a regression: many parallel callers, each making many calls. + const int threads = 16; + const int idsPerThread = 500; + const int total = threads * idsPerThread; + + var gen = new OperationIdGenerator(); + var allIds = new string[total]; + using var start = new ManualResetEventSlim(false); + + var tasks = Enumerable.Range(0, threads).Select(t => Task.Run(() => + { + start.Wait(); + for (var i = 0; i < idsPerThread; i++) + { + allIds[t * idsPerThread + i] = gen.NextId(); + } + })).ToArray(); + + start.Set(); + await Task.WhenAll(tasks); + + Assert.Equal(total, allIds.Distinct().Count()); + + // Counter advanced exactly `total` times — the next ID must be hash("total+1"). 
+ Assert.Equal(Sha256Hex((total + 1).ToString(System.Globalization.CultureInfo.InvariantCulture)), + gen.NextId()); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs new file mode 100644 index 000000000..efc06655c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs @@ -0,0 +1,1350 @@ +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ParallelOperationTests +{ + /// Reproduces the Id that OperationIdGenerator emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// The hashed ID of the n-th child operation under parentOpId. + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. 
+ var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + // ────────────────────────────────────────────────────────────────────── + // Public surface — basic happy paths + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_FreshExecution_AllBranchesSucceed() + { + var (context, recorder, tm, _) = CreateContext(); + + var branches = new Func>[] + { + async (ctx) => { await Task.Yield(); return 10; }, + async (ctx) => { await Task.Yield(); return 20; }, + async (ctx) => { await Task.Yield(); return 30; }, + }; + + var result = await context.ParallelAsync(branches, name: "fanout"); + + Assert.False(tm.IsTerminated); + Assert.Equal(3, result.TotalCount); + Assert.Equal(3, result.SuccessCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(0, result.StartedCount); + Assert.False(result.HasFailure); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 10, 20, 30 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + + // Parent CONTEXT START + 3 child CONTEXT STARTs + 3 child CONTEXT SUCCEEDs + Parent CONTEXT SUCCEED + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(8, contextActions.Length); + Assert.Equal("Parallel:START", contextActions[0]); + Assert.Equal("Parallel:SUCCEED", contextActions[^1]); + } + + [Fact] + public async Task ParallelAsync_PreservesIndexOrder_EvenWhenBranchesCompleteOutOfOrder() + { + var (context, _, _, _) = CreateContext(); + + var branches = new Func>[] + { + async (ctx) => { await Task.Delay(40); return 1; }, + async (ctx) => { await Task.Delay(10); return 2; }, 
+ async (ctx) => { await Task.Delay(20); return 3; }, + }; + + var result = await context.ParallelAsync(branches); + + Assert.Equal(new[] { 1, 2, 3 }, result.GetResults()); + for (var i = 0; i < result.All.Count; i++) + { + Assert.Equal(i, result.All[i].Index); + } + } + + [Fact] + public async Task ParallelAsync_BranchOperationIds_AreDeterministic() + { + var (context, recorder, _, _) = CreateContext(); + + await context.ParallelAsync(new Func>[] + { + async (_) => { await Task.Yield(); return "a"; }, + async (_) => { await Task.Yield(); return "b"; }, + }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var firstBranchId = ChildIdAt(parentOpId, 1); + var secondBranchId = ChildIdAt(parentOpId, 2); + + // Each branch's CONTEXT START should hit the deterministic child ID. + var branchStarts = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch" && o.Action == "START") + .ToArray(); + Assert.Equal(2, branchStarts.Length); + Assert.Contains(branchStarts, o => o.Id == firstBranchId); + Assert.Contains(branchStarts, o => o.Id == secondBranchId); + } + + [Fact] + public async Task ParallelAsync_NamedBranches_PropagateNameToCheckpointAndItem() + { + var (context, recorder, _, _) = CreateContext(); + + var branches = new[] + { + new DurableBranch("alpha", async (_) => { await Task.Yield(); return 1; }), + new DurableBranch("beta", async (_) => { await Task.Yield(); return 2; }), + }; + + var result = await context.ParallelAsync(branches, name: "fanout"); + + Assert.Equal("alpha", result.All[0].Name); + Assert.Equal("beta", result.All[1].Name); + + await recorder.Batcher.DrainAsync(); + + var branchSucceeds = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch" && o.Action == "SUCCEED") + .ToArray(); + Assert.Contains(branchSucceeds, o => o.Name == "alpha"); + Assert.Contains(branchSucceeds, o => o.Name == "beta"); + } + + [Fact] + public async Task 
ParallelAsync_UnnamedOverload_DefaultsToIndexAsName() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync(new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + }); + + Assert.Equal("0", result.All[0].Name); + Assert.Equal("1", result.All[1].Name); + } + + [Fact] + public async Task ParallelAsync_EmptyBranches_ReturnsEmptyResultWithAllCompleted() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.ParallelAsync(Array.Empty>>()); + + Assert.Equal(0, result.TotalCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Even the empty case still flushes parent START + parent SUCCEED. + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(new[] { "Parallel:START", "Parallel:SUCCEED" }, contextActions); + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — failure tolerance + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_AllSuccessfulDefault_OneFailureThrowsParallelException() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync(new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("branch boom"); }, + async (_) => { await Task.Yield(); return 3; }, + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + Assert.NotNull(ex.Result); + var typed = Assert.IsAssignableFrom>(ex.Result); + Assert.Equal(1, typed.FailureCount); + Assert.Equal(2, typed.SuccessCount); + } + + [Fact] + public async Task ParallelAsync_AllCompleted_PartialFailureDoesNotThrow() + { + var (context, _, _, _) = 
CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("oops"); }, + async (_) => { await Task.Yield(); return 3; }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + Assert.True(result.HasFailure); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.FailureCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 1, 3 }, result.GetResults()); + + var errors = result.GetErrors(); + Assert.Single(errors); + Assert.Contains("oops", errors[0].Message); + } + + [Fact] + public async Task ParallelAsync_ToleratedFailureCount_AllowsUpToThreshold() + { + var (context, _, _, _) = CreateContext(); + + // 4 branches, 2 fail; tolerated = 2 (>= failures), so resolves without + // throwing. + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-1"); }, + async (_) => { await Task.Yield(); return 3; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-2"); }, + }, + config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 2 } + }); + + Assert.Equal(2, result.FailureCount); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + } + + [Fact] + public async Task ParallelAsync_ToleratedFailureCount_ExceededThrows() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-1"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-2"); }, + async (_) => { await Task.Yield(); return 3; }, + }, + 
config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + } + + [Fact] + public async Task ParallelAsync_ToleratedFailurePercentage_ExceededThrows() + { + var (context, _, _, _) = CreateContext(); + + // 4 branches, 3 fail (75%) > 0.5 (50%) → exceeded. + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); throw new InvalidOperationException("f1"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("f2"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("f3"); }, + async (_) => { await Task.Yield(); return 4; }, + }, + config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailurePercentage = 0.5 } + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + } + + [Fact] + public void CompletionConfig_ToleratedFailurePercentage_OutOfRange_Throws() + { + var config = new CompletionConfig(); + Assert.Throws(() => config.ToleratedFailurePercentage = 1.5); + Assert.Throws(() => config.ToleratedFailurePercentage = -0.1); + // boundary values are accepted + config.ToleratedFailurePercentage = 0.0; + config.ToleratedFailurePercentage = 1.0; + config.ToleratedFailurePercentage = null; + } + + [Fact] + public void CompletionConfig_MinSuccessful_OutOfRange_Throws() + { + var config = new CompletionConfig(); + Assert.Throws(() => config.MinSuccessful = 0); + Assert.Throws(() => config.MinSuccessful = -1); + // 1 is the minimum meaningful value; null clears the criterion. 
+ config.MinSuccessful = 1; + config.MinSuccessful = null; + } + + [Fact] + public void CompletionConfig_ToleratedFailureCount_Negative_Throws() + { + var config = new CompletionConfig(); + Assert.Throws(() => config.ToleratedFailureCount = -1); + // zero (fail-fast) and positive counts are valid; null clears the criterion. + config.ToleratedFailureCount = 0; + config.ToleratedFailureCount = 5; + config.ToleratedFailureCount = null; + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — first-successful short-circuit + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_FirstSuccessful_ResolvesAfterFirstSuccess() + { + var (context, _, _, _) = CreateContext(); + + // MaxConcurrency = 1 so we know the dispatch order is deterministic: + // branch 0 fires first and succeeds; branches 1 and 2 are never + // dispatched at all, so they remain in BatchItemStatus.Started. + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + async (_) => { await Task.Yield(); return 3; }, + }, + config: new ParallelConfig + { + MaxConcurrency = 1, + CompletionConfig = CompletionConfig.FirstSuccessful() + }); + + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(1, result.SuccessCount); + Assert.Equal(2, result.StartedCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(3, result.TotalCount); + + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Started, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + } + + [Fact] + public async Task ParallelAsync_MinSuccessful_ResolvesWhenTargetReached() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await 
Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + async (_) => { await Task.Yield(); return 3; }, + async (_) => { await Task.Yield(); return 4; }, + }, + config: new ParallelConfig + { + MaxConcurrency = 1, + CompletionConfig = new CompletionConfig { MinSuccessful = 2 } + }); + + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(2, result.StartedCount); + } + + // ────────────────────────────────────────────────────────────────────── + // MaxConcurrency + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_MaxConcurrency_LimitsInFlight() + { + var (context, _, _, _) = CreateContext(); + + var inFlight = 0; + var maxObserved = 0; + var lockObj = new object(); + + var branches = new Func>[] + { + MakeBranch(), + MakeBranch(), + MakeBranch(), + MakeBranch(), + MakeBranch(), + }; + + var result = await context.ParallelAsync(branches, config: new ParallelConfig { MaxConcurrency = 2 }); + + Assert.Equal(5, result.SuccessCount); + Assert.True(maxObserved <= 2, $"Observed concurrency {maxObserved} exceeded MaxConcurrency = 2"); + + Func> MakeBranch() + { + return async (_) => + { + lock (lockObj) + { + inFlight++; + if (inFlight > maxObserved) maxObserved = inFlight; + } + await Task.Delay(20); + lock (lockObj) inFlight--; + return 1; + }; + } + } + + [Fact] + public void ParallelConfig_MaxConcurrency_OutOfRange_Throws() + { + var config = new ParallelConfig(); + Assert.Throws(() => config.MaxConcurrency = 0); + Assert.Throws(() => config.MaxConcurrency = -1); + config.MaxConcurrency = 1; + config.MaxConcurrency = null; + } + + // ────────────────────────────────────────────────────────────────────── + // NestingType + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_SuppressesPerBranchContextOps() + { + var 
(context, recorder, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 10; }, + async (_) => { await Task.Yield(); return 20; }, + async (_) => { await Task.Yield(); return 30; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + Assert.Equal(new[] { 10, 20, 30 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Parent Parallel CONTEXT ops are still emitted (the parent is never + // virtual)... + var parentActions = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "Parallel") + .Select(o => $"{o.Action}").ToArray(); + Assert.Equal(new[] { "START", "SUCCEED" }, parentActions); + + // ...but NO per-branch CONTEXT ops are emitted under Flat. + var branchOps = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch") + .ToArray(); + Assert.Empty(branchOps); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_InnerOpsReparentToParallelOp() + { + var (context, recorder, _, _) = CreateContext(); + + await context.ParallelAsync( + new Func>[] + { + async (ctx) => await ctx.StepAsync(async (_) => { await Task.Yield(); return 1; }), + async (ctx) => await ctx.StepAsync(async (_) => { await Task.Yield(); return 2; }), + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var branch0Id = ChildIdAt(parentOpId, 1); + var branch1Id = ChildIdAt(parentOpId, 2); + + // Each branch's inner STEP is ID-derived from the branch op id (so the + // two branches' first steps don't collide)... + var step0Id = ChildIdAt(branch0Id, 1); + var step1Id = ChildIdAt(branch1Id, 1); + + // A step emits both START and SUCCEED under the same Id; scope to START + // so we assert on exactly one record per step. 
+ var steps = recorder.Flushed + .Where(o => o.Type == "STEP" && $"{o.Action}" == "START").ToArray(); + var step0 = Assert.Single(steps, o => o.Id == step0Id); + var step1 = Assert.Single(steps, o => o.Id == step1Id); + + // ...but each inner step re-parents to the PARALLEL op (the nearest + // non-virtual ancestor), NOT to the virtual branch (which emitted no + // checkpoint to reference). + Assert.Equal(parentOpId, step0.ParentId); + Assert.Equal(parentOpId, step1.ParentId); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_PartialFailure_SurfacesInlineErrors() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("flat boom"); }, + async (_) => { await Task.Yield(); return 3; }, + }, + name: "fanout", + config: new ParallelConfig + { + NestingType = NestingType.Flat, + CompletionConfig = CompletionConfig.AllCompleted() + }); + + Assert.True(result.HasFailure); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.FailureCount); + Assert.Equal(new[] { 1, 3 }, result.GetResults()); + Assert.Contains("flat boom", result.GetErrors()[0].Message); + + await recorder.Batcher.DrainAsync(); + + // The parent SUCCEED payload carries the inline per-unit results/errors; + // no per-branch FAIL op was emitted. + Assert.Empty(recorder.Flushed.Where(o => + o.Type == "CONTEXT" && o.SubType == "ParallelBranch")); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_ReplaySucceeded_RebuildsFromInlinePayload() + { + var parentOpId = IdAt(1); + + // Flat replay reads per-unit results from the inline summary payload — + // there are NO per-branch child CONTEXT ops in state. 
+ var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED","Result":"100"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","Result":"200"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + } + } + }); + + var executed = false; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { executed = true; await Task.Yield(); return 999; }, + async (_) => { executed = true; await Task.Yield(); return 999; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + Assert.False(executed); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_ReplayFailed_ThrowsWithInlineError() + { + var parentOpId = IdAt(1); + + var summaryJson = """ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Units":[ + {"Index":0,"Name":"0","Status":"FAILED","Error":{"ErrorType":"System.InvalidOperationException","ErrorMessage":"flat branch 0 failed"}}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","Result":"200"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); 
return 1; }, + async (_) => { await Task.Yield(); return 2; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + var typed = (IBatchResult)ex.Result!; + Assert.Equal(1, typed.FailureCount); + Assert.Contains("flat branch 0 failed", typed.GetErrors()[0].Message); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_ReplaySucceeded_RebuildsResultFromCheckpoints() + { + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + var b1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED","OperationId":"placeholder0"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","OperationId":"placeholder1"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails { Result = "100" } + }, + new() + { + Id = b1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "1", + ContextDetails = new ContextDetails { Result = "200" } + } + } + }); + + var executed = false; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { executed = true; await Task.Yield(); return 999; }, + async (_) => { executed = true; await Task.Yield(); return 999; 
}, + }, + name: "fanout"); + + Assert.False(executed); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ParallelAsync_ReplayFailed_ThrowsParallelException() + { + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + var b1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Units":[ + {"Index":0,"Name":"0","Status":"FAILED","OperationId":"placeholder0"}, + {"Index":1,"Name":"1","Status":"FAILED","OperationId":"placeholder1"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "branch 0 failed" + } + } + }, + new() + { + Id = b1, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.ParallelBranch, + Name = "1", + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "branch 1 failed" + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + }, + name: "fanout")); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + 
Assert.NotNull(ex.Result); + + var typed = (IBatchResult)ex.Result!; + Assert.Equal(2, typed.FailureCount); + Assert.Contains("branch 0 failed", typed.GetErrors()[0].Message); + } + + [Fact] + public async Task ParallelAsync_ReplayStarted_ReExecutesBranches() + { + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + SubType = OperationSubTypes.Parallel, + Name = "fanout" + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails { Result = "11" } + } + } + }); + + var calls = new int[2]; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { calls[0]++; await Task.Yield(); return 99; }, + async (_) => { calls[1]++; await Task.Yield(); return 22; }, + }, + name: "fanout"); + + // Branch 0 replays cached value (not re-executed); branch 1 runs fresh. + Assert.Equal(0, calls[0]); + Assert.Equal(1, calls[1]); + Assert.Equal(new[] { 11, 22 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + + // Critical: do NOT re-checkpoint parent CONTEXT START (the original + // STARTED record is still authoritative). 
+ var parentStarts = recorder.Flushed.Where(o => + o.Type == "CONTEXT" && o.SubType == "Parallel" && o.Action == "START").ToArray(); + Assert.Empty(parentStarts); + } + + [Fact] + public async Task ParallelAsync_ReplayUnknownStatus_ThrowsNonDeterministic() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = "BOGUS", + SubType = OperationSubTypes.Parallel, + Name = "fanout" + } + } + }); + + await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] { async (_) => { await Task.Yield(); return 1; } }, + name: "fanout")); + } + + // ────────────────────────────────────────────────────────────────────── + // IBatchResult helpers + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task BatchResult_ThrowIfError_ThrowsFirstError() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("kaboom"); }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + var ex = Assert.Throws(() => result.ThrowIfError()); + Assert.Contains("kaboom", ex.Message); + } + + [Fact] + public async Task BatchResult_GetResults_SkipsFailedAndStartedItems() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 10; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("ouch"); }, + async (_) => { await Task.Yield(); return 30; }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + Assert.Equal(new[] { 10, 30 }, result.GetResults()); + } + + [Fact] + public async Task BatchResult_AllSucceededFailedStarted_AreInOriginalIndexOrder() + { + var 
(context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, // index 0 succeed + async (_) => { await Task.Yield(); throw new InvalidOperationException("bad-1"); }, // index 1 fail + async (_) => { await Task.Yield(); return 3; }, // index 2 succeed + async (_) => { await Task.Yield(); throw new InvalidOperationException("bad-3"); }, // index 3 fail + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + Assert.Equal(new[] { 0, 2 }, result.Succeeded.Select(i => i.Index).ToArray()); + Assert.Equal(new[] { 1, 3 }, result.Failed.Select(i => i.Index).ToArray()); + Assert.Empty(result.Started); + } + + // ────────────────────────────────────────────────────────────────────── + // Argument validation + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_NullBranches_Throws() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.ParallelAsync((IReadOnlyList>>)null!)); + } + + [Fact] + public async Task ParallelAsync_NullBranchInList_Throws() + { + var (context, _, _, _) = CreateContext(); + + var branches = new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + null!, + }; + + await Assert.ThrowsAsync(() => context.ParallelAsync(branches)); + } + + // ────────────────────────────────────────────────────────────────────── + // Concurrency / cancellation regressions (Critical 1, Critical 2) + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_CancelMidDispatch_AllBranchesSettleAndNoObjectDisposed() + { + // Regression for orphan-branch bug: dispatch 5 branches with + // MaxConcurrency=2; cancel parent CancellationToken right after the + // first batch starts so the dispatcher's semaphore.WaitAsync trips + // OperationCanceledException mid-loop. 
With the old code branches in + // flight at cancellation time would Release on a disposed semaphore + // and fault as ObjectDisposedException. With the fix the semaphore + // dispose is gated on Task.WhenAll over inFlight, so every dispatched + // task settles cleanly first. + var (context, _, _, _) = CreateContext(); + + using var cts = new CancellationTokenSource(); + var dispatchedReady = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var dispatchedCount = 0; + var lockObj = new object(); + var capturedExceptions = new List(); + var unobservedCount = 0; + + EventHandler handler = (_, args) => + { + lock (lockObj) + { + Interlocked.Increment(ref unobservedCount); + capturedExceptions.Add(args.Exception); + } + }; + TaskScheduler.UnobservedTaskException += handler; + + try + { + var branches = new Func>[5]; + for (var i = 0; i < 5; i++) + { + branches[i] = async (_) => + { + int n; + lock (lockObj) n = ++dispatchedCount; + if (n == 2) dispatchedReady.TrySetResult(); + // Hold the branch long enough that cancellation arrives + // while we're in flight. + try { await Task.Delay(200, cts.Token).ConfigureAwait(false); } + catch (OperationCanceledException) { /* cooperatively stop */ } + return n; + }; + } + + var run = context.ParallelAsync( + branches, + config: new ParallelConfig + { + MaxConcurrency = 2, + CompletionConfig = CompletionConfig.AllCompleted() + }, + cancellationToken: cts.Token); + + // Wait until 2 branches are running, then cancel — this trips + // the dispatcher on its next semaphore.WaitAsync call. + await dispatchedReady.Task.WaitAsync(TimeSpan.FromSeconds(5)); + cts.Cancel(); + + // The orchestrator should surface OperationCanceledException + // cleanly (NOT ObjectDisposedException) once the in-flight + // branches settle. + var ex = await Assert.ThrowsAnyAsync(() => run); + Assert.IsNotType(ex); + + // Force GC + finalizers so any unobserved exceptions surface. 
+            GC.Collect();
+            GC.WaitForPendingFinalizers();
+            GC.Collect();
+
+            Assert.Equal(0, Volatile.Read(ref unobservedCount));
+            foreach (var captured in capturedExceptions)
+            {
+                Assert.IsNotType(captured);
+            }
+        }
+        finally
+        {
+            TaskScheduler.UnobservedTaskException -= handler;
+        }
+    }
+
+    [Fact]
+    public void ExecutionState_ConcurrentTrackReplayAndValidate_NoExceptionsAndConsistent()
+    {
+        // Regression for ExecutionState race: 16 tasks call TrackReplay /
+        // ValidateReplayConsistency / GetOperation concurrently. With the
+        // unguarded Dictionary/HashSet collections this would either throw
+        // InvalidOperationException (concurrent enumeration) or produce
+        // torn reads. Under the lock the ops are serialized and consistent.
+        var state = new ExecutionState();
+        var ops = new List<Operation>();
+        var ids = new List<string>();
+        for (var i = 0; i < 50; i++)
+        {
+            var id = $"op-{i}";
+            ids.Add(id);
+            ops.Add(new Operation
+            {
+                Id = id,
+                Type = OperationTypes.Context,
+                Status = OperationStatuses.Succeeded,
+                Name = $"name-{i}"
+            });
+        }
+        state.LoadFromCheckpoint(new InitialExecutionState { Operations = ops });
+
+        var caught = new List<Exception>(); // exceptions thrown inside workers; only touched under caughtLock while workers run
+        var caughtLock = new object();
+        var tasks = new Task[16];
+        for (var t = 0; t < 16; t++)
+        {
+            var seed = t; // copy the loop variable so each worker gets its own fixed Random seed
+            tasks[t] = Task.Run(() =>
+            {
+                try
+                {
+                    var rng = new Random(seed);
+                    for (var iter = 0; iter < 200; iter++)
+                    {
+                        var id = ids[rng.Next(ids.Count)];
+                        state.TrackReplay(id);
+                        state.ValidateReplayConsistency(id, OperationTypes.Context, $"name-{id.Substring(3)}"); // Substring(3) drops the "op-" prefix, giving back the name seeded above
+                        _ = state.GetOperation(id);
+                        _ = state.HasOperation(id);
+                        _ = state.IsReplaying;
+                    }
+                }
+                catch (Exception ex)
+                {
+                    lock (caughtLock) caught.Add(ex);
+                }
+            });
+        }
+
+        Assert.True(Task.WaitAll(tasks, TimeSpan.FromSeconds(30)), "worker tasks did not finish within 30s; a hang must fail here rather than race the assertions below"); // WaitAll returns false on timeout
+        Assert.Empty(caught);
+
+        // Once every terminal op has been visited, IsReplaying must be false.
+ Assert.False(state.IsReplaying); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay determinism / failure modes / mixed-status replay + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_ReplayDeterminism_SameWorkflowProducesSameBranchIds() + { + // Run the same workflow shape twice from scratch and assert the + // branch CONTEXT START IDs are byte-identical. This pins the + // determinism contract: the n-th branch's hashed ID is a pure + // function of (root counter position, branch index). + async Task RunOnce() + { + var (context, recorder, _, _) = CreateContext(); + await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + async (_) => { await Task.Yield(); return 3; }, + }, + name: "fanout"); + await recorder.Batcher.DrainAsync(); + return recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch" && o.Action == "START") + .Select(o => o.Id!) + .OrderBy(s => s) + .ToArray(); + } + + var run1Ids = await RunOnce(); + var run2Ids = await RunOnce(); + + Assert.Equal(3, run1Ids.Length); + Assert.Equal(run1Ids, run2Ids); + } + + [Fact] + public async Task ParallelAsync_FirstSuccessful_AllFail_AggregatesAsParallelException() + { + // FirstSuccessful() aliases MinSuccessful=1 with no explicit failure + // tolerance. When every branch fails, MinSuccessful is unreachable + // AND there is no failure-tolerance threshold, so the run completes + // as AllCompleted with HasFailure=true. Calling ThrowIfError surfaces + // the first failure; without explicit failure tolerance the parallel + // does NOT throw on its own (matches Python). 
+ var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); throw new InvalidOperationException("a"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("b"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("c"); }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.FirstSuccessful() }); + + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(0, result.SuccessCount); + Assert.Equal(3, result.FailureCount); + Assert.True(result.HasFailure); + + // Caller-driven aggregation: ThrowIfError surfaces the first failure. + var ex = Assert.Throws(() => result.ThrowIfError()); + Assert.Contains("a", ex.Message); + } + + [Fact] + public async Task ParallelAsync_ReplayMixedStatus_PreservesStartedShortCircuited() + { + // Parent SUCCEEDED with MinSuccessful short-circuit: branch 0 + // SUCCEEDED, branch 1 SUCCEEDED, branch 2 was never dispatched + // (still STARTED in the summary). Replay must reproduce the original + // BatchResult shape — including the un-dispatched STARTED entry — + // without re-executing any branch. 
+ var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + var b1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"}, + {"Index":2,"Name":"2","Status":"STARTED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails { Result = "10" } + }, + new() + { + Id = b1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "1", + ContextDetails = new ContextDetails { Result = "20" } + } + // Branch 2 has no checkpoint at all — it was never dispatched. 
+ } + }); + + var calls = 0; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { calls++; await Task.Yield(); return 999; }, + async (_) => { calls++; await Task.Yield(); return 999; }, + async (_) => { calls++; await Task.Yield(); return 999; }, + }, + name: "fanout"); + + Assert.Equal(0, calls); + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.StartedCount); + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Succeeded, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + Assert.Equal(new[] { 10, 20 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ParallelAsync_ReplayUsesCheckpointedBranchName_NotCurrentName() + { + // The checkpointed name is authoritative on replay. Even when a branch + // has no per-branch checkpoint (STARTED / never dispatched), the name + // from the parent summary must flow through to the reconstructed item. 
+ var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ + {"Index":0,"Name":"alpha","Status":"SUCCEEDED"}, + {"Index":1,"Name":"beta","Status":"STARTED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "alpha", + ContextDetails = new ContextDetails { Result = "10" } + } + } + }); + + var result = await context.ParallelAsync( + new[] + { + new DurableBranch("alpha", async (_) => { await Task.Yield(); return 999; }), + new DurableBranch("beta", async (_) => { await Task.Yield(); return 999; }), + }, + name: "fanout"); + + Assert.Equal("alpha", result.All[0].Name); + Assert.Equal("beta", result.All[1].Name); + Assert.Equal(BatchItemStatus.Started, result.All[1].Status); + } + + [Fact] + public async Task ParallelAsync_ReplayWithDriftedBranchName_ThrowsNonDeterministic() + { + // A branch name that differs between the checkpoint and the current + // code indicates the branch set was reordered/renamed between + // deployments — surface it rather than silently reconstructing. 
+ var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"alpha","Status":"SUCCEEDED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "alpha", + ContextDetails = new ContextDetails { Result = "10" } + } + } + }); + + await Assert.ThrowsAsync(() => + context.ParallelAsync( + new[] + { + // Renamed from "alpha" → "renamed" since the checkpoint. + new DurableBranch("renamed", async (_) => { await Task.Yield(); return 999; }), + }, + name: "fanout")); + } + +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RecordingBatcher.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RecordingBatcher.cs new file mode 100644 index 000000000..992ebdb22 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RecordingBatcher.cs @@ -0,0 +1,64 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Internal; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Tests; + +/// +/// Test helper: a that records every flushed +/// update without making any network calls. Tests construct one of these in +/// place of a real batcher to inspect what would have been sent to the service. 
+///
+internal sealed class RecordingBatcher
+{
+    private readonly List<SdkOperationUpdate> _flushed = new();
+    private readonly List<int> _flushBatchSizes = new();
+    private readonly object _lock = new(); // guards both lists: every read and write below takes it
+
+    public CheckpointBatcher Batcher { get; } // real batcher whose flush callback is this class's Flush method
+
+    /// <summary>
+    /// Optional hook invoked synchronously after each batch flush, with that
+    /// batch's updates. Tests modeling the durable-execution service's
+    /// NewExecutionState response (e.g. stamping a CallbackId onto a
+    /// freshly-started CALLBACK op) wire this up to mutate the test's
+    /// state object. NOTE(review): the original cross-reference naming that type was lost; restore it.
+    /// </summary>
+    public Action<IReadOnlyList<SdkOperationUpdate>>? OnFlush { get; set; }
+
+    public RecordingBatcher(CheckpointBatcherConfig? config = null)
+    {
+        Batcher = new CheckpointBatcher("test-token", Flush, config);
+    }
+
+    /// <summary>
+    /// Cumulative list of every update that has been flushed, in order.
+    /// </summary>
+    public IReadOnlyList<SdkOperationUpdate> Flushed
+    {
+        get { lock (_lock) return _flushed.ToArray(); } // returns a copy taken under the lock, not the live list
+    }
+
+    /// <summary>
+    /// One entry per batch flushed, recording the batch size. With
+    /// a batcher setting = Zero (default) — NOTE(review): the cross-reference naming that setting was lost —
+    /// every call of an operation whose cross-reference was also lost (NOTE(review): restore it) produces one batch.
+    /// </summary>
+    public IReadOnlyList<int> FlushBatchSizes
+    {
+        get { lock (_lock) return _flushBatchSizes.ToArray(); } // returns a copy taken under the lock, not the live list
+    }
+
+    private Task<string?> Flush(string? token, IReadOnlyList<SdkOperationUpdate> ops, CancellationToken ct)
+    {
+        lock (_lock)
+        {
+            _flushed.AddRange(ops);
+            _flushBatchSizes.Add(ops.Count);
+        }
+        OnFlush?.Invoke(ops); // runs after the batch is recorded and outside the lock
+        return Task.FromResult(token); // hands the incoming token back unchanged
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RetryStrategyTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RetryStrategyTests.cs
new file mode 100644
index 000000000..f226ea079
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RetryStrategyTests.cs
@@ -0,0 +1,205 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using Amazon.Lambda.DurableExecution;
+using Xunit;
+
+namespace Amazon.Lambda.DurableExecution.Tests;
+
+public class RetryStrategyTests
+{
+    [Fact]
+    public void ExponentialDefault_RetriesUpToMaxAttempts()
+    {
+        var policy = RetryStrategy.Default;
+
+        // Default is expected to allow six attempts in total, so a failure on any of attempts 1-5 is retried
+        for (var attempt = 1; attempt <= 5; attempt++)
+        {
+            var outcome = policy.ShouldRetry(new InvalidOperationException("fail"), attempt);
+            Assert.True(outcome.ShouldRetry);
+            Assert.True(TimeSpan.FromSeconds(1) <= outcome.Delay);
+        }
+
+        // A failure on the sixth attempt exhausts the budget: no further retry
+        var finalOutcome = policy.ShouldRetry(new InvalidOperationException("fail"), 6);
+        Assert.False(finalOutcome.ShouldRetry);
+    }
+
+    [Fact]
+    public void None_NeverRetries()
+    {
+        var policy = RetryStrategy.None;
+
+        var outcome = policy.ShouldRetry(new Exception("fail"), 1);
+        Assert.False(outcome.ShouldRetry);
+    }
+
+    [Fact]
+    public void Transient_RetriesUpTo3Attempts()
+    {
+        var policy = RetryStrategy.Transient;
+
+        Assert.True(policy.ShouldRetry(new Exception("fail"), 1).ShouldRetry);
+        Assert.True(policy.ShouldRetry(new Exception("fail"), 2).ShouldRetry);
+        Assert.False(policy.ShouldRetry(new Exception("fail"), 3).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_DelayIncreases()
+    {
+        var policy = RetryStrategy.Exponential(
+            maxAttempts: 5,
+            initialDelay: TimeSpan.FromSeconds(2),
+            maxDelay: TimeSpan.FromSeconds(120),
+            backoffRate: 2.0,
+            jitter: JitterStrategy.None);
+
+        var first = policy.ShouldRetry(new Exception(), 1).Delay;
+        var second = policy.ShouldRetry(new Exception(), 2).Delay;
+        var third = policy.ShouldRetry(new Exception(), 3).Delay;
+
+        // Jitter is off, so the delays are exact: 2s, 4s, 8s (ceiling to whole seconds)
+        Assert.Equal(TimeSpan.FromSeconds(2), first);
+        Assert.Equal(TimeSpan.FromSeconds(4), second);
+        Assert.Equal(TimeSpan.FromSeconds(8), third);
+    }
+
+    [Fact]
+    public void Exponential_DelayCapsAtMax()
+    {
+        var policy = RetryStrategy.Exponential(
+            maxAttempts: 10,
+            initialDelay: TimeSpan.FromSeconds(10),
+            maxDelay: TimeSpan.FromSeconds(30),
+            backoffRate: 3.0,
+            jitter: JitterStrategy.None);
+
+        // Third attempt: 10 * 3^2 = 90 before the cap, so maxDelay clamps it to 30
+        var outcome = policy.ShouldRetry(new Exception(), 3);
+        Assert.Equal(TimeSpan.FromSeconds(30), outcome.Delay);
+    }
+
+    [Fact]
+    public void Exponential_FullJitter_BoundedByDelay()
+    {
+        var policy = RetryStrategy.Exponential(
+            maxAttempts: 5,
+            initialDelay: TimeSpan.FromSeconds(10),
+            maxDelay: TimeSpan.FromSeconds(100),
+            backoffRate: 2.0,
+            jitter: JitterStrategy.Full);
+
+        // Jitter is random, so sample repeatedly and check every draw stays inside [1s, 10s]
+        for (var round = 0; round < 50; round++)
+        {
+            var outcome = policy.ShouldRetry(new Exception(), 1);
+            Assert.True(TimeSpan.FromSeconds(1) <= outcome.Delay);
+            Assert.True(TimeSpan.FromSeconds(10) >= outcome.Delay);
+        }
+    }
+
+    [Fact]
+    public void Exponential_HalfJitter_BoundedBetween50And100Percent()
+    {
+        var policy = RetryStrategy.Exponential(
+            maxAttempts: 5,
+            initialDelay: TimeSpan.FromSeconds(10),
+            maxDelay: TimeSpan.FromSeconds(100),
+            backoffRate: 2.0,
+            jitter: JitterStrategy.Half);
+
+        for (var round = 0; round < 50; round++)
+        {
+            var outcome = policy.ShouldRetry(new Exception(), 1);
+            Assert.True(TimeSpan.FromSeconds(5) <= outcome.Delay);
+            Assert.True(TimeSpan.FromSeconds(10) >= outcome.Delay);
+        }
+    }
+
+    [Fact]
+    public void Exponential_RetryableExceptions_FiltersCorrectly()
+    {
+        var policy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            retryableExceptions: new[] { typeof(TimeoutException), typeof(HttpRequestException) });
+
+        Assert.True(policy.ShouldRetry(new TimeoutException(), 1).ShouldRetry);
+        Assert.True(policy.ShouldRetry(new HttpRequestException(), 1).ShouldRetry);
+        Assert.False(policy.ShouldRetry(new InvalidOperationException(), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_RetryableExceptions_MatchesDerivedTypes()
+    {
+        var policy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            retryableExceptions: new[] { typeof(IOException) });
+
+        Assert.True(policy.ShouldRetry(new FileNotFoundException(), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_MessagePatterns_FiltersCorrectly()
+    {
+        var policy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            retryableMessagePatterns: new[] { "timeout", "throttl", "5\\d{2}" });
+
+        Assert.True(policy.ShouldRetry(new Exception("connection timeout"), 1).ShouldRetry);
+        Assert.True(policy.ShouldRetry(new Exception("request throttled"), 1).ShouldRetry);
+        Assert.True(policy.ShouldRetry(new Exception("HTTP 503"), 1).ShouldRetry);
+        Assert.False(policy.ShouldRetry(new Exception("not found"), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_BothFilters_EitherMatches()
+    {
+        var policy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            retryableExceptions: new[] { typeof(TimeoutException) },
+            retryableMessagePatterns: new[] { "throttl" });
+
+        // The exception type alone qualifies
+        Assert.True(policy.ShouldRetry(new TimeoutException("any message"), 1).ShouldRetry);
+        // The message pattern alone qualifies
+        Assert.True(policy.ShouldRetry(new Exception("throttled"), 1).ShouldRetry);
+        // Neither the type nor the message qualifies
+        Assert.False(policy.ShouldRetry(new InvalidOperationException("bad state"), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_NoFilters_RetriesAllExceptions()
+    {
+        var policy = RetryStrategy.Exponential(maxAttempts: 3);
+
+        Assert.True(policy.ShouldRetry(new Exception("anything"), 1).ShouldRetry);
+        Assert.True(policy.ShouldRetry(new InvalidOperationException(), 1).ShouldRetry);
+        Assert.True(policy.ShouldRetry(new OutOfMemoryException(), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_MinimumDelayIsOneSecond()
+    {
+        var policy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            initialDelay: TimeSpan.FromMilliseconds(100),
+            jitter: JitterStrategy.None);
+
+        var outcome = policy.ShouldRetry(new Exception(), 1);
+        Assert.True(TimeSpan.FromSeconds(1) <= outcome.Delay);
+    }
+ + [Fact] + public void FromDelegate_UsesProvidedFunction() + { + var strategy = RetryStrategy.FromDelegate((ex, attempt) => + attempt < 2 && ex is TimeoutException + ? RetryDecision.RetryAfter(TimeSpan.FromSeconds(5)) + : RetryDecision.DoNotRetry()); + + Assert.True(strategy.ShouldRetry(new TimeoutException(), 1).ShouldRetry); + Assert.False(strategy.ShouldRetry(new TimeoutException(), 2).ShouldRetry); + Assert.False(strategy.ShouldRetry(new Exception(), 1).ShouldRetry); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationManagerTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationManagerTests.cs new file mode 100644 index 000000000..3c163ccee --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationManagerTests.cs @@ -0,0 +1,91 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class TerminationManagerTests +{ + [Fact] + public async Task Terminate_ResolvesTerminationTask() + { + var manager = new TerminationManager(); + Assert.False(manager.IsTerminated); + + manager.Terminate(TerminationReason.WaitScheduled, "wait pending"); + + Assert.True(manager.IsTerminated); + var result = await manager.TerminationTask; + Assert.Equal(TerminationReason.WaitScheduled, result.Reason); + Assert.Equal("wait pending", result.Message); + } + + [Fact] + public void Terminate_OnlyFirstCallWins() + { + var manager = new TerminationManager(); + + var first = manager.Terminate(TerminationReason.WaitScheduled, "first"); + var second = manager.Terminate(TerminationReason.CallbackPending, "second"); + + Assert.True(first); + Assert.False(second); + } + + [Fact] + public async Task Terminate_FirstReasonIsPreserved() + { + var manager = new TerminationManager(); + + manager.Terminate(TerminationReason.CallbackPending, 
"callback"); + manager.Terminate(TerminationReason.WaitScheduled, "wait"); + + var result = await manager.TerminationTask; + Assert.Equal(TerminationReason.CallbackPending, result.Reason); + Assert.Equal("callback", result.Message); + } + + [Fact] + public async Task Terminate_WithException() + { + var manager = new TerminationManager(); + var ex = new Exception("checkpoint failed"); + + manager.Terminate(TerminationReason.CheckpointFailed, "error", ex); + + var result = await manager.TerminationTask; + Assert.Equal(TerminationReason.CheckpointFailed, result.Reason); + Assert.Same(ex, result.Exception); + } + + [Fact] + public async Task TerminationTask_WinsRaceAgainstNeverCompletingTask() + { + var manager = new TerminationManager(); + var neverCompletes = new TaskCompletionSource().Task; + + manager.Terminate(TerminationReason.WaitScheduled); + + var winner = await Task.WhenAny(neverCompletes, manager.TerminationTask); + Assert.Same(manager.TerminationTask, winner); + } + + [Fact] + public async Task ConcurrentTerminate_OnlyOneSucceeds() + { + var manager = new TerminationManager(); + var results = new bool[10]; + + var tasks = Enumerable.Range(0, 10).Select(i => Task.Run(() => + { + results[i] = manager.Terminate(TerminationReason.WaitScheduled, $"caller-{i}"); + })); + + await Task.WhenAll(tasks); + + Assert.Equal(1, results.Count(r => r)); + Assert.True(manager.IsTerminated); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/UpperSnakeCaseEnumConverterTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/UpperSnakeCaseEnumConverterTests.cs new file mode 100644 index 000000000..7f7f92412 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/UpperSnakeCaseEnumConverterTests.cs @@ -0,0 +1,88 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json; +using System.Text.Json.Serialization; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +/// <summary> +/// Direct tests for UpperSnakeCaseEnumConverter via a sample enum, exercising +/// every branch (Read with multi-word value, Read with single word, Read with +/// null/unparsable, plus the Write path for outbound serialization). +/// </summary> +public class UpperSnakeCaseEnumConverterTests +{ + public enum Sample + { + None, + FooBar, + BazQuxQuux + } + + public class Holder + { + [JsonConverter(typeof(UpperSnakeCaseEnumConverter))] + public Sample Value { get; set; } + } + + [Theory] + [InlineData("\"FOO_BAR\"", Sample.FooBar)] + [InlineData("\"BAZ_QUX_QUUX\"", Sample.BazQuxQuux)] + [InlineData("\"NONE\"", Sample.None)] + public void Read_UpperSnakeCase_ReturnsExpectedEnum(string json, Sample expected) + { + var holder = JsonSerializer.Deserialize($"{{\"Value\":{json}}}")!; + Assert.Equal(expected, holder.Value); + } + + [Fact] + public void Read_NullValue_ReturnsDefault() + { + var holder = JsonSerializer.Deserialize("{\"Value\":null}")!; + Assert.Equal(Sample.None, holder.Value); + } + + [Fact] + public void Read_CamelCase_ParsesCaseInsensitively() + { + // The converter first tries snake→pascal, then a raw case-insensitive parse. + // A camel-case input like "fooBar" hits the fallback path. + var holder = JsonSerializer.Deserialize("{\"Value\":\"fooBar\"}")!; + Assert.Equal(Sample.FooBar, holder.Value); + } + + [Fact] + public void Read_UnparsableValue_ThrowsJsonException() + { + // Unknown wire values must surface as JsonException rather than + // silently coercing to default(T) — otherwise an unrecognized + // service status would be indistinguishable from the zero value.
+ Assert.Throws(() => + JsonSerializer.Deserialize("{\"Value\":\"NOT_A_REAL_VALUE\"}")); + } + + [Fact] + public void Write_PascalCase_EmitsUpperSnake() + { + var json = JsonSerializer.Serialize(new Holder { Value = Sample.FooBar }); + Assert.Contains("\"FOO_BAR\"", json); + } + + [Fact] + public void Write_MultiWord_EmitsUpperSnake() + { + var json = JsonSerializer.Serialize(new Holder { Value = Sample.BazQuxQuux }); + Assert.Contains("\"BAZ_QUX_QUUX\"", json); + } + + [Fact] + public void Write_SingleWord_EmitsUpperWithoutUnderscores() + { + var json = JsonSerializer.Serialize(new Holder { Value = Sample.None }); + Assert.Contains("\"NONE\"", json); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForCallbackTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForCallbackTests.cs new file mode 100644 index 000000000..430df41c5 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForCallbackTests.cs @@ -0,0 +1,543 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class WaitForCallbackTests +{ + /// <summary>Reproduces the Id that <see cref="OperationIdGenerator"/> emits for the n-th root-level operation.</summary> + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// <summary>The hashed ID of the n-th child operation under <paramref name="parentOpId"/>.</summary> + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static TestLambdaContext CreateLambdaContext() => +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental.
+ new() { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + private static void WireServiceCallbackIdAllocation( + RecordingBatcher recorder, ExecutionState state, string callbackId) + { + recorder.OnFlush = ops => + { + foreach (var op in ops) + { + if (op.Type == OperationTypes.Callback && op.Action == "START") + { + state.AddOperations(new[] + { + new Operation + { + Id = op.Id, + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = op.Name, + CallbackDetails = new CallbackDetails { CallbackId = callbackId } + } + }); + } + } + }; + } + + [Fact] + public async Task WaitForCallbackAsync_FreshExecution_RunsSubmitterAndSuspendsForCallback() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-wait-1"); + + string? receivedCallbackId = null; + var resultTask = context.WaitForCallbackAsync( + async (callbackId, ctx) => + { + receivedCallbackId = callbackId; + Assert.NotNull(ctx.Logger); + await Task.CompletedTask; + }, + name: "approval"); + + // Race the suspended user task against termination — same idiom as the + // production handler. Once Terminate() is called inside the inner + // GetResultAsync, this completes immediately. 
+ var winner = await Task.WhenAny(resultTask, tm.TerminationTask); + Assert.Same(tm.TerminationTask, winner); + + Assert.True(tm.IsTerminated); + Assert.False(resultTask.IsCompleted); + Assert.Equal("cb-wait-1", receivedCallbackId); + + await recorder.Batcher.DrainAsync(); + + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}:{o.SubType}").ToArray(); + Assert.Equal(new[] + { + $"{OperationTypes.Context}:START:{OperationSubTypes.WaitForCallback}", + $"{OperationTypes.Callback}:START:{OperationSubTypes.Callback}", + $"{OperationTypes.Step}:START:{OperationSubTypes.Step}", + $"{OperationTypes.Step}:SUCCEED:{OperationSubTypes.Step}", + }, actions); + } + + [Fact] + public async Task WaitForCallbackAsync_FreshExecution_KebabSuffixedSubOpNames() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var resultTask = context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval"); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + var callbackStart = recorder.Flushed.Single(o => o.Type == OperationTypes.Callback); + var stepSucceed = recorder.Flushed.Single(o => o.Type == OperationTypes.Step && o.Action == "SUCCEED"); + + Assert.Equal("approval-callback", callbackStart.Name); + Assert.Equal("approval-submitter", stepSucceed.Name); + + // Avoid unobserved-task warning. 
+ _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_FreshExecution_NullParentName_LeavesSubOpsNameless() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var resultTask = context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + var callbackStart = recorder.Flushed.Single(o => o.Type == OperationTypes.Callback); + var stepSucceed = recorder.Flushed.Single(o => o.Type == OperationTypes.Step && o.Action == "SUCCEED"); + + Assert.Null(callbackStart.Name); + Assert.Null(stepSucceed.Name); + + _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_ChildOperationIdsDeterministic() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var resultTask = context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval"); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + // Parent CONTEXT has IdAt(1); the inner callback is child #1, the inner + // submitter step is child #2 (under the same parent context op id). 
+ var parentOpId = IdAt(1); + var callbackChildId = ChildIdAt(parentOpId, 1); + var submitterChildId = ChildIdAt(parentOpId, 2); + + Assert.Equal(callbackChildId, + recorder.Flushed.Single(o => o.Type == OperationTypes.Callback).Id); + Assert.Equal(submitterChildId, + recorder.Flushed.Single(o => o.Type == OperationTypes.Step && o.Action == "SUCCEED").Id); + + _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_CallbackTimeoutInheritsFromConfig() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var resultTask = context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval", + config: new WaitForCallbackConfig + { + Timeout = TimeSpan.FromHours(2), + HeartbeatTimeout = TimeSpan.FromMinutes(15), + }); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + var callbackStart = recorder.Flushed.Single(o => o.Type == OperationTypes.Callback); + Assert.NotNull(callbackStart.CallbackOptions); + Assert.Equal(7200, callbackStart.CallbackOptions.TimeoutSeconds); + Assert.Equal(900, callbackStart.CallbackOptions.HeartbeatTimeoutSeconds); + + _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayWithCallbackSucceeded_ReturnsResult() + { + // Full replay: parent CONTEXT SUCCEEDED with the callback's deserialized + // payload as its checkpointed result. 
+ var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + ContextDetails = new ContextDetails { Result = "\"approved\"" } + } + } + }); + + var executed = false; + var result = await context.WaitForCallbackAsync( + async (_, _) => { executed = true; await Task.CompletedTask; }, + name: "approval"); + + Assert.False(executed); // Replay returns cached without re-running submitter. + Assert.Equal("approved", result); + Assert.False(tm.IsTerminated); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayCallbackTimedOut_ThrowsCallbackTimeoutException() + { + // Inside-out replay: parent CONTEXT is STARTED (still in flight), + // inner callback is TIMED_OUT, inner submitter step has SUCCEEDED. 
+ var parentId = IdAt(1); + var callbackChildId = ChildIdAt(parentId, 1); + var submitterChildId = ChildIdAt(parentId, 2); + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + }, + new() + { + Id = callbackChildId, + Type = OperationTypes.Callback, + Status = OperationStatuses.TimedOut, + Name = "approval-callback", + ParentId = parentId, + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-to-1", + Error = new ErrorObject { ErrorMessage = "callback timed out" } + } + }, + new() + { + Id = submitterChildId, + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "approval-submitter", + ParentId = parentId, + StepDetails = new StepDetails { Result = "null" } + }, + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + Assert.Equal("callback timed out", ex.Message); + Assert.Equal("cb-to-1", ex.CallbackId); + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayCallbackFailed_ThrowsCallbackFailedException() + { + var parentId = IdAt(1); + var callbackChildId = ChildIdAt(parentId, 1); + var submitterChildId = ChildIdAt(parentId, 2); + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + }, + new() + { + Id = callbackChildId, + Type = OperationTypes.Callback, + Status = OperationStatuses.Failed, + Name = "approval-callback", + ParentId = parentId, + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-fail-1", + Error = new ErrorObject + { + ErrorType = "ExternalSystemError", + ErrorMessage = 
"external rejected" + } + } + }, + new() + { + Id = submitterChildId, + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "approval-submitter", + ParentId = parentId, + StepDetails = new StepDetails { Result = "null" } + }, + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + Assert.Equal("external rejected", ex.Message); + Assert.Equal("cb-fail-1", ex.CallbackId); + Assert.Equal("ExternalSystemError", ex.ErrorType); + } + + [Fact] + public async Task WaitForCallbackAsync_SubmitterFails_ThrowsCallbackSubmitterException() + { + // Replay: parent CONTEXT is FAILED with a Step-error inside. + var parentId = IdAt(1); + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = typeof(StepException).FullName, + ErrorMessage = "submitter API failed", + ErrorData = "{\"code\":\"500\"}", + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + Assert.IsAssignableFrom(ex); + Assert.Equal("submitter API failed", ex.Message); + // On the replay path the live StepException was lost across invocations; + // we preserve the StepException type-name string and carry the + // ChildContextException as the InnerException for traceability. 
+ Assert.NotNull(ex.InnerException); + Assert.Equal(typeof(StepException).FullName, ex.ErrorType); + Assert.Equal("{\"code\":\"500\"}", ex.ErrorData); + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayParentContextFailedWithCallbackTimeoutErrorType_PreservesSubclass() + { + // Subclass-fidelity guarantee: when the parent CONTEXT was checkpointed + // FAILED on a previous invocation with a CallbackTimeoutException + // ErrorType, replay must surface CallbackTimeoutException — not the + // more generic CallbackFailedException — so user catch blocks behave + // identically across live and replay paths. + var parentId = IdAt(1); + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = typeof(CallbackTimeoutException).FullName, + ErrorMessage = "callback timed out after 24h", + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + // Concrete-type check: not just `is CallbackException` — must be the + // CallbackTimeoutException subclass exactly. + Assert.Equal(typeof(CallbackTimeoutException), ex.GetType()); + Assert.Equal("callback timed out after 24h", ex.Message); + Assert.Equal(typeof(CallbackTimeoutException).FullName, ex.ErrorType); + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayParentContextFailedWithCallbackFailedErrorType_RemapsToCallbackFailed() + { + // Companion case: a stored CallbackFailedException ErrorType remaps to + // CallbackFailedException (not the base or CallbackTimeoutException). 
+ var parentId = IdAt(1); + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = typeof(CallbackFailedException).FullName, + ErrorMessage = "external rejected", + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + Assert.Equal(typeof(CallbackFailedException), ex.GetType()); + Assert.Equal("external rejected", ex.Message); + } + + [Fact] + public async Task WaitForCallbackAsync_RetryStrategyForwardedToSubmitterStep() + { + // Verifies the WaitForCallbackConfig.RetryStrategy gets passed into the + // submitter step's StepConfig (via the kebab "-submitter" inner step). + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var seenAttempts = new List(); + var resultTask = context.WaitForCallbackAsync( + async (_, ctx) => + { + // The submitter receives an IWaitForCallbackContext (no AttemptNumber) + // — but this test doesn't need to verify retry mechanics, only + // that the StepConfig with a retry strategy is wired through. + seenAttempts.Add(seenAttempts.Count + 1); + await Task.CompletedTask; + }, + name: "approval", + config: new WaitForCallbackConfig + { + RetryStrategy = new CountingRetryStrategy() + }); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + // Submitter ran exactly once (no failures to retry); a single STEP SUCCEED + // is sufficient evidence that the strategy was wired without throwing. 
+ Assert.Single(recorder.Flushed.Where(o => o.Type == OperationTypes.Step && o.Action == "SUCCEED")); + + _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_SubmitterContext_IsIWaitForCallbackContext_NotIStepContext() + { + // Verifies the submitter delegate receives our distinct + // IWaitForCallbackContext type (not IStepContext) — protects the + // architectural decision against accidental conflation. + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + Type? observedContextType = null; + var resultTask = context.WaitForCallbackAsync( + async (_, ctx) => + { + observedContextType = ctx.GetType(); + await Task.CompletedTask; + }, + name: "approval"); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + Assert.NotNull(observedContextType); + Assert.True(typeof(IWaitForCallbackContext).IsAssignableFrom(observedContextType)); + Assert.False(typeof(IStepContext).IsAssignableFrom(observedContextType)); + + _ = resultTask; + } + + private sealed class CountingRetryStrategy : IRetryStrategy + { + public int Attempts; + public RetryDecision ShouldRetry(Exception exception, int attemptNumber) + { + Attempts = attemptNumber; + return RetryDecision.DoNotRetry(); + } + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForConditionOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForConditionOperationTests.cs new file mode 100644 index 000000000..6d355c47a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForConditionOperationTests.cs @@ -0,0 +1,1106 @@ +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using Xunit; +using ILambdaSerializer = 
Amazon.Lambda.Core.ILambdaSerializer; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class WaitForConditionOperationTests +{ + /// <summary>Reproduces the Id that <see cref="OperationIdGenerator"/> emits for the n-th root-level operation.</summary> + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + private static TestLambdaContext CreateLambdaContext(ILambdaSerializer? serializer = null) => +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. + new() { Serializer = serializer ?? new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null, ILambdaSerializer? serializer = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(serializer); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + // ── Fresh execution ───────────────────────────────────────────────── + + [Fact] + public async Task FreshExecution_StrategyStopsImmediately_SucceedsWithFinalState() + { + var (context, recorder, tm, _) = CreateContext(); + + // The check function "advances" the state to 42; the strategy's + // isDone predicate matches immediately. This exercises the synchronous + // success path with no polling iterations.
+ int checkInvocations = 0; + var result = await context.WaitForConditionAsync( + check: async (state, ctx) => + { + checkInvocations++; + Assert.Equal(checkInvocations, ctx.AttemptNumber); + await Task.CompletedTask; + return 42; + }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Exponential(isDone: s => s == 42) + }, + name: "poll"); + + Assert.Equal(42, result); + Assert.Equal(1, checkInvocations); + Assert.False(tm.IsTerminated); + + await recorder.Batcher.DrainAsync(); + + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Equal(new[] { "STEP:START", "STEP:SUCCEED" }, actions); + + var succeed = recorder.Flushed.Single(o => o.Action == "SUCCEED"); + Assert.Equal(IdAt(1), succeed.Id); + Assert.Equal("WaitForCondition", succeed.SubType); + Assert.Equal("poll", succeed.Name); + Assert.Equal("42", succeed.Payload); + } + + [Fact] + public async Task FreshExecution_StrategyContinues_EmitsRetryAndSuspends() + { + var (context, recorder, tm, _) = CreateContext(); + + // Strategy says continue → operation must emit RETRY and suspend. 
+ var task = context.WaitForConditionAsync( + check: async (state, _) => { await Task.CompletedTask; return state + 1; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(3), maxAttempts: 10) + }, + name: "poll"); + + await Task.Delay(50); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + await recorder.Batcher.DrainAsync(); + + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Equal(new[] { "STEP:START", "STEP:RETRY" }, actions); + + var retry = recorder.Flushed.Single(o => o.Action == "RETRY"); + Assert.Equal("WaitForCondition", retry.SubType); + Assert.Equal("1", retry.Payload); // state advanced to 1 + Assert.NotNull(retry.StepOptions); + Assert.Equal(3, retry.StepOptions.NextAttemptDelaySeconds); + } + + [Fact] + public async Task FreshExecution_UsesInitialStateOnFirstCall() + { + var (context, _, _, _) = CreateContext(); + + int? observedInitial = null; + await context.WaitForConditionAsync( + check: async (state, _) => + { + observedInitial ??= state; + await Task.CompletedTask; + return state; + }, + config: new WaitForConditionConfig + { + InitialState = 99, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), maxAttempts: 10, isDone: _ => true) + }, + name: "poll"); + + Assert.Equal(99, observedInitial); + } + + [Fact] + public async Task FreshExecution_AttemptNumberIs1OnFirstCall() + { + var (context, _, _, _) = CreateContext(); + + int observed = -1; + await context.WaitForConditionAsync( + check: async (state, ctx) => + { + observed = ctx.AttemptNumber; + await Task.CompletedTask; + return state; + }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), maxAttempts: 5, isDone: _ => true) + }); + + Assert.Equal(1, observed); + } + + [Fact] + public async Task CheckContext_ExposesLogger() + { + // The check function receives an IConditionCheckContext 
whose Logger + // is a real ILogger forwarded from the durable runtime — user code + // can use it to emit observability without threading a logger in. + var (context, _, _, _) = CreateContext(); + + ILogger? observedLogger = null; + await context.WaitForConditionAsync( + check: async (state, ctx) => + { + observedLogger = ctx.Logger; + await Task.CompletedTask; + return state; + }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), maxAttempts: 5, isDone: _ => true) + }); + + Assert.NotNull(observedLogger); + } + + // ── Replay paths ──────────────────────────────────────────────────── + + [Fact] + public async Task Replay_Succeeded_ReturnsCachedAndSkipsCheck() + { + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Succeeded, + Name = "poll", + StepDetails = new StepDetails { Result = "7" } + } + } + }); + + var checkInvoked = false; + var result = await context.WaitForConditionAsync( + check: async (_, _) => { checkInvoked = true; await Task.CompletedTask; return 0; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + name: "poll"); + + Assert.False(checkInvoked); + Assert.Equal(7, result); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task Replay_PendingTimerNotFired_ReSuspends() + { + // NextAttemptTimestamp 1 hour in the future → timer hasn't fired, + // operation must re-suspend without re-checkpointing or re-running. 
+ var futureMs = DateTimeOffset.UtcNow.AddHours(1).ToUnixTimeMilliseconds(); + + var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Pending, + Name = "poll", + StepDetails = new StepDetails + { + Result = "5", + Attempt = 2, + NextAttemptTimestamp = futureMs + } + } + } + }); + + var checkInvoked = false; + var task = context.WaitForConditionAsync( + check: async (_, _) => { checkInvoked = true; await Task.CompletedTask; return 0; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + name: "poll"); + + await Task.Delay(50); + + Assert.False(checkInvoked); + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task Replay_PendingTimerFired_ResumesWithCheckpointedState() + { + // NextAttemptTimestamp 1 hour in the past → timer fired (service + // hasn't yet stamped READY but the deadline is met). Continue. + var pastMs = DateTimeOffset.UtcNow.AddHours(-1).ToUnixTimeMilliseconds(); + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Pending, + Name = "poll", + StepDetails = new StepDetails + { + Result = "5", + Attempt = 2, + NextAttemptTimestamp = pastMs + } + } + } + }); + + int? observedState = null; + int? 
observedAttempt = null; + var result = await context.WaitForConditionAsync( + check: async (state, ctx) => + { + observedState = state; + observedAttempt = ctx.AttemptNumber; + await Task.CompletedTask; + return state; // condition met (isDone returns true) + }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), isDone: _ => true) + }, + name: "poll"); + + // Critical: state survives across iterations. Check receives the + // PRIOR state (5, from the prior RETRY's payload), not InitialState (0). + Assert.Equal(5, observedState); + Assert.Equal(3, observedAttempt); // prior attempt was 2, this is attempt 3 + Assert.Equal(5, result); + + await recorder.Batcher.DrainAsync(); + + // No new START — original is authoritative. + Assert.DoesNotContain(recorder.Flushed, o => o.Action == "START"); + Assert.Contains(recorder.Flushed, o => o.Action == "SUCCEED"); + } + + [Fact] + public async Task Replay_Ready_ResumesWithCheckpointedState() + { + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Ready, + Name = "poll", + StepDetails = new StepDetails + { + Result = "11", + Attempt = 3 + } + } + } + }); + + int? observedState = null; + int? 
observedAttempt = null; + var result = await context.WaitForConditionAsync( + check: async (state, ctx) => + { + observedState = state; + observedAttempt = ctx.AttemptNumber; + await Task.CompletedTask; + return state * 2; + }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), isDone: _ => true) + }, + name: "poll"); + + Assert.Equal(11, observedState); + Assert.Equal(4, observedAttempt); // prior=3 → next=4 + Assert.Equal(22, result); + + await recorder.Batcher.DrainAsync(); + Assert.DoesNotContain(recorder.Flushed, o => o.Action == "START"); + } + + [Fact] + public async Task Replay_Started_ResumesWithInitialState() + { + // STARTED with no payload means the very first check attempt was + // lost (Lambda crash before RETRY/SUCCEED). Re-execute with + // InitialState since no prior state is available. + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Started, + Name = "poll" + } + } + }); + + int? observedState = null; + int? observedAttempt = null; + var result = await context.WaitForConditionAsync( + check: async (state, ctx) => + { + observedState = state; + observedAttempt = ctx.AttemptNumber; + await Task.CompletedTask; + return state + 100; + }, + config: new WaitForConditionConfig + { + InitialState = 50, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), isDone: _ => true) + }, + name: "poll"); + + Assert.Equal(50, observedState); // InitialState is the seed + Assert.Equal(1, observedAttempt); + Assert.Equal(150, result); + + await recorder.Batcher.DrainAsync(); + // Do NOT re-emit START on STARTED replay. 
+ Assert.DoesNotContain(recorder.Flushed, o => o.Action == "START"); + } + + [Fact] + public async Task Replay_Failed_FromCheckException_ThrowsStepException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Failed, + Name = "poll", + StepDetails = new StepDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "check went wrong", + StackTrace = new[] { "at A.B()" } + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; return 0; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + name: "poll")); + + Assert.Equal("check went wrong", ex.Message); + Assert.Equal("System.InvalidOperationException", ex.ErrorType); + } + + [Fact] + public async Task Replay_Failed_FromMaxAttempts_ThrowsWaitForConditionException() + { + // The FAIL checkpoint records LastState in Error.ErrorData (the wire + // protocol disallows a Payload on FAIL updates) so replay can + // reconstruct an identically-populated exception. Live execution sets + // the same field in MaxAttemptsExhausted_FreshExecution. 
+ var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Failed, + Name = "poll", + StepDetails = new StepDetails + { + Attempt = 3, + Error = new ErrorObject + { + ErrorType = typeof(WaitForConditionException).FullName, + ErrorMessage = "exhausted", + ErrorData = "42" + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; return 0; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + name: "poll")); + + Assert.Equal(3, ex.AttemptsExhausted); + Assert.Equal("exhausted", ex.Message); + Assert.Equal(42, ex.LastState); // round-tripped from FAIL Error.ErrorData + } + + [Fact] + public async Task Replay_Failed_FromMaxAttempts_LastState_MatchesLiveExecution() + { + // Live execution path: exhaust max-attempts and capture the + // exception's LastState. Then construct a FAIL checkpoint mirroring + // what was written, replay, and assert LastState round-trips. 
+ var (liveCtx, liveRecorder, _, _) = CreateContext(); + + var liveEx = await Assert.ThrowsAsync(() => + liveCtx.WaitForConditionAsync( + check: async (state, _) => { await Task.CompletedTask; return state + 1; }, + config: new WaitForConditionConfig + { + InitialState = 5, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), maxAttempts: 1) + }, + name: "poll")); + + await liveRecorder.Batcher.DrainAsync(); + var failUpdate = liveRecorder.Flushed.Single(o => o.Action == "FAIL"); + Assert.Null(failUpdate.Payload); // wire protocol forbids Payload on FAIL + Assert.Equal("6", failUpdate.Error?.ErrorData); // last state was 5+1=6, stored in ErrorData + + // Reconstruct the operation as the service would echo it back on + // replay (Error → StepDetails.Error; LastState lives in Error.ErrorData). + var (replayCtx, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Failed, + Name = "poll", + StepDetails = new StepDetails + { + Attempt = liveEx.AttemptsExhausted, + Error = new ErrorObject + { + ErrorType = failUpdate.Error?.ErrorType, + ErrorMessage = failUpdate.Error?.ErrorMessage, + ErrorData = failUpdate.Error?.ErrorData + } + } + } + } + }); + + var replayEx = await Assert.ThrowsAsync(() => + replayCtx.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; return 0; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), maxAttempts: 1) + }, + name: "poll")); + + Assert.Equal(liveEx.AttemptsExhausted, replayEx.AttemptsExhausted); + Assert.NotNull(replayEx.LastState); + Assert.Equal(liveEx.LastState, replayEx.LastState); + } + + [Fact] + public async Task Replay_Failed_FromMaxAttempts_NullPayload_LeavesLastStateNull() + { + // Backwards-compat: a FAIL checkpoint produced before LastState + // was stored in 
ErrorData (or one that lost its ErrorData) should + // not blow up — LastState falls back to null. + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Failed, + Name = "poll", + StepDetails = new StepDetails + { + Attempt = 2, + Error = new ErrorObject + { + ErrorType = typeof(WaitForConditionException).FullName, + ErrorMessage = "exhausted" + // ErrorData intentionally null (legacy FAIL). + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; return 0; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + name: "poll")); + + Assert.Equal(2, ex.AttemptsExhausted); + Assert.Null(ex.LastState); + } + + // ── Max attempts exhaustion ───────────────────────────────────────── + + [Fact] + public async Task MaxAttemptsExhausted_FreshExecution_ThrowsWaitForConditionException() + { + var (context, recorder, _, _) = CreateContext(); + + // maxAttempts=1 + isDone always false → strategy stops on attempt 1 + // but it's because the counter is saturated, NOT because the + // condition was met. Operation must throw, not SUCCEED. 
+ var ex = await Assert.ThrowsAsync(() => + context.WaitForConditionAsync( + check: async (state, _) => { await Task.CompletedTask; return state + 1; }, + config: new WaitForConditionConfig + { + InitialState = 5, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), maxAttempts: 1) + }, + name: "poll")); + + Assert.Equal(1, ex.AttemptsExhausted); + Assert.Equal(6, ex.LastState); // last state observed was 5+1 + + await recorder.Batcher.DrainAsync(); + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Equal(new[] { "STEP:START", "STEP:FAIL" }, actions); + + var fail = recorder.Flushed.Single(o => o.Action == "FAIL"); + Assert.Equal("WaitForCondition", fail.SubType); + Assert.NotNull(fail.Error); + Assert.Equal(typeof(WaitForConditionException).FullName, fail.Error.ErrorType); + // LastState round-trips through Error.ErrorData (wire protocol forbids + // a Payload on FAIL). See Replay_Failed_FromMaxAttempts_LastState_MatchesLiveExecution. + Assert.Null(fail.Payload); + Assert.Equal("6", fail.Error.ErrorData); + } + + [Fact] + public async Task MaxAttemptsExhausted_DistinguishesFromConditionMet() + { + var (context, _, _, _) = CreateContext(); + + // The same maxAttempts=1 strategy WITH an isDone that's satisfied + // should SUCCEED, not throw. 
+ var result = await context.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; return 99; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed( + TimeSpan.FromSeconds(1), + maxAttempts: 1, + isDone: state => state == 99) + }, + name: "poll"); + + Assert.Equal(99, result); + } + + // ── Check function exception ──────────────────────────────────────── + + [Fact] + public async Task CheckThrows_CheckpointsFailAndThrows() + { + var (context, recorder, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; throw new InvalidOperationException("boom"); }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + name: "poll")); + + Assert.Equal("boom", ex.Message); + Assert.Equal("System.InvalidOperationException", ex.ErrorType); + + await recorder.Batcher.DrainAsync(); + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Equal(new[] { "STEP:START", "STEP:FAIL" }, actions); + + var fail = recorder.Flushed.Single(o => o.Action == "FAIL"); + Assert.Equal("WaitForCondition", fail.SubType); + Assert.Equal("System.InvalidOperationException", fail.Error?.ErrorType); + } + + // ── Replay determinism: state survives iterations ─────────────────── + + [Fact] + public async Task ReplayDeterminism_StateIsCarriedAcrossIterations() + { + // Simulate a multi-iteration history: invocation N had advanced the + // state to {Count=3}; invocation N+1 should pick that up and + // continue from there. 
+ var pastMs = DateTimeOffset.UtcNow.AddSeconds(-1).ToUnixTimeMilliseconds(); + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Ready, + Name = "counter", + StepDetails = new StepDetails + { + Result = """{"Count":3}""", + Attempt = 3, + NextAttemptTimestamp = pastMs + } + } + } + }); + + CounterState? observed = null; + int? observedAttempt = null; + var result = await context.WaitForConditionAsync( + check: async (state, ctx) => + { + observed = state; + observedAttempt = ctx.AttemptNumber; + await Task.CompletedTask; + return new CounterState { Count = state.Count + 1 }; + }, + config: new WaitForConditionConfig + { + InitialState = new CounterState { Count = 0 }, // ignored on replay + WaitStrategy = WaitStrategy.Fixed( + TimeSpan.FromSeconds(1), + maxAttempts: 100, + isDone: c => c.Count >= 4) // stop when we hit 4 + }, + name: "counter"); + + // Started from the checkpointed counter=3 (NOT InitialState=0), + // incremented to 4, isDone returned true, returned 4. 
+ Assert.Equal(3, observed?.Count); + Assert.Equal(4, observedAttempt); + Assert.Equal(4, result.Count); + + await recorder.Batcher.DrainAsync(); + var succeed = recorder.Flushed.Single(o => o.Action == "SUCCEED"); + Assert.Equal("""{"Count":4}""", succeed.Payload); + } + + [Fact] + public async Task ReplayDeterminism_RoundTripsThroughLambdaSerializer() + { + var serializer = new RecordingPersonSerializer(); + var (context, _, _, _) = CreateContext( + new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Succeeded, + Name = "poll", + StepDetails = new StepDetails { Result = "Marie,30" } + } + } + }, + serializer: serializer); + + var result = await context.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; return new TestPerson { Name = "ignored", Age = 0 }; }, + config: new WaitForConditionConfig + { + InitialState = new TestPerson { Name = "init", Age = 0 }, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + name: "poll"); + + Assert.True(serializer.DeserializeCalled); + Assert.Equal("Marie", result.Name); + Assert.Equal(30, result.Age); + } + + // ── Sync-flush of START before suspending ─────────────────────────── + + [Fact] + public async Task FreshExecution_FlushesStartBeforeSuspending() + { + // The START checkpoint MUST be persisted before the workflow + // suspends — otherwise the service has no record of the polling op + // and replay can't find it. 
+ var (context, recorder, tm, _) = CreateContext(); + + var task = context.WaitForConditionAsync( + check: async (state, _) => { await Task.CompletedTask; return state + 1; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(5), maxAttempts: 10) + }, + name: "poll"); + + await Task.Delay(50); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + // At the moment of suspension, both START and RETRY must already be + // flushed (sync-enqueued ahead of SuspendAndAwait). No drain needed. + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Contains("STEP:START", actions); + Assert.Contains("STEP:RETRY", actions); + } + + // ── Replay non-determinism guards ─────────────────────────────────── + + [Fact] + public async Task ReplayUnknownStatus_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = "BOGUS", + Name = "poll" + } + } + }); + + await Assert.ThrowsAsync(() => + context.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; return 0; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + name: "poll")); + } + + [Fact] + public async Task ReplayTypeMismatch_ThrowsNonDeterministicException() + { + // Same Id but a different Type — operation order changed between + // deployments. The base class's ValidateReplayConsistency catches it. 
+ var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = OperationStatuses.Succeeded, + Name = "poll" + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; return 0; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + name: "poll")); + + Assert.Contains("expected type 'STEP'", ex.Message); + } + + // ── Argument validation ───────────────────────────────────────────── + + [Fact] + public async Task NullCheck_ThrowsArgumentNullException() + { + var (context, _, _, _) = CreateContext(); + await Assert.ThrowsAsync(() => + context.WaitForConditionAsync( + check: null!, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + })); + } + + [Fact] + public async Task NullConfig_ThrowsArgumentNullException() + { + var (context, _, _, _) = CreateContext(); + await Assert.ThrowsAsync(() => + context.WaitForConditionAsync( + check: async (_, _) => { await Task.CompletedTask; return 0; }, + config: null!)); + } + + // ── Observability: warning on payload deserialization failure ────── + + [Fact] + public async Task DeserializeStateOrInitial_CorruptPayload_LogsWarningAndFallsBack() + { + // A READY checkpoint with a payload the serializer cannot read should + // NOT fail the workflow (Python parity); it should fall back to + // InitialState. The recovery should be logged at Warning level so + // corruption / schema-migrations are observable. 
+ var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Ready, + Name = "poll", + StepDetails = new StepDetails { Result = "this-is-not-valid", Attempt = 2 } + } + } + }); + + var recorder = new RecordingBatcher(); + var logger = new RecordingLogger(); + + var op = new WaitForConditionOperation( + operationId: IdAt(1), + name: "poll", + parentId: null, + check: async (s, _) => { await Task.CompletedTask; return s; }, + config: new WaitForConditionConfig + { + InitialState = 999, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1), isDone: _ => true) + }, + serializer: new ThrowingLambdaSerializer(), + logger: logger, + state: state, + termination: new TerminationManager(), + durableExecutionArn: "arn:test", + batcher: recorder.Batcher); + + var result = await op.ExecuteAsync(CancellationToken.None); + + Assert.Equal(999, result); // fell back to InitialState + var warning = Assert.Single(logger.Entries.Where(e => e.Level == LogLevel.Warning)); + Assert.Contains("failed to deserialize prior state", warning.Message); + Assert.Contains(IdAt(1), warning.Message); + } + + [Fact] + public async Task ReplayFailed_CorruptLastStatePayload_LogsWarningAndLastStateNull() + { + // FAIL replay's LastState recovery: same observability story — if the + // FAIL Error.ErrorData can't be deserialized, log a warning and + // surface LastState=null instead of throwing. 
+ var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + SubType = OperationSubTypes.WaitForCondition, + Status = OperationStatuses.Failed, + Name = "poll", + StepDetails = new StepDetails + { + Attempt = 4, + Error = new ErrorObject + { + ErrorType = typeof(WaitForConditionException).FullName, + ErrorMessage = "exhausted", + ErrorData = "bogus-payload" + } + } + } + } + }); + + var recorder = new RecordingBatcher(); + var logger = new RecordingLogger(); + + var op = new WaitForConditionOperation( + operationId: IdAt(1), + name: "poll", + parentId: null, + check: async (s, _) => { await Task.CompletedTask; return s; }, + config: new WaitForConditionConfig + { + InitialState = 0, + WaitStrategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(1)) + }, + serializer: new ThrowingLambdaSerializer(), + logger: logger, + state: state, + termination: new TerminationManager(), + durableExecutionArn: "arn:test", + batcher: recorder.Batcher); + + var ex = await Assert.ThrowsAsync(() => op.ExecuteAsync(CancellationToken.None)); + + Assert.Equal(4, ex.AttemptsExhausted); + Assert.Null(ex.LastState); + var warning = Assert.Single(logger.Entries.Where(e => e.Level == LogLevel.Warning)); + Assert.Contains("failed to deserialize LastState", warning.Message); + } + + // ── Test helpers ──────────────────────────────────────────────────── + + private class CounterState + { + public int Count { get; set; } + } + + private class TestPerson + { + public string? Name { get; set; } + public int Age { get; set; } + } + + /// + /// ILambdaSerializer that round-trips through a + /// custom non-JSON wire format so tests can verify the serializer on + /// ILambdaContext.Serializer is the one used during checkpointing. 
+ /// + private class RecordingPersonSerializer : ILambdaSerializer + { + public bool SerializeCalled { get; private set; } + public bool DeserializeCalled { get; private set; } + + public void Serialize(T response, Stream responseStream) + { + SerializeCalled = true; + var person = (TestPerson)(object)response!; + using var writer = new StreamWriter(responseStream, leaveOpen: true); + writer.Write($"{person.Name},{person.Age}"); + } + + public T Deserialize(Stream requestStream) + { + DeserializeCalled = true; + using var reader = new StreamReader(requestStream); + var data = reader.ReadToEnd(); + var inner = data.Replace("", "").Replace("", ""); + var parts = inner.Split(','); + var person = new TestPerson { Name = parts[0], Age = int.Parse(parts[1]) }; + return (T)(object)person; + } + } + + /// Serializer whose Deserialize always throws — exercises the fallback paths. + private sealed class ThrowingLambdaSerializer : ILambdaSerializer + { + public void Serialize(T response, Stream responseStream) + { + using var writer = new StreamWriter(responseStream, leaveOpen: true); + writer.Write(response?.ToString() ?? string.Empty); + } + + public T Deserialize(Stream requestStream) + { + using var reader = new StreamReader(requestStream); + var data = reader.ReadToEnd(); + throw new InvalidOperationException($"cannot deserialize '{data}'"); + } + } + + /// Captures log calls so tests can assert on level and rendered message. + private sealed class RecordingLogger : ILogger + { + public List<(LogLevel Level, string Message)> Entries { get; } = new(); + + public IDisposable? BeginScope(TState state) where TState : notnull => null; + public bool IsEnabled(LogLevel logLevel) => true; + public void Log(LogLevel logLevel, EventId eventId, TState state, Exception? 
exception, Func formatter) + => Entries.Add((logLevel, formatter(state, exception))); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitStrategyTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitStrategyTests.cs new file mode 100644 index 000000000..f03635326 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitStrategyTests.cs @@ -0,0 +1,226 @@ +using Amazon.Lambda.DurableExecution; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class WaitStrategyTests +{ + [Fact] + public void Exponential_Defaults_MatchReferenceSDKs() + { + // Reference SDKs (Python, JS, Java) all default to: + // maxAttempts=60, initialDelay=5s, maxDelay=300s, backoff=1.5x, FullJitter. + // Verify by exercising the boundary: an attempt one short of 60 + // continues; the 60th throws (matches the JS SDK pattern of + // signaling max-attempts via exception so the operation can produce + // a WaitForConditionException carrying the last state). + var strategy = WaitStrategy.Exponential(); + + Assert.True(strategy.Decide("any", 1).ShouldContinue); + Assert.True(strategy.Decide("any", 59).ShouldContinue); + + var ex = Assert.Throws(() => strategy.Decide("any", 60)); + Assert.Equal(60, ex.AttemptsExhausted); + } + + [Fact] + public void Exponential_NoIsDone_ThrowsAtMaxAttempts() + { + var strategy = WaitStrategy.Exponential(maxAttempts: 5); + + Assert.True(strategy.Decide(0, 1).ShouldContinue); + Assert.True(strategy.Decide(0, 4).ShouldContinue); + + var ex = Assert.Throws(() => strategy.Decide(0, 5)); + Assert.Equal(5, ex.AttemptsExhausted); + } + + [Fact] + public void Exponential_IsDoneTrue_StopsRegardlessOfAttempt() + { + var strategy = WaitStrategy.Exponential( + maxAttempts: 100, + isDone: state => state >= 10); + + // Predicate is the gate, not the attempt counter. 
+ Assert.True(strategy.Decide(5, 1).ShouldContinue); + Assert.False(strategy.Decide(10, 1).ShouldContinue); + Assert.False(strategy.Decide(15, 1).ShouldContinue); + } + + [Fact] + public void Exponential_DelayGrowsAndCapsAtMax() + { + var strategy = WaitStrategy.Exponential( + maxAttempts: 20, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(20), + backoffRate: 2.0, + jitter: JitterStrategy.None); + + Assert.Equal(TimeSpan.FromSeconds(2), strategy.Decide(0, 1).Delay); + Assert.Equal(TimeSpan.FromSeconds(4), strategy.Decide(0, 2).Delay); + Assert.Equal(TimeSpan.FromSeconds(8), strategy.Decide(0, 3).Delay); + Assert.Equal(TimeSpan.FromSeconds(16), strategy.Decide(0, 4).Delay); + // 2 * 2^4 = 32, capped at 20. + Assert.Equal(TimeSpan.FromSeconds(20), strategy.Decide(0, 5).Delay); + } + + [Fact] + public void Exponential_FullJitter_StaysWithinBounds() + { + var strategy = WaitStrategy.Exponential( + maxAttempts: 20, + initialDelay: TimeSpan.FromSeconds(10), + maxDelay: TimeSpan.FromSeconds(100), + backoffRate: 2.0, + jitter: JitterStrategy.Full); + + for (int i = 0; i < 50; i++) + { + var d = strategy.Decide(0, 1).Delay; + // With Full jitter at attempt 1: between 1 (floor) and 10 inclusive. + Assert.True(d >= TimeSpan.FromSeconds(1)); + Assert.True(d <= TimeSpan.FromSeconds(10)); + } + } + + [Fact] + public void Exponential_HalfJitter_StaysWithinBounds() + { + // Half-jitter formula: cappedDelay * (0.5 + 0.5 * rand) ⇒ output is in + // [cappedDelay/2, cappedDelay], then ceilinged to whole seconds with a + // 1-second floor. At attempt 3 with initialDelay=10s, backoff=2.0: + // cappedDelay = min(10 * 2^2, 100) = 40s ⇒ output ∈ [20, 40] seconds. 
+ var strategy = WaitStrategy.Exponential( + maxAttempts: 20, + initialDelay: TimeSpan.FromSeconds(10), + maxDelay: TimeSpan.FromSeconds(100), + backoffRate: 2.0, + jitter: JitterStrategy.Half); + + for (int i = 0; i < 50; i++) + { + var d = strategy.Decide(0, 3).Delay; + Assert.True(d >= TimeSpan.FromSeconds(20), $"expected >= 20s, got {d}"); + Assert.True(d <= TimeSpan.FromSeconds(40), $"expected <= 40s, got {d}"); + } + } + + [Fact] + public void Linear_DefaultsAreSensible() + { + // Default: 5s initial, +5s per attempt, no cap, 60 attempts. + var strategy = WaitStrategy.Linear(); + + Assert.Equal(TimeSpan.FromSeconds(5), strategy.Decide(0, 1).Delay); + Assert.Equal(TimeSpan.FromSeconds(10), strategy.Decide(0, 2).Delay); + Assert.Equal(TimeSpan.FromSeconds(15), strategy.Decide(0, 3).Delay); + } + + [Fact] + public void Linear_RespectsMaxDelay() + { + var strategy = WaitStrategy.Linear( + maxAttempts: 10, + initialDelay: TimeSpan.FromSeconds(2), + increment: TimeSpan.FromSeconds(3), + maxDelay: TimeSpan.FromSeconds(8)); + + Assert.Equal(TimeSpan.FromSeconds(2), strategy.Decide(0, 1).Delay); + Assert.Equal(TimeSpan.FromSeconds(5), strategy.Decide(0, 2).Delay); + Assert.Equal(TimeSpan.FromSeconds(8), strategy.Decide(0, 3).Delay); + // 2+3*3=11, capped to 8. 
+ Assert.Equal(TimeSpan.FromSeconds(8), strategy.Decide(0, 4).Delay); + } + + [Fact] + public void Linear_ThrowsAtMaxAttempts() + { + var strategy = WaitStrategy.Linear(maxAttempts: 3); + + Assert.True(strategy.Decide(0, 1).ShouldContinue); + Assert.True(strategy.Decide(0, 2).ShouldContinue); + Assert.Throws(() => strategy.Decide(0, 3)); + } + + [Fact] + public void Linear_IsDonePredicate_ShortCircuits() + { + var strategy = WaitStrategy.Linear( + maxAttempts: 100, + isDone: state => state == 42); + + Assert.True(strategy.Decide(1, 1).ShouldContinue); + Assert.False(strategy.Decide(42, 1).ShouldContinue); + } + + [Fact] + public void Fixed_AlwaysReturnsSameDelay() + { + var strategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(7), maxAttempts: 5); + + Assert.Equal(TimeSpan.FromSeconds(7), strategy.Decide(0, 1).Delay); + Assert.Equal(TimeSpan.FromSeconds(7), strategy.Decide(0, 2).Delay); + Assert.Equal(TimeSpan.FromSeconds(7), strategy.Decide(0, 4).Delay); + } + + [Fact] + public void Fixed_ThrowsAtMaxAttempts() + { + var strategy = WaitStrategy.Fixed(TimeSpan.FromSeconds(2), maxAttempts: 3); + + Assert.True(strategy.Decide(0, 1).ShouldContinue); + Assert.True(strategy.Decide(0, 2).ShouldContinue); + Assert.Throws(() => strategy.Decide(0, 3)); + } + + [Fact] + public void Fixed_FloorsDelayAtOneSecond() + { + // Service timer granularity is 1 second; sub-second delays would + // round to 0 if we didn't floor. 
+ var strategy = WaitStrategy.Fixed(TimeSpan.FromMilliseconds(100), maxAttempts: 3); + var decision = strategy.Decide(0, 1); + Assert.True(decision.ShouldContinue); + Assert.Equal(TimeSpan.FromSeconds(1), decision.Delay); + } + + [Fact] + public void Fixed_IsDonePredicate_ShortCircuits() + { + var strategy = WaitStrategy.Fixed( + TimeSpan.FromSeconds(1), + maxAttempts: 50, + isDone: state => state); + + Assert.True(strategy.Decide(false, 1).ShouldContinue); + Assert.False(strategy.Decide(true, 1).ShouldContinue); + } + + [Fact] + public void FromDelegate_UsesProvidedFunction() + { + var strategy = WaitStrategy.FromDelegate((state, attempt) => + state >= 3 || attempt >= 5 + ? WaitDecision.Stop() + : WaitDecision.ContinueAfter(TimeSpan.FromSeconds(state + 1))); + + Assert.True(strategy.Decide(0, 1).ShouldContinue); + Assert.Equal(TimeSpan.FromSeconds(1), strategy.Decide(0, 1).Delay); + Assert.False(strategy.Decide(3, 1).ShouldContinue); + Assert.False(strategy.Decide(0, 5).ShouldContinue); + } + + [Fact] + public void WaitDecision_StopAndContinueAfter_ProduceExpectedShape() + { + var stop = WaitDecision.Stop(); + Assert.False(stop.ShouldContinue); + Assert.Equal(TimeSpan.Zero, stop.Delay); + + var cont = WaitDecision.ContinueAfter(TimeSpan.FromSeconds(3)); + Assert.True(cont.ShouldContinue); + Assert.Equal(TimeSpan.FromSeconds(3), cont.Delay); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.runsettings b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.runsettings new file mode 100644 index 000000000..6c38b1258 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.runsettings @@ -0,0 +1,15 @@ + + + + + + + cobertura + [Amazon.Lambda.DurableExecution]* + [Amazon.Lambda.DurableExecution.Tests]* + GeneratedCodeAttribute + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.sh b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.sh new file mode 100644 index 
000000000..b953bd07e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +set -e +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$HERE/../../.." && pwd)" +PROJ="$HERE/Amazon.Lambda.DurableExecution.Tests.csproj" +OUT="$HERE/TestResults" + +rm -rf "$OUT" +dotnet test "$PROJ" -c Release \ + --collect:"XPlat Code Coverage" \ + --settings "$HERE/coverage.runsettings" \ + --results-directory "$OUT" + +REPORT_FILE=$(find "$OUT" -name "coverage.cobertura.xml" -type f | head -1) +if [ -z "$REPORT_FILE" ]; then + echo "No coverage report found under $OUT" + exit 1 +fi + +reportgenerator \ + "-reports:$REPORT_FILE" \ + "-targetdir:$OUT/report" \ + "-reporttypes:Html;TextSummary" + +echo +echo "==================== Coverage Summary ====================" +cat "$OUT/report/Summary.txt" +echo "==========================================================" +echo "Full HTML report: $OUT/report/index.html" diff --git a/MAP-IMPLEMENTATION-PLAN.md b/MAP-IMPLEMENTATION-PLAN.md new file mode 100644 index 000000000..ab6d6e915 --- /dev/null +++ b/MAP-IMPLEMENTATION-PLAN.md @@ -0,0 +1,234 @@ +# MapAsync Implementation Plan (.NET Durable Execution SDK) — Wave 2 + +Tracking: follow-up to `ParallelAsync` (DOTNET-8662). This document is the +agreed plan before any code is written. It captures the cross-SDK research, +the locked-in decisions, and the concrete file-by-file changes. + +--- + +## 1. Background & research summary + +`MapAsync` processes a collection in parallel with configurable concurrency. +It is the sibling of the already-shipped `ParallelAsync`. The design doc +(`Docs/durable-execution-design.md`) specifies the public surface: + +```csharp +Task<IBatchResult<TResult>> MapAsync<TItem, TResult>( + IReadOnlyList<TItem> items, + Func<IDurableContext, TItem, int, IReadOnlyList<TItem>, Task<TResult>> func, + string? name = null, + MapConfig?
config = null, + CancellationToken cancellationToken = default); +``` + +### Cross-SDK findings (Python / JavaScript / Java) + +| Aspect | Python | JavaScript | Java | Conclusion for .NET | +|--------|--------|------------|------|---------------------| +| Map vs Parallel | Siblings over shared `ConcurrentExecutor` | Siblings over shared `executeItemsConcurrently` engine | Siblings over shared `ConcurrencyOperation` | **Extract a shared base; Map & Parallel are thin subclasses.** | +| Per-item callback | `(ctx, item, index, items)` | `(ctx, item, index, array)` | `(item, index, ctx)` | Our design doc uses **`(ctx, item, index, allItems)`** — matches Python/JS (context-first). ✅ | +| Item → branch | 1 item = 1 child context | 1 item = 1 child context | 1 item = 1 child context | Same — reuse `ChildContextOperation` per item. | +| `ItemBatcher` | Config dataclass, **never wired into execution** | **Does not exist** | **Does not exist** | **Remove entirely** (decision below). | +| Default `CompletionConfig` | `all_successful()` (Parallel), permissive (Map) | fail-fast (both) | `allCompleted()` (both) | **Map default = `AllCompleted()`** (Python/Java majority); Parallel stays `AllSuccessful()`. | +| `toleratedFailurePercentage` units | 0–100 | 0–1 | 0–1 | Ours is **0.0–1.0** (already validated in `CompletionConfig`). ✅ | +| Subtypes | `MAP` / `MAP_ITERATION` | `MAP` / `MAP_ITERATION` | `MAP` / `MAP_ITERATION` | Add `Map` / `MapItem` constants. | +| Naming | `map-item-{i}` or `item_namer(item,i)` | `map-item-{i}` or `itemNamer(item,i)` | `{name}-iteration-{i}` | `ItemNamer(item, index)`; default = index string (consistent with Parallel's branch naming). | +| Empty collection | empty result, `ALL_COMPLETED` | empty result, `ALL_COMPLETED` | empty result (not replayable) | Empty → empty `BatchResult`, `AllCompleted`. | + +### Locked-in decisions (from user) + +1. **Extract a shared `ConcurrentOperation` base class.** Parallel and Map + become thin subclasses. 
(All three reference SDKs do this.) +2. **Remove `ItemBatcher` entirely** — no reference SDK implements it. Strip it + from `MapConfig` AND from the design doc. +3. **`MapConfig.CompletionConfig` defaults to `AllCompleted()`** (permissive), + matching Python + Java Map. Parallel's `AllSuccessful()` default is correct + and stays as-is (matches Python + JS Parallel). + +### Decisions NOT revisited + +- **Parallel default `AllSuccessful()`** — confirmed correct (Python + JS + majority). Not changing. +- **Empty `CompletionConfig` = permissive in .NET** (vs JS's empty = fail-fast). + Deliberate per DESIGN-QUESTIONS.md Q3 / REVIEW.md. Our model uses explicit + named factories (`AllSuccessful()` = `{ToleratedFailureCount=0}`, + `AllCompleted()` = empty). Map's permissive default is the explicit + `AllCompleted()` factory, so it never depends on the empty-config edge case. +- **One `MapAsync` overload** (not the 4 in the stale DESIGN-QUESTIONS.md). The + shipped serializer model pulls `ILambdaSerializer` from + `ILambdaContext.Serializer` via `LambdaSerializerHelper.GetRequired`, so the + `ICheckpointSerializer` AOT overloads do not apply. The design doc's single + signature is authoritative. + +--- + +## 2. 
Reuse map (what Map borrows from Parallel) + +| Component | Action | +|-----------|--------| +| `DurableOperation` base | Reuse unchanged | +| `ExecutionState` (thread-safe, `_lock`-guarded) | Reuse unchanged — REVIEW.md race already fixed | +| `OperationIdGenerator` / `HashOperationId` | Reuse unchanged — child IDs derived as `Hash($"{OperationId}-{index+1}")` in the base | +| `ChildContextOperation` | Reuse unchanged — each item runs as one child context | +| `BatchResult` / `BatchItem` | Reuse unchanged | +| `IBatchResult` / `IBatchItem` / `BatchItemStatus` | Reuse unchanged | +| `CompletionConfig` / `CompletionReason` / `NestingType` | Reuse unchanged | +| `ParallelSummary` / `ParallelJsonContext` | Generalize into a shared `BatchSummary` (see Step 3) | + +--- + +## 3. Implementation steps (ordered) + +### Step 1 — Extract `ConcurrentOperation` base class +**New file:** `Internal/ConcurrentOperation.cs` + +Move the reusable core out of `Internal/ParallelOperation.cs` (currently +lines 70–637) into an abstract base `ConcurrentOperation<TResult> : DurableOperation<IBatchResult<TResult>>`: + +- `StartAsync` — sync-flush parent CONTEXT START (using `ParentSubType`), then `ExecuteItemsAsync`. +- `ReplayAsync` — the 4-way status dispatch (Succeeded → reconstruct; Failed → reconstruct + throw via `BuildException`; Started/Pending → re-execute; else `NonDeterministicExecutionException`). +- `ExecuteItemsAsync` — the full dispatch loop: `SemaphoreSlim` concurrency, the + orphan-task-safe `try/finally` that awaits all in-flight tasks before disposing + the semaphore, short-circuit checks, completion-reason computation, parent + checkpoint, throw-on-tolerance-exceeded. +- `RunUnitAsync(index, ...)` — wraps one unit in a `ChildContextOperation` + (child ID = `Hash($"{OperationId}-{index+1}")`, subtype = `ChildSubType`), + with the existing per-branch exception capture (ChildContextException → Failed + slot; structural DurableExecutionException → rethrow; OCE handling).
+- `ShouldStopDispatching`, `ComputeCompletionReason`, `BranchOutcome` struct, + wire (de)serialization helpers, `DeserializeResult`, `CheckpointParentResultAsync`, + `ReconstructFromCheckpoints` — all move down. + +**Abstract/virtual hooks subclasses implement:** +```csharp +protected abstract int UnitCount; +protected abstract string ParentSubType; // OperationSubTypes.Parallel / .Map +protected abstract string ChildSubType; // .ParallelBranch / .MapItem +protected abstract (string? name, Func<IDurableContext, Task<TResult>> func) GetUnit(int index); +protected abstract DurableExecutionException BuildException(IBatchResult<TResult> result); +``` + +`ParallelOperation` then shrinks to: store `branches`, return +`OperationSubTypes.Parallel`/`ParallelBranch`, `GetUnit(i)` → `(branches[i].Name, branches[i].Func)`, +`BuildException` → `ParallelException`. **Existing 193 tests are the regression net.** + +### Step 2 — Operation subtype constants +**Edit:** `Operation.cs` → add to `OperationSubTypes`: +```csharp +public const string Map = "Map"; +public const string MapItem = "MapItem"; +``` + +### Step 3 — Generalize the checkpoint summary +**Edit:** `Internal/ParallelSummary.cs` → rename to shared `BatchSummary` / +`BatchUnitSummary` (or keep names, just broaden the doc comment). The shape +(`CompletionReason` + `[{Index, Name, Status}]`) is identical for both. +**Edit:** `Internal/ParallelJsonContext.cs` → rename to `BatchJsonContext` (one +shared source-gen context). Both subclasses use it via the base. Keeps a single +wire format and avoids drift. + +> Note: REVIEW.md issue #3 — `ParallelBranchSummary.OperationId` is dead. While +> generalizing, drop that field (smaller checkpoints) since reconstruction +> recomputes the ID by index. Confirm it isn't present before removing. + +### Step 4 — `MapConfig` + `MapException` +**New file:** `MapConfig.cs` — mirrors `ParallelConfig`: +- `int? MaxConcurrency` with `<= 0` rejection (same setter as ParallelConfig).
+- `CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted();` ← **the key difference**. +- `NestingType NestingType { get; set; } = NestingType.Nested;` (Flat throws `NotSupportedException` at run). +- `Func? ItemNamer { get; set; }` — receives `(item, index)`, returns the item's branch name. **No `ItemBatcher`.** +- XML doc frames the permissive default as Python/Java parity: "Map collects all results by default; pass `CompletionConfig.AllSuccessful()` for fail-fast." + +**Edit:** `DurableExecutionException.cs` — add `MapException : DurableExecutionException` +mirroring `ParallelException` (carries `IBatchResult? Result`, `CompletionReason`, +the three ctors). Lets `catch` distinguish Map from Parallel failures. + +### Step 5 — `MapOperation` +**New file:** `Internal/MapOperation.cs` — `: ConcurrentOperation`. +Holds `items`, `func` (`Func<IDurableContext, TItem, int, IReadOnlyList<TItem>, Task<TResult>>`), +and `ItemNamer`. Implements the hooks: +- `UnitCount => items.Count` +- `ParentSubType => OperationSubTypes.Map`, `ChildSubType => OperationSubTypes.MapItem` +- `GetUnit(i)` → name = `ItemNamer?.Invoke(items[i]!, i) ?? i.ToString(InvariantCulture)`; + func = `ctx => func(ctx, items[i], i, items)` +- `BuildException` → `MapException` + +~60 lines. + +### Step 6 — Wire into the context +**Edit:** `IDurableContext.cs` — add the single `MapAsync` overload +(exact design-doc signature) with XML docs mirroring the `ParallelAsync` style. + +**Edit:** `DurableContext.cs` — add `MapAsync` + private `RunMap` +(mirrors `RunParallel`, lines 206–240): null/empty-arg validation, `Flat` guard, +serializer fetch, construct `MapOperation`, `ExecuteAsync`. Empty `items` → empty +`BatchResult` with `AllCompleted` (handled naturally by the base when `UnitCount == 0`). + +### Step 7 — Tests +**New file:** `test/.../MapOperationTests.cs` — mirror `ParallelOperationTests.cs` +(same `CreateContext` harness with `TestLambdaContext` + `DefaultLambdaJsonSerializer` ++ `RecordingBatcher`).
Cover: +- Happy path (all items succeed, results in index order). +- Per-item failure capture under default `AllCompleted()` → **no throw**, failure in `IBatchResult.Failed`. +- `AllSuccessful()` override → one failure throws `MapException`. +- `ItemNamer` produces expected `IBatchItem.Name`; default naming = index. +- Empty collection → empty result, `AllCompleted`, no parent throw. +- `MaxConcurrency` (incl. the `>= count` no-semaphore optimization). +- `FirstSuccessful()` / `MinSuccessful` short-circuit → unfinished items = `Started`. +- Replay determinism: two fresh runs → identical item operation IDs. +- Replay from parent=SUCCEEDED → reconstruct results from child checkpoints. +- Mixed-status replay (some SUCCEEDED, some STARTED in summary). + +**New (DONE):** integration `test/.../IntegrationTests/TestFunctions/Map*` + +matching `Map*Test.cs`, mirroring the `Parallel*` set: HappyPath, PartialFailure +(permissive-default, the headline Map-vs-Parallel difference), FailureTolerance +(asserts `MapException`), FirstSuccessful, MaxConcurrency, ReplayDeterminism. All +6 function projects and the IntegrationTests assembly compile; the tests deploy +real Lambdas and require live AWS credentials to run. + +Re-run the **full suite on net8.0 + net10.0** to confirm the Step 1 base +extraction did not regress Parallel. + +### Step 8 — Documentation cleanup +**Edit:** `Docs/durable-execution-design.md`: +- Remove all `ItemBatcher` / `Batcher` references: the `MapConfig` block + (~lines 1369–1399), the cross-SDK "Item batching" row (~line 2132), and any + pipeline example using a batcher. +- Correct the `MapConfig.CompletionConfig` default in the doc to `AllCompleted()`. +- Note the (intentional) Parallel `AllSuccessful` vs Map `AllCompleted` default split. + +**Edit (optional):** annotate `DESIGN-QUESTIONS.md` stale bits (the +`ICheckpointSerializer` 4-overload section and any `ItemBatcher` mention) so the +record stays accurate. + +--- + +## 4. 
Intentional divergences (documented, not bugs) + +1. **Map default `AllCompleted()` vs Parallel default `AllSuccessful()`** — each + follows its own reference-SDK majority (Map: Python+Java; Parallel: Python+JS). +2. **One `MapAsync` overload** — superseded the stale 4-overload AOT design. +3. **`MapException`** is its own type (not reused `ParallelException`) so callers + can pattern-match the operation that failed. +4. **No `ItemBatcher`** — does not exist in JS/Java; inert in Python. + +--- + +## 5. File change checklist + +**New:** +- `Internal/ConcurrentOperation.cs` +- `Internal/MapOperation.cs` +- `MapConfig.cs` +- `test/.../MapOperationTests.cs` +- `test/.../IntegrationTests/TestFunctions/Map*` (×~6) + +**Edited:** +- `Internal/ParallelOperation.cs` (slimmed to subclass) +- `Internal/ParallelSummary.cs` → shared `BatchSummary` +- `Internal/ParallelJsonContext.cs` → shared `BatchJsonContext` +- `Operation.cs` (+2 subtype constants) +- `DurableExecutionException.cs` (+`MapException`) +- `IDurableContext.cs` (+`MapAsync` overload + docs) +- `DurableContext.cs` (+`MapAsync` + `RunMap`) +- `Docs/durable-execution-design.md` (remove ItemBatcher, fix default) diff --git a/README.md b/README.md index 405e952a5..afd2c11e3 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ For a history of releases view the [release change log](CHANGELOG.md) - [Amazon.Lambda.Annotations](#amazonlambdaannotations) - [Amazon.Lambda.AspNetCoreServer](#amazonlambdaaspnetcoreserver) - [Amazon.Lambda.TestUtilities](#amazonlambdatestutilities) + - [Amazon.Lambda.DurableExecution](#amazonlambdadurableexecution) - [Blueprints](#blueprints) - [Dotnet CLI Templates](#dotnet-cli-templates) - [Yeoman (Deprecated)](#yeoman-deprecated) @@ -113,6 +114,11 @@ For more information see the [README.md](Libraries/src/Amazon.Lambda.AspNetCoreS Package includes test implementation of the interfaces from Amazon.Lambda.Core and helper methods to help in locally testing. 
For more information see the [README.md](Libraries/src/Amazon.Lambda.TestUtilities/README.md) file for Amazon.Lambda.TestUtilities. +### Amazon.Lambda.DurableExecution + +The Durable Execution SDK lets you write multi-step Lambda workflows that automatically checkpoint progress and resume after failures. +For more information see the [README.md](Libraries/src/Amazon.Lambda.DurableExecution/README.md) file for Amazon.Lambda.DurableExecution. + ## Blueprints Blueprints in this repository are .NET Core Lambda functions that can used to get started. In Visual Studio the Blueprints are available when creating a new project and selecting the AWS Lambda Project. diff --git a/buildtools/build.proj b/buildtools/build.proj index 037c11f0a..0b80ec612 100644 --- a/buildtools/build.proj +++ b/buildtools/build.proj @@ -215,6 +215,7 @@ +