Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion cdk/src/constructs/ecs-agent-cluster.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import * as ecr_assets from 'aws-cdk-lib/aws-ecr-assets';
import * as ecs from 'aws-cdk-lib/aws-ecs';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as logs from 'aws-cdk-lib/aws-logs';
import * as s3 from 'aws-cdk-lib/aws-s3';
import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager';
import { NagSuppressions } from 'cdk-nag';
import { Construct } from 'constructs';
Expand All @@ -38,6 +39,18 @@ export interface EcsAgentClusterProps {
readonly githubTokenSecret: secretsmanager.ISecret;
readonly memoryId?: string;

/**
* S3 bucket holding per-task ECS payloads (#502). The orchestrator writes the
* payload (incl. the large hydrated_context, which can't fit in the 8 KB
* RunTask containerOverrides limit) here and passes only an
* `AGENT_PAYLOAD_S3_URI` pointer; the container fetches it on boot. The task
* role gets **read-only** on this bucket — the container runs untrusted repo
* code, so it must not be able to delete payloads (the trusted orchestrator
* owns write + delete). When omitted (isolated construct tests / deployments
* that still pass the payload inline), no grant or env var is added.
*/
readonly payloadBucket?: s3.IBucket;

/**
* Per-task SessionRole (#209). When provided, tenant-data DynamoDB access
* (task/events tables) is NOT granted to the Fargate task role; instead the
Expand Down Expand Up @@ -132,6 +145,10 @@ export class EcsAgentCluster extends Construct {
LOG_GROUP_NAME: logGroup.logGroupName,
GITHUB_TOKEN_SECRET_ARN: props.githubTokenSecret.secretArn,
...(props.memoryId && { MEMORY_ID: props.memoryId }),
// #502: the payload bucket name so the orchestrator-issued
// AGENT_PAYLOAD_S3_URI can be fetched. (The orchestrator sets the URI
// per-task via container override; this is informational parity.)
...(props.payloadBucket && { ECS_PAYLOAD_BUCKET: props.payloadBucket.bucketName }),
// Per-session IAM scoping (#209): when a SessionRole is wired, the
// agent assumes it for tenant-data access (see aws_session.py).
...(props.agentSessionRole && {
Expand Down Expand Up @@ -160,6 +177,15 @@ export class EcsAgentCluster extends Construct {
// agent assumes the SessionRole — stays on the task role).
props.githubTokenSecret.grantRead(taskRole);

// #502: read-only on the ECS payload bucket so the container can fetch its
// payload (AGENT_PAYLOAD_S3_URI) at boot. READ only — the container runs
// untrusted repo code, so it must not be able to write or delete payloads
// (the trusted orchestrator owns write + delete). Stays on the task role
// (read once at startup, before the agent assumes any SessionRole).
if (props.payloadBucket) {
props.payloadBucket.grantRead(taskRole);
}

// Bedrock model invocation — scoped to explicit foundation-model and
// cross-region inference-profile ARNs (parity with the AgentCore runtime
// grants in agent.ts), replacing the prior Resource: '*' wildcard.
Expand Down Expand Up @@ -201,7 +227,7 @@ export class EcsAgentCluster extends Construct {
NagSuppressions.addResourceSuppressions(this.taskDefinition, [
{
id: 'AwsSolutions-IAM5',
reason: 'DynamoDB index/* wildcards from CDK grantReadWriteData (UserConcurrencyTable, and task tables only when no SessionRole is wired); Secrets Manager wildcards from CDK grantRead; CloudWatch Logs wildcards from CDK grantWrite. Bedrock InvokeModel is scoped to explicit model/inference-profile ARNs (no wildcard resource).',
reason: 'DynamoDB index/* wildcards from CDK grantReadWriteData (UserConcurrencyTable, and task tables only when no SessionRole is wired); Secrets Manager wildcards from CDK grantRead; CloudWatch Logs wildcards from CDK grantWrite; S3 object/* wildcard from CDK grantRead on the ECS payload bucket (read-only, scoped to that bucket — #502). Bedrock InvokeModel is scoped to explicit model/inference-profile ARNs (no wildcard resource).',
},
{
id: 'AwsSolutions-ECS2',
Expand Down
117 changes: 117 additions & 0 deletions cdk/src/constructs/ecs-payload-bucket.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/**
* MIT No Attribution
*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

import { Duration, RemovalPolicy } from 'aws-cdk-lib';
import * as s3 from 'aws-cdk-lib/aws-s3';
import { Construct } from 'constructs';

/**
* Lifecycle expiry for ECS task payloads. The payload is consumed once, at
* container boot, and the orchestrator deletes it promptly in the ``finalize``
* step. This 1-day rule is only a crash backstop: if the orchestrator dies
* before finalize (rare — it runs under durable execution), the object is still
* reaped within a day instead of lingering. Payloads carry the hydrated prompt
* context, so a tight TTL also keeps the blast radius of an accidental
* permission leak small.
*/
export const ECS_PAYLOAD_TTL_DAYS = 1;

/**
* Object-key prefix for ECS task payloads. Key layout:
* ``<task_id>/payload.json``. Each task writes a single object under its own
* task-id prefix; the orchestrator deletes it on terminal.
*/
export const ECS_PAYLOAD_OBJECT_KEY_PREFIX = '';

/**
* Properties for the EcsPayloadBucket construct.
*/
export interface EcsPayloadBucketProps {
/**
* Removal policy for the bucket.
* @default RemovalPolicy.DESTROY
*/
readonly removalPolicy?: RemovalPolicy;

/**
* Whether to auto-delete objects when the bucket is removed (so ``cdk
* destroy`` does not need a manual bucket-empty first). Mirrors
* ``TraceArtifactsBucket`` / ``AttachmentsBucket``. Deploys CDK's
* ``Custom::S3AutoDeleteObjects`` Lambda with delete permissions on this
* bucket — acceptable here because the contents are ephemeral throwaway
* payloads.
* @default true
*/
readonly autoDeleteObjects?: boolean;
}

/**
* S3 bucket for ECS task payloads (#502).
*
* The ECS compute strategy cannot pass the orchestrator payload (repo URL,
* prompt, and the large ``hydrated_context``) inline: a Fargate ``RunTask``
* caps the entire ``containerOverrides`` blob at 8192 bytes, and the hydrated
* context routinely exceeds that, so the call is rejected with
* ``InvalidParameterException``. (AgentCore is unaffected — it passes the
* payload in the ``InvokeAgentRuntime`` request body, which has no comparable
* limit.) Instead, the orchestrator writes the payload to
* ``s3://<bucket>/<task_id>/payload.json`` and passes only a small
* ``AGENT_PAYLOAD_S3_URI`` pointer in the override; the container fetches and
* parses it on boot.
*
* Dedicated (not co-tenant with attachments/traces) so the boundary is
* structural: the ECS task role gets S3 **read** here and nowhere else, the
* attachments feature can never collide with payload keys, and the tight
* 1-day TTL is whole-bucket rather than a prefix-scoped rule grafted onto a
* shared bucket.
*
* Security / hygiene (parity with TraceArtifactsBucket):
* - ``blockPublicAccess: BLOCK_ALL`` + ``enforceSSL: true`` — no public read,
* TLS-only transport.
* - ``encryption: S3_MANAGED`` — server-side encryption at rest.
* - 1-day lifecycle expiry — payloads are ephemeral (read once at boot,
* deleted by the orchestrator at finalize); this is the crash backstop.
*/
export class EcsPayloadBucket extends Construct {
/** The underlying S3 bucket. */
public readonly bucket: s3.Bucket;

constructor(scope: Construct, id: string, props: EcsPayloadBucketProps = {}) {
super(scope, id);

this.bucket = new s3.Bucket(this, 'Bucket', {
blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL,
encryption: s3.BucketEncryption.S3_MANAGED,
enforceSSL: true,
lifecycleRules: [
{
id: 'ecs-payload-ttl',
enabled: true,
expiration: Duration.days(ECS_PAYLOAD_TTL_DAYS),
// Reap incomplete multipart uploads after 1 day. Object expiration
// does not apply to in-flight MPUs (they are not objects yet), so a
// separate reaper keeps stale upload parts from lingering.
abortIncompleteMultipartUploadAfter: Duration.days(1),
},
],
removalPolicy: props.removalPolicy ?? RemovalPolicy.DESTROY,
autoDeleteObjects: props.autoDeleteObjects ?? true,
});
}
}
23 changes: 23 additions & 0 deletions cdk/src/constructs/task-orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,16 @@ export interface TaskOrchestratorProps {
readonly executionRoleArn: string;
};

/**
* S3 bucket for per-task ECS payloads (#502). When provided (alongside
* ``ecsConfig``), the orchestrator writes the payload here and passes only an
* ``AGENT_PAYLOAD_S3_URI`` pointer in the RunTask override (the full payload
* exceeds the 8 KB containerOverrides limit), then deletes the object in the
* finalize step. The orchestrator gets write + delete; the ECS task role gets
* read-only (granted on the bucket by ``EcsAgentCluster``).
*/
readonly ecsPayloadBucket?: s3.IBucket;

/**
* S3 bucket for task attachments. When provided, the orchestrator gets
* ReadWrite grants for URL fetch/screen/upload during hydration.
Expand Down Expand Up @@ -220,6 +230,7 @@ export class TaskOrchestrator extends Construct {
'@aws-sdk/client-ecs',
'@aws-sdk/client-lambda',
'@aws-sdk/client-bedrock-runtime',
'@aws-sdk/client-s3',
'@aws-sdk/client-secrets-manager',
'@aws-sdk/lib-dynamodb',
'@aws-sdk/util-dynamodb',
Expand Down Expand Up @@ -262,6 +273,9 @@ export class TaskOrchestrator extends Construct {
ECS_SECURITY_GROUP: props.ecsConfig.securityGroup,
ECS_CONTAINER_NAME: props.ecsConfig.containerName,
}),
// #502: bucket the orchestrator writes the ECS payload to (and deletes
// from at finalize); the ECS strategy reads this to build the S3 URI.
...(props.ecsPayloadBucket && { ECS_PAYLOAD_BUCKET: props.ecsPayloadBucket.bucketName }),
...(props.attachmentsBucket && { ATTACHMENTS_BUCKET_NAME: props.attachmentsBucket.bucketName }),
},
bundling: orchestratorBundling,
Expand All @@ -280,6 +294,15 @@ export class TaskOrchestrator extends Construct {
props.attachmentsBucket.grantReadWrite(this.fn);
}

// #502: ECS payload bucket — the orchestrator writes the payload before
// RunTask and deletes it at finalize. Write + delete only (it never reads
// its own payload back; the ECS container is the reader, with its own
// read-only grant from EcsAgentCluster).
if (props.ecsPayloadBucket) {
props.ecsPayloadBucket.grantPut(this.fn);
props.ecsPayloadBucket.grantDelete(this.fn);
}

// Durable execution managed policy
this.fn.role!.addManagedPolicy(
iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSLambdaBasicDurableExecutionRolePolicy'),
Expand Down
9 changes: 9 additions & 0 deletions cdk/src/handlers/orchestrate-task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import {
type PollState,
} from './shared/orchestrator';
import { runPreflightChecks } from './shared/preflight';
import { deleteEcsPayload } from './shared/strategies/ecs-strategy';
import type { TaskRecord } from './shared/types';
import { workflowIsReadOnly, workflowRequiresRepo } from './shared/workflows';

Expand Down Expand Up @@ -297,6 +298,14 @@ const durableHandler: DurableExecutionHandler<OrchestrateTaskEvent, void> = asyn
// Step 6: Finalize — update terminal status, emit events, release concurrency
await context.step('finalize', async () => {
await finalizeTask(taskId, finalPollState, task.user_id);
// #502: the task is terminal — the container has long since read its
// payload, so delete the ephemeral S3 payload object now. Best-effort
// (deleteEcsPayload swallows errors) and a no-op for AgentCore tasks /
// deployments without a payload bucket; the bucket's 1-day lifecycle rule
// is the backstop if this delete or the whole step never runs.
if (blueprintConfig.compute_type === 'ecs') {
await deleteEcsPayload(taskId);
}
});
};

Expand Down
Loading
Loading