diff --git a/apps/backend/README.md b/apps/backend/README.md index fdb815078..b430cac55 100644 --- a/apps/backend/README.md +++ b/apps/backend/README.md @@ -2,6 +2,8 @@ > ⚠️ **Reference implementation, local development only.** > No real authentication. The bundled `AllowAllAuthPort` permits every caller and every action (see `src/auth/`), and the constructor refuses to start without the explicit `WB_AUTH_PORT=allow-all` opt-in so a forgotten env var fails loudly. No tenant isolation. The HTTP server and the docker-compose services bind to `127.0.0.1` by default. Do not expose to the internet or shared networks without first plugging in a real `AuthPort`, see [`auth-port.decision-log.md`](./auth-port.decision-log.md) for the seam, default, and a JWT adapter sketch. +> +> Seams for consumers to plug in: [`AuthPort`](./auth-port.decision-log.md) for authn/authz, [`TenantContextPort`](./tenant-context-port.decision-log.md) for multi-tenant identity propagation (wiring guide: [`multi-tenancy.md`](./multi-tenancy.md)). > **Note:** setup is in [root README "Path B. Run the full stack demo"](../../README.md#path-b-run-the-full-stack-demo). This file documents the backend's internals, not how to start it. diff --git a/apps/backend/drizzle/0001_odd_iron_lad.sql b/apps/backend/drizzle/0001_odd_iron_lad.sql new file mode 100644 index 000000000..efb4a351a --- /dev/null +++ b/apps/backend/drizzle/0001_odd_iron_lad.sql @@ -0,0 +1,4 @@ +ALTER TABLE "execution_events" ADD COLUMN "tenant_id" text;--> statement-breakpoint +ALTER TABLE "executions" ADD COLUMN "tenant_id" text;--> statement-breakpoint +CREATE INDEX "execution_events_tenant_id_idx" ON "execution_events" USING btree ("tenant_id");--> statement-breakpoint +CREATE INDEX "executions_tenant_id_idx" ON "executions" USING btree ("tenant_id"); \ No newline at end of file diff --git a/apps/backend/drizzle/meta/0001_snapshot.json b/apps/backend/drizzle/meta/0001_snapshot.json new file mode 100644 index 000000000..665f42c31 --- /dev/null +++ b/apps/backend/drizzle/meta/0001_snapshot.json @@ -0,0 +1,359 @@ +{ + "id": "a831f5bf-efaf-47d7-8a84-9845bdf309de", + "prevId": "6d12f5a1-dc40-4f2c-b339-9613ed78fce7", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.execution_events": { + "name": "execution_events", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "execution_id": { + "name": "execution_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "sequence": { + "name": "sequence", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "timestamp": { + "name": "timestamp", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "path_id": { + "name": "path_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "payload_json": { + "name": "payload_json", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "tenant_id": { + "name": "tenant_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "execution_events_execution_sequence_idx": { + "name": "execution_events_execution_sequence_idx", + "columns": [ + { + "expression": "execution_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "sequence", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "execution_events_execution_id_idx": { + "name": "execution_events_execution_id_idx", + "columns": [ + { + "expression": "execution_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "execution_events_tenant_id_idx": { + "name": "execution_events_tenant_id_idx", + "columns": [ + { + "expression": "tenant_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "execution_events_execution_id_executions_id_fk": { + "name": "execution_events_execution_id_executions_id_fk", + "tableFrom": "execution_events", + "tableTo": "executions", + "columnsFrom": ["execution_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.executions": { + "name": "executions", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "workflow_id": { + "name": "workflow_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "source_version": { + "name": "source_version", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "workflow_snapshot_json": { + "name": "workflow_snapshot_json", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'pending'" + }, + "trigger_payload_json": { + "name": "trigger_payload_json", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "tenant_id": { + "name": "tenant_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "started_at": { + "name": "started_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "finished_at": { + "name": "finished_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "error_message": { + "name": "error_message", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "executions_workflow_id_idx": { + "name": "executions_workflow_id_idx", + "columns": [ + { + "expression": "workflow_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "executions_status_idx": { + "name": "executions_status_idx", + "columns": [ + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "executions_tenant_id_idx": { + "name": "executions_tenant_id_idx", + "columns": [ + { + "expression": "tenant_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "executions_workflow_id_workflows_id_fk": { + "name": "executions_workflow_id_workflows_id_fk", + "tableFrom": "executions", + "tableTo": "workflows", + "columnsFrom": ["workflow_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.workflows": { + "name": "workflows", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "draft_json": { + "name": "draft_json", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "published_json": { + "name": "published_json", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "published_at": { + "name": "published_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/apps/backend/drizzle/meta/_journal.json b/apps/backend/drizzle/meta/_journal.json index af89d3820..503fd982b 100644 --- a/apps/backend/drizzle/meta/_journal.json +++ b/apps/backend/drizzle/meta/_journal.json @@ -8,6 +8,13 @@ "when": 1776110826965, "tag": "0000_steady_quicksilver", "breakpoints": true + }, + { + "idx": 1, + "version": "7", + "when": 1780490055890, + "tag": "0001_odd_iron_lad", + "breakpoints": true } ] } diff --git a/apps/backend/multi-tenancy.md b/apps/backend/multi-tenancy.md new file mode 100644 index 000000000..df69d486e --- /dev/null +++ b/apps/backend/multi-tenancy.md @@ -0,0 +1,94 @@ +# Multi-tenancy: wiring the TenantContextPort seams + +How to make the reference backend multi-tenant. For **why** it is shaped this way — the responsibility boundary, threat model, and rejected alternatives — read [`tenant-context-port.decision-log.md`](./tenant-context-port.decision-log.md) first. That document is stable; this one tracks the code, so signatures and file paths here are the live truth. + +> **The headline rule:** implementing `TenantContextPort` propagates tenant _identity_. It does **not** isolate tenants. Isolation is `AuthPort` (app layer, see [`auth-port.decision-log.md`](./auth-port.decision-log.md)) + RLS (DB layer, seam 5). All five seams are the job. + +## Seam status + +| Seam | What it does | State | Code | Tested | +| ---- | -------------------------------------------------------------------- | ----------------------------------- | --------------------------------------- | ------- | +| 1 | Resolve tenant once per request, stash on context | shipped (no-op default) | `src/tenant/middleware.ts` | ✅ | +| 2 | Stamp `tenantId` onto the execution row at submit | shipped (no-op default) | `src/routes/workflows.ts` | ✅ | +| 3 | Worker inherits `tenant_id` for each event from its parent execution | shipped (no-op default) | `apps/execution-worker/src/database.ts` | ⚠️ none | +| 4 | SSE stream cross-check (defence-in-depth) | shipped (no-op default) | `src/routes/executions.ts` | ✅ | +| 5 | Postgres Row-Level Security | **documented pattern, not shipped** | — | — | + +"No-op default" means: under `NoopTenantContextPort` the tenant is `null` on every request, every seam degrades to single-tenant behaviour, and the reference runs with zero tenancy ceremony. Swap the port instance to turn seams 1–4 on; enable seam 5 yourself. + +## Make it multi-tenant + +Implement the port and point one line at it: + +```ts +// src/server.ts — swap the no-op for your adapter +const tenantPort: TenantContextPort = new NoopTenantContextPort(); // ← your adapter here +app.use('/api/*', createAuthMiddleware(authPort)); +app.use('/api/*', createTenantMiddleware(tenantPort)); // after auth: handlers run under AuthVariables & TenantVariables +``` + +Your adapter implements one method; how it resolves the tenant (subdomain, cookie, JWT claim) is your call: + +```ts +interface TenantContextPort { + resolve(request: Request): Promise; +} +``` + +## The seams + +### Seam 1 — HTTP middleware: resolve once per request + +`createTenantMiddleware(port)` calls `port.resolve(c.req.raw)` once at the boundary and stashes the result under `c.var.tenant`. `requireTenant(c)` is a shipped convenience that returns a `400 tenant_required` for tenant-less callers — no reference route calls it (the single-tenant reference would always 400), but it ships tested so multi-tenant consumers do not re-derive the early-return idiom in every scoped route. + +→ `src/tenant/middleware.ts` · `src/tenant/tenant-context-port.ts` + +### Seam 2 — route: propagate tenant into the execution + +`POST /workflows/:id/execute` reads `c.var.tenant?.tenantId ?? null` and stamps it onto the execution row at submission. The worker reads it back from that row (seam 3), so the tenant is **not** threaded through the run. + +The reference deliberately does **not** inject the tenant into the engine `variables` bag — no reference executor consumes it, so that would be dead transport. If _your_ executors need the tenant at runtime, inject `variables: tenantId ? { tenantId } : {}` and read `context.variables.tenantId` inside the executor. `execution-core` never reads the field; it is opaque transport (and an untyped string convention — accept that trade-off). + +→ `src/routes/workflows.ts` + +### Seam 3 — persistence: every row carries `tenant_id` + +Migration `0001_odd_iron_lad` adds a **nullable** `text` `tenant_id` to `executions` and `execution_events`, each indexed. + +- **Nullable** on purpose: a `NOT NULL` column will not apply against existing rows without a backfill story the reference cannot assume. +- **`text`, not `uuid`**: `TenantContext.tenantId` is an opaque string (UUID or slug). A consumer who knows their ids are UUIDs can tighten it. + +The worker inherits each event's `tenant_id` from its parent execution via a correlated subquery **at INSERT time** (`SELECT tenant_id FROM executions WHERE id = …`), rather than threading the value through the workflow and every `emitEvent` call. This keeps the worker and the Temporal workflow tenant-agnostic. Cost: one indexed primary-key lookup per event insert. The match holds _at insert time_ — it is not a maintained invariant, and **there is no test exercising it yet** (see the decision log's Verification status and follow-ups). + +→ `apps/execution-worker/src/database.ts` · `drizzle/0001_odd_iron_lad.sql` + +### Seam 4 — SSE stream cross-check (defence-in-depth) + +Before subscribing, `GET /:id/stream` checks the caller's tenant against the tenant on the execution row. This is the one execution-row endpoint with a tenant guard, because EventSource's auth is weaker (no `Authorization` header) — see the decision log's responsibility boundary for why the other routes deliberately do not have it. + +On mismatch the response is a **404 byte-identical to not-found, not a 403** — a recognisable 403 would confirm the id exists in another tenant and make foreign executions enumerable. The check is a no-op when either side is `null`, preserving single-tenant behaviour (which is also why untenanted rows stay app-layer visible — the threat-model trade-off). + +→ `src/routes/executions.ts` + +### Seam 5 — Postgres Row-Level Security (you enable this) + +App-level scoping (`AuthPort` + seams 2–4) is the primary enforcement. RLS is the safety net for the day someone forgets a `WHERE tenant_id`. It is **not** shipped: under the no-op default a `NULL` session variable compared to a `NULL` column silently returns zero rows for every query, and the right policy (admin bypass? parent-tenant visibility? DB-per-tenant?) is consumer-specific. + +```sql +-- consumer migration +ALTER TABLE executions ENABLE ROW LEVEL SECURITY; +ALTER TABLE execution_events ENABLE ROW LEVEL SECURITY; + +CREATE POLICY executions_tenant_isolation ON executions + USING (tenant_id = current_setting('app.current_tenant', true)); +CREATE POLICY events_tenant_isolation ON execution_events + USING (tenant_id = current_setting('app.current_tenant', true)); +``` + +Each request opens a transaction and sets `app.current_tenant` before querying (a `withTenantScope(tenantId, run)` helper wrapping `database.transaction` + `SELECT set_config(...)`). This is **not yet shipped or tested** — treat it as a sketch until the recipe lands (decision-log follow-up); do not paste it into production untested. + +Gotchas the code cannot teach you: + +- **`current_setting('app.current_tenant', true)` returns `NULL` when unset, and `NULL = NULL` matches nothing** — a request that forgets to set the tenant sees zero rows, not all rows. This is the exact reason RLS cannot be on by default in the single-tenant reference. +- Migrations and superuser ops need a role with `BYPASSRLS`. +- If the SSE dispatcher should fan out per tenant without reloading the row, the `pg_notify` payload must also carry the tenant. diff --git a/apps/backend/src/auth/middleware.ts b/apps/backend/src/auth/middleware.ts index 2264b059f..9b481472c 100644 --- a/apps/backend/src/auth/middleware.ts +++ b/apps/backend/src/auth/middleware.ts @@ -37,8 +37,13 @@ export function createAuthMiddleware(authPort: AuthPort): MiddlewareHandler<{ Va * // proceed - if we got here, the caller is authorized * ``` */ -export type AssertAuthorized = ( - c: Context<{ Variables: AuthVariables }>, +// Generic over the context's `Variables` (constrained to extend AuthVariables) +// so a route can compose auth with other per-request seams, e.g. run under +// `AuthVariables & TenantVariables`. Hono's `Context.set` is invariant in +// `Variables`, so a fixed `Context<{ Variables: AuthVariables }>` parameter +// would reject any widened context. Do not narrow this back to a concrete type. +export type AssertAuthorized = ( + c: Context<{ Variables: V }>, action: AuthAction, resource: AuthResource, ) => Promise; diff --git a/apps/backend/src/db/schema.ts b/apps/backend/src/db/schema.ts index 38dfe95a6..4151f83e0 100644 --- a/apps/backend/src/db/schema.ts +++ b/apps/backend/src/db/schema.ts @@ -21,6 +21,12 @@ export const executions = pgTable( workflowSnapshotJson: jsonb('workflow_snapshot_json').notNull(), status: text('status').notNull().default('pending'), triggerPayloadJson: jsonb('trigger_payload_json'), + // Nullable: single-tenant deployments leave it NULL on every row. + // Multi-tenant consumers populate via TenantContextPort at /execute time. + // `text`, not `uuid`: TenantContext.tenantId is an opaque string (UUID or + // slug), so the column must accept whatever the consumer's port returns. + // See apps/backend/tenant-context-port.decision-log.md for the wiring. + tenantId: text('tenant_id'), startedAt: timestamp('started_at', { withTimezone: true }), finishedAt: timestamp('finished_at', { withTimezone: true }), errorMessage: text('error_message'), @@ -30,6 +36,7 @@ export const executions = pgTable( (table) => [ index('executions_workflow_id_idx').on(table.workflowId), index('executions_status_idx').on(table.status), + index('executions_tenant_id_idx').on(table.tenantId), ], ); @@ -46,10 +53,16 @@ export const executionEvents = pgTable( nodeId: text('node_id'), pathId: text('path_id'), payloadJson: jsonb('payload_json'), + // Denormalised from `executions.tenant_id` so analytics queries can scope + // events by tenant without joining. Worker writes inherit it from the + // parent execution via subquery at INSERT time (see worker database.ts). + // `text` to match executions.tenant_id. NULL in single-tenant mode. + tenantId: text('tenant_id'), createdAt: timestamp('created_at', { withTimezone: true }).notNull().defaultNow(), }, (table) => [ uniqueIndex('execution_events_execution_sequence_idx').on(table.executionId, table.sequence), index('execution_events_execution_id_idx').on(table.executionId), + index('execution_events_tenant_id_idx').on(table.tenantId), ], ); diff --git a/apps/backend/src/events/drain-events.test.ts b/apps/backend/src/events/drain-events.test.ts index 0d928134b..b365c7fd7 100644 --- a/apps/backend/src/events/drain-events.test.ts +++ b/apps/backend/src/events/drain-events.test.ts @@ -14,6 +14,7 @@ function makeEvent(executionId: string, sequence: number, type = 'node_completed nodeId: null, pathId: null, payloadJson: null, + tenantId: null, createdAt: stamp, }; } diff --git a/apps/backend/src/events/serialized-drainer.test.ts b/apps/backend/src/events/serialized-drainer.test.ts index 79ef8bde0..5503f3977 100644 --- a/apps/backend/src/events/serialized-drainer.test.ts +++ b/apps/backend/src/events/serialized-drainer.test.ts @@ -15,6 +15,7 @@ function makeEvent(executionId: string, sequence: number, type = 'node_completed nodeId: null, pathId: null, payloadJson: null, + tenantId: null, createdAt: stamp, }; } diff --git a/apps/backend/src/routes/executions.test.ts b/apps/backend/src/routes/executions.test.ts index 0b560f442..0c702df36 100644 --- a/apps/backend/src/routes/executions.test.ts +++ b/apps/backend/src/routes/executions.test.ts @@ -10,6 +10,7 @@ import { createAuthMiddleware, makeAssertAuthorized, } from '../auth'; +import { type TenantContext, type TenantVariables, createTenantMiddleware } from '../tenant'; import { createExecutionsRoutes } from './executions'; // ---- module mocks ----------------------------------------------------------- @@ -88,6 +89,17 @@ function denyAll(): AuthPort { }; } +// Same wiring as buildApp, plus the tenant middleware so the route's tenant +// cross-check has a `c.var.tenant` to read. `tenant` is what the configured +// TenantContextPort resolves to (null = single-tenant reference default). +function buildAppWithTenant(port: AuthPort, tenant: TenantContext | null) { + const app = new Hono<{ Variables: AuthVariables & TenantVariables }>(); + app.use('*', createAuthMiddleware(port)); + app.use('*', createTenantMiddleware({ resolve: vi.fn(async () => tenant) })); + app.route('/api/executions', createExecutionsRoutes(makeAssertAuthorized(port))); + return app; +} + // Terminal status so the SSE handler returns after the snapshot and the test // does not have to chase a long-lived stream connection. const terminalExecution = { @@ -188,3 +200,64 @@ describe('createExecutionsRoutes - deny short-circuits before DB or engine work' expect(subscribeMock).not.toHaveBeenCalled(); }); }); + +// ---- tenant cross-check on the SSE stream ---------------------------------- +// +// The stream is the one execution-row endpoint that enforces tenant isolation +// directly, as defence-in-depth for EventSource's weaker auth (it cannot send +// an Authorization header). Resource-level scoping of GET/:id and DELETE/:id +// is the AuthPort's job - see tenant-context-port.decision-log.md. The check +// is a no-op when either side is null, which keeps the single-tenant reference +// (NoopTenantContextPort resolves null) behaving exactly as before. + +const tenantedExecution = { ...terminalExecution, tenantId: 'acme' }; + +const allowStream = () => allowAll(vi.fn(async () => true)); + +function programStream(execution: unknown) { + // 1st select: the executions row. 2nd: the events catch-up query. + databaseMock.select.mockReturnValueOnce(chainResolving([execution])); + databaseMock.select.mockReturnValue(chainResolving([])); +} + +describe('createExecutionsRoutes - stream tenant cross-check', () => { + it('404 (not 403) when caller tenant differs - no cross-tenant existence leak', async () => { + const app = buildAppWithTenant(allowStream(), { tenantId: 'other' }); + programStream(tenantedExecution); + + const response = await app.request('/api/executions/e-1/stream'); + + // Byte-identical to the not-found branch: a foreign execution must be + // indistinguishable from one that does not exist, or the id is enumerable. + expect(response.status).toBe(404); + expect(await response.json()).toEqual({ code: 'execution_not_found', message: 'Execution not found' }); + expect(subscribeMock).not.toHaveBeenCalled(); + }); + + it('streams when caller tenant matches the execution tenant', async () => { + const app = buildAppWithTenant(allowStream(), { tenantId: 'acme' }); + programStream(tenantedExecution); + + const response = await app.request('/api/executions/e-1/stream'); + + expect(response.status).toBe(200); + }); + + it('no-op when caller has no tenant - single-tenant default - even if the execution is tenanted', async () => { + const app = buildAppWithTenant(allowStream(), null); + programStream(tenantedExecution); + + const response = await app.request('/api/executions/e-1/stream'); + + expect(response.status).toBe(200); + }); + + it('no-op when the execution has no tenant even if the caller is tenanted', async () => { + const app = buildAppWithTenant(allowStream(), { tenantId: 'acme' }); + programStream(terminalExecution); // terminalExecution carries no tenantId + + const response = await app.request('/api/executions/e-1/stream'); + + expect(response.status).toBe(200); + }); +}); diff --git a/apps/backend/src/routes/executions.ts b/apps/backend/src/routes/executions.ts index 2d82a69de..0934f645b 100644 --- a/apps/backend/src/routes/executions.ts +++ b/apps/backend/src/routes/executions.ts @@ -11,13 +11,16 @@ import { subscribe } from '../events/execution-event-bus'; import { type ExecutionEventRow, fetchEventsAfter } from '../events/fetch-events-after'; import { createSerializedDrainer } from '../events/serialized-drainer'; import { logger as backendLogger } from '../logger'; +import type { TenantVariables } from '../tenant'; const logger = backendLogger.child({ component: 'executions-route' }); const TERMINAL_STATUSES = new Set(['completed', 'failed', 'cancelled']); -export function createExecutionsRoutes(assertAuthorized: AssertAuthorized): Hono<{ Variables: AuthVariables }> { - const routes = new Hono<{ Variables: AuthVariables }>(); +export function createExecutionsRoutes( + assertAuthorized: AssertAuthorized, +): Hono<{ Variables: AuthVariables & TenantVariables }> { + const routes = new Hono<{ Variables: AuthVariables & TenantVariables }>(); routes.get('/:id', async (c) => { const executionId = c.req.param('id'); @@ -57,6 +60,31 @@ export function createExecutionsRoutes(assertAuthorized: AssertAuthorized): Hono return c.json({ code: 'execution_not_found', message: 'Execution not found' }, 404); } + // Tenant cross-check, scoped to the stream on purpose. This is NOT the + // general per-resource tenant guard - resource-level scoping of GET/:id + // and DELETE/:id is the AuthPort's job (it receives { kind: 'execution', + // executionId } and a real adapter checks ownership). The stream gets an + // extra, independent check because EventSource cannot send an + // Authorization header, so its auth falls back to weaker query-param / + // cookie schemes (see the comment above this handler). The tenant resolved + // by TenantContextPort - which can come from a subdomain or cookie - is a + // second line of defence on exactly that weak path. + // + // No-op when either side is null: a tenant-less caller (single-tenant + // reference, NoopTenantContextPort) or an untenanted execution row passes + // through unchanged. Untenanted rows are therefore globally visible at the + // app layer; Postgres RLS (decision-log seam 5) is the systematic backstop + // for deployments that need rows to be invisible across tenants by default. + // + // On mismatch we return 404, byte-identical to the not-found branch above, + // not 403. A distinct "belongs to another tenant" response would confirm + // the id exists in some other tenant, letting a caller enumerate foreign + // executions. Indistinguishable-from-absent is the only safe answer here. + const tenant = c.var.tenant; + if (tenant && execution.tenantId && execution.tenantId !== tenant.tenantId) { + return c.json({ code: 'execution_not_found', message: 'Execution not found' }, 404); + } + return streamSSE(c, async (stream) => { // Catch-up snapshot. Reuses the same incremental query (afterSequence=0) // that powers live drains — one query shape across the route, not two. diff --git a/apps/backend/src/routes/workflows.test.ts b/apps/backend/src/routes/workflows.test.ts index 7a5dd2dbe..5e6bacb54 100644 --- a/apps/backend/src/routes/workflows.test.ts +++ b/apps/backend/src/routes/workflows.test.ts @@ -10,6 +10,7 @@ import { createAuthMiddleware, makeAssertAuthorized, } from '../auth'; +import { type TenantContext, type TenantVariables, createTenantMiddleware } from '../tenant'; import { createWorkflowsRoutes } from './workflows'; // ---- module mocks ----------------------------------------------------------- @@ -94,6 +95,31 @@ function denyAll(): AuthPort { }; } +// buildApp plus the tenant middleware, so the execute route can read +// `c.var.tenant`. `tenant` is what the configured TenantContextPort resolves +// to (null = single-tenant reference default). +function buildAppWithTenant(port: AuthPort, tenant: TenantContext | null) { + const app = new Hono<{ Variables: AuthVariables & TenantVariables }>(); + app.use('*', createAuthMiddleware(port)); + app.use('*', createTenantMiddleware({ resolve: vi.fn(async () => tenant) })); + app.route('/api/workflows', createWorkflowsRoutes(makeAssertAuthorized(port))); + return app; +} + +// Capture what the execute route passes to `database.insert(...).values(...)`. +// The chainable proxy used elsewhere discards call arguments, so the row-stamp +// assertion needs a mock that records the values object. +function captureExecuteInsert(): { values?: Record } { + const captured: { values?: Record } = {}; + databaseMock.insert.mockImplementation(() => ({ + values: (v: Record) => { + captured.values = v; + return { returning: () => chainResolving([{ id: 'e-1', status: 'pending' }]) }; + }, + })); + return captured; +} + const fakeWorkflow = { id: 'w-1', name: 'demo', @@ -203,3 +229,42 @@ describe('createWorkflowsRoutes - deny short-circuits before any DB access', () expect(engineMock.cancel).not.toHaveBeenCalled(); }); }); + +// ---- tenant propagation on execute ----------------------------------------- +// +// The resolved tenant is stamped onto the executions row (the worker's event +// subquery reads it back from there). It is deliberately NOT placed in the +// engine `variables` bag: no reference executor reads it, so `variables` stays +// empty regardless of whether a tenant was resolved. + +function executeRequest(app: ReturnType) { + return app.request('/api/workflows/w-1/execute', { + method: 'POST', + body: JSON.stringify({ sourceVersion: 'draft' }), + headers: { 'content-type': 'application/json' }, + }); +} + +describe('createWorkflowsRoutes - execute propagates tenant identity', () => { + it('stamps the resolved tenantId onto the execution row, not into variables', async () => { + const app = buildAppWithTenant(allowAll(vi.fn(async () => true)), { tenantId: 'acme' }); + databaseMock.select.mockReturnValue(chainResolving([fakeWorkflow])); + const insert = captureExecuteInsert(); + + await executeRequest(app); + + expect(insert.values?.tenantId).toBe('acme'); + expect(engineMock.submit).toHaveBeenCalledWith(expect.objectContaining({ variables: {} })); + }); + + it('writes null tenantId in single-tenant mode', async () => { + const app = buildAppWithTenant(allowAll(vi.fn(async () => true)), null); + databaseMock.select.mockReturnValue(chainResolving([fakeWorkflow])); + const insert = captureExecuteInsert(); + + await executeRequest(app); + + expect(insert.values?.tenantId).toBeNull(); + expect(engineMock.submit).toHaveBeenCalledWith(expect.objectContaining({ variables: {} })); + }); +}); diff --git a/apps/backend/src/routes/workflows.ts b/apps/backend/src/routes/workflows.ts index e08ff7e27..20d632f81 100644 --- a/apps/backend/src/routes/workflows.ts +++ b/apps/backend/src/routes/workflows.ts @@ -9,6 +9,7 @@ import { mapToExecutionModel } from '../domain/mapper/from-integration-data'; import { workflowSnapshotSchema } from '../domain/mapper/snapshot-schema'; import { getWorkflowEngine } from '../engine'; import { logger as backendLogger } from '../logger'; +import type { TenantVariables } from '../tenant'; const logger = backendLogger.child({ component: 'workflows-route' }); @@ -34,8 +35,10 @@ function formatValidationDetails(error: z.ZodError) { })); } -export function createWorkflowsRoutes(assertAuthorized: AssertAuthorized): Hono<{ Variables: AuthVariables }> { - const routes = new Hono<{ Variables: AuthVariables }>(); +export function createWorkflowsRoutes( + assertAuthorized: AssertAuthorized, +): Hono<{ Variables: AuthVariables & TenantVariables }> { + const routes = new Hono<{ Variables: AuthVariables & TenantVariables }>(); routes.post('/', async (c) => { await assertAuthorized(c, 'workflows:create', { kind: 'workflows' }); @@ -209,6 +212,17 @@ export function createWorkflowsRoutes(assertAuthorized: AssertAuthorized): Hono< ); } + // Propagate tenant identity from the HTTP boundary onto the execution row. + // The worker reads it back via subquery for event tagging (see worker + // database.ts), so it never has to know tenancy exists. Null in + // single-tenant mode (the reference default). + // + // We deliberately do NOT stuff tenantId into the engine `variables` bag: + // no reference node executor reads it, so shipping it would be dead + // transport. Consumers whose executors need tenant at runtime add it to + // `variables` themselves - see seam 2 in tenant-context-port.decision-log.md. + const tenantId = c.var.tenant?.tenantId ?? null; + const [execution] = await database .insert(executions) .values({ @@ -217,6 +231,7 @@ export function createWorkflowsRoutes(assertAuthorized: AssertAuthorized): Hono< workflowSnapshotJson: snapshotJson, status: 'pending', triggerPayloadJson: body.triggerPayload ?? null, + tenantId, }) .returning(); diff --git a/apps/backend/src/server.ts b/apps/backend/src/server.ts index 61a36f952..351dc9a81 100644 --- a/apps/backend/src/server.ts +++ b/apps/backend/src/server.ts @@ -16,6 +16,7 @@ import { env } from './env'; import { logger } from './logger'; import { createExecutionsRoutes } from './routes/executions'; import { createWorkflowsRoutes } from './routes/workflows'; +import { NoopTenantContextPort, type TenantContextPort, type TenantVariables, createTenantMiddleware } from './tenant'; // Permissive default for local development. The constructor itself emits a // loud startup warning and refuses to boot unless `WB_AUTH_PORT=allow-all` is @@ -25,7 +26,12 @@ const authPort: AuthPort = new AllowAllAuthPort(); const assertAuthorized = makeAssertAuthorized(authPort); -const app = new Hono<{ Variables: AuthVariables }>(); +// Default single-tenant: every request lands with `c.var.tenant === null`. +// Multi-tenant consumers swap in their own implementation (subdomain, JWT +// claim, header, …) — see `apps/backend/tenant-context-port.decision-log.md`. +const tenantPort: TenantContextPort = new NoopTenantContextPort(); + +const app = new Hono<{ Variables: AuthVariables & TenantVariables }>(); app.use('/*', cors()); // Reject request bodies larger than 1 MB to prevent memory exhaustion @@ -46,11 +52,13 @@ app.onError((error, c) => { return c.json({ code: 'internal_error', message: 'Internal server error' }, 500); }); -// Health check is intentionally registered before the auth middleware so it -// stays accessible to monitoring tools that don't carry credentials. +// Health check is intentionally registered before the auth and tenant +// middleware so it stays accessible to monitoring tools that don't carry +// credentials or tenant headers. app.get('/api/health', (c) => c.json({ status: 'ok' })); app.use('/api/*', createAuthMiddleware(authPort)); +app.use('/api/*', createTenantMiddleware(tenantPort)); app.route('/api/workflows', createWorkflowsRoutes(assertAuthorized)); app.route('/api/executions', createExecutionsRoutes(assertAuthorized)); diff --git a/apps/backend/src/tenant/index.ts b/apps/backend/src/tenant/index.ts new file mode 100644 index 000000000..6b9f3f43e --- /dev/null +++ b/apps/backend/src/tenant/index.ts @@ -0,0 +1,4 @@ +export { createTenantMiddleware, requireTenant } from './middleware'; +export type { TenantVariables } from './middleware'; +export { NoopTenantContextPort } from './noop-tenant-context-port'; +export type { TenantContext, TenantContextPort } from './tenant-context-port'; diff --git a/apps/backend/src/tenant/middleware.test.ts b/apps/backend/src/tenant/middleware.test.ts new file mode 100644 index 000000000..2d7a13741 --- /dev/null +++ b/apps/backend/src/tenant/middleware.test.ts @@ -0,0 +1,120 @@ +import { Hono } from 'hono'; +import { describe, expect, it, vi } from 'vitest'; + +// Import via the barrel — same path a third-party adapter author would use. +import { + type TenantContext, + type TenantContextPort, + type TenantVariables, + createTenantMiddleware, + requireTenant, +} from '.'; + +function fakePort(overrides: Partial = {}): TenantContextPort { + return { + resolve: vi.fn(async () => null), + ...overrides, + }; +} + +function makeApp(port: TenantContextPort): Hono<{ Variables: TenantVariables }> { + const app = new Hono<{ Variables: TenantVariables }>(); + app.use('*', createTenantMiddleware(port)); + + app.get('/probe', (c) => c.json({ tenant: c.var.tenant })); + + app.get('/scoped/:id', (c) => { + const tenant = requireTenant(c); + if (tenant instanceof Response) return tenant; + return c.json({ id: c.req.param('id'), tenantId: tenant.tenantId }); + }); + + return app; +} + +describe('createTenantMiddleware', () => { + it('stashes the resolved tenant on the request context', async () => { + const tenant: TenantContext = { tenantId: 'acme' }; + const port = fakePort({ resolve: vi.fn(async () => tenant) }); + + const response = await makeApp(port).request('/probe'); + + expect(response.status).toBe(200); + expect(await response.json()).toEqual({ tenant }); + expect(port.resolve).toHaveBeenCalledTimes(1); + }); + + it('stashes null when the port returns null — single-tenant default', async () => { + const port = fakePort({ resolve: vi.fn(async () => null) }); + + const response = await makeApp(port).request('/probe'); + + expect(response.status).toBe(200); + expect(await response.json()).toEqual({ tenant: null }); + }); + + it('forwards the raw Request object to the port — adapter sees full url + headers', async () => { + const seen: Request[] = []; + const port = fakePort({ + resolve: vi.fn(async (request: Request) => { + seen.push(request); + return null; + }), + }); + + await makeApp(port).request('/probe', { headers: { 'x-tenant-id': 'beta' } }); + + expect(seen).toHaveLength(1); + expect(seen[0]?.headers.get('x-tenant-id')).toBe('beta'); + }); + + it('resolves exactly once per request even with multiple handlers downstream', async () => { + const port = fakePort({ resolve: vi.fn(async () => ({ tenantId: 'acme' })) }); + + const app = new Hono<{ Variables: TenantVariables }>(); + app.use('*', createTenantMiddleware(port)); + app.get('/multi', async (c) => { + const a = c.var.tenant; + const b = c.var.tenant; + return c.json({ same: a === b, tenantId: a?.tenantId }); + }); + + const response = await app.request('/multi'); + + expect(await response.json()).toEqual({ same: true, tenantId: 'acme' }); + expect(port.resolve).toHaveBeenCalledTimes(1); + }); + + it('propagates an exception from the port — not silently coerced to null', async () => { + // Throw is reserved for unexpected failures (tenant store down, DNS timeout). + // Routes must NOT treat that as anonymous; Hono surfaces it as 500. + const port = fakePort({ + resolve: vi.fn(async () => { + throw new Error('tenant store unreachable'); + }), + }); + + const response = await makeApp(port).request('/probe'); + expect(response.status).toBe(500); + }); +}); + +describe('requireTenant', () => { + it('returns the resolved TenantContext when present', async () => { + const port = fakePort({ resolve: vi.fn(async () => ({ tenantId: 'acme' })) }); + + const response = await makeApp(port).request('/scoped/widget-1'); + + expect(response.status).toBe(200); + expect(await response.json()).toEqual({ id: 'widget-1', tenantId: 'acme' }); + }); + + it('returns a 400 Response when no tenant is on the context', async () => { + const port = fakePort({ resolve: vi.fn(async () => null) }); + + const response = await makeApp(port).request('/scoped/widget-1'); + + expect(response.status).toBe(400); + expect(await response.json()).toEqual({ code: 'tenant_required', message: 'Tenant context required' }); + }); +}); diff --git a/apps/backend/src/tenant/middleware.ts b/apps/backend/src/tenant/middleware.ts new file mode 100644 index 000000000..3a1259880 --- /dev/null +++ b/apps/backend/src/tenant/middleware.ts @@ -0,0 +1,51 @@ +import type { Context, MiddlewareHandler } from 'hono'; + +import type { TenantContext, TenantContextPort } from './tenant-context-port'; + +/** + * Stash for the resolved tenant on a Hono request context. `null` means the + * configured {@link TenantContextPort} did not identify a tenant for this + * request (e.g. health check, single-tenant reference mode). Routes that + * require a tenant use {@link requireTenant}. + */ +export type TenantVariables = { + tenant: TenantContext | null; +}; + +/** + * Identify the tenant once per request and stash it on the Hono context. + * Routes downstream read `c.var.tenant`. With the default + * `NoopTenantContextPort`, every request lands with `tenant === null` — + * the reference backend is single-tenant — and downstream code treats that + * as the standard case. + */ +export function createTenantMiddleware(port: TenantContextPort): MiddlewareHandler<{ Variables: TenantVariables }> { + return async (c, next) => { + c.set('tenant', await port.resolve(c.req.raw)); + await next(); + }; +} + +/** + * Convenience for routes that REQUIRE a tenant. Returns the `TenantContext` + * to use, or a 400 `Response` if no tenant was resolved. Use the early-return + * idiom: + * + * ```ts + * const tenant = requireTenant(c); + * if (tenant instanceof Response) return tenant; + * // tenant is TenantContext from here on + * ``` + * + * In the reference (single-tenant) setup with `NoopTenantContextPort`, this + * helper always short-circuits to 400 — which is fine, because no route in + * the reference backend currently calls it. Multi-tenant consumers call it + * from routes that should reject tenant-less callers. + */ +export function requireTenant(c: Context<{ Variables: TenantVariables }>): TenantContext | Response { + const tenant = c.var.tenant; + if (!tenant) { + return c.json({ code: 'tenant_required', message: 'Tenant context required' }, 400); + } + return tenant; +} diff --git a/apps/backend/src/tenant/noop-tenant-context-port.test.ts b/apps/backend/src/tenant/noop-tenant-context-port.test.ts new file mode 100644 index 000000000..8fbe311f7 --- /dev/null +++ b/apps/backend/src/tenant/noop-tenant-context-port.test.ts @@ -0,0 +1,34 @@ +import { describe, expect, it } from 'vitest'; + +// Imported via the barrel — the same path a third-party adapter author would +// use. Pins that the public surface re-exports both the no-op impl and the +// interface in one place; knip flags if either drifts out of the barrel. +import { NoopTenantContextPort, type TenantContextPort } from '.'; + +describe('NoopTenantContextPort', () => { + // Typed as the interface so the test exercises the contract callers see + // rather than the impl's narrowed signature. + const port: TenantContextPort = new NoopTenantContextPort(); + + it('resolves to null — the reference backend is single-tenant', async () => { + const request = new Request('http://localhost/api/workflows'); + await expect(port.resolve(request)).resolves.toBeNull(); + }); + + it('does not throw for any request shape — tenancy is an opt-in seam', async () => { + // Subdomain hint, header hint, POST without body — none of these should + // make the no-op resolver care. Real adapters interpret one of them. + const requests = [ + new Request('http://localhost/'), + new Request('http://acme.example.com/api/x', { headers: { 'x-tenant-id': 'acme' } }), + new Request('http://localhost/api/health', { method: 'POST' }), + new Request('http://localhost/api/y', { + headers: { authorization: 'Bearer eyJhbGc...' }, + }), + ]; + + for (const request of requests) { + await expect(port.resolve(request)).resolves.toBeNull(); + } + }); +}); diff --git a/apps/backend/src/tenant/noop-tenant-context-port.ts b/apps/backend/src/tenant/noop-tenant-context-port.ts new file mode 100644 index 000000000..1df1bfba0 --- /dev/null +++ b/apps/backend/src/tenant/noop-tenant-context-port.ts @@ -0,0 +1,20 @@ +import type { TenantContext, TenantContextPort } from './tenant-context-port'; + +/** + * Default {@link TenantContextPort} for the single-tenant reference backend: + * resolves to `null` on every request. The reference backend operates as a + * single logical tenant, so attaching tenant context would be ceremony with + * no enforcement. + * + * Multi-tenant consumers swap this for a real adapter that reads from a + * subdomain (`https://acme.example.com/api/…`), a JWT claim, a header, a + * session store, or whatever fits their identity model. See + * `apps/backend/tenant-context-port.decision-log.md` for the integration + * pattern (five seams: HTTP middleware, route propagation, persistence, + * SSE filter, Postgres RLS) the port plugs into. + */ +export class NoopTenantContextPort implements TenantContextPort { + async resolve(): Promise { + return null; + } +} diff --git a/apps/backend/src/tenant/tenant-context-port.ts b/apps/backend/src/tenant/tenant-context-port.ts new file mode 100644 index 000000000..0d38c616a --- /dev/null +++ b/apps/backend/src/tenant/tenant-context-port.ts @@ -0,0 +1,47 @@ +/** + * TenantContextPort — multi-tenant identity seam for the reference backend. + * + * The reference backend ships **single-tenant**. Real multi-tenant + * deployments need to propagate a tenant identifier from the incoming HTTP + * request, through workflow submission, into worker activities, and back + * into event emission and SSE streams. This port surfaces the seam without + * prescribing **how** a tenant is resolved — adapters can pick subdomain, + * `X-Tenant-Id` header, JWT claim, session lookup, anything. + * + * Why a port and not a one-off helper: tenant identity has to flow through + * five different boundaries (HTTP, route handlers, workflow engine, worker + * activities, persistence). Without an explicit seam, multi-tenancy ends up + * as a cross-cutting hack scattered across the codebase. A typo at any + * boundary is a security incident — one tenant reading another's data. + * + * Implementations must not throw for "no tenant" — return `null`. Routes + * decide whether anonymous-tenant traffic is allowed (rejecting `null` on + * scoped routes, accepting it on health checks). + */ + +export type TenantContext = { + /** + * Stable tenant identifier, an opaque string (UUID or slug). Persisted as + * `text` in `executions.tenant_id`, so any string the adapter returns round + * trips. Consumers that need richer per-tenant data (plan tier, region, + * flags) extend this type in their own adapter. + */ + readonly tenantId: string; +}; + +export interface TenantContextPort { + /** + * Resolve the tenant for an incoming request. + * + * - Return a {@link TenantContext} when the request carries enough signal + * to identify a tenant. + * - Return `null` for tenant-less requests (e.g. `/api/health`, public + * landing pages). Scoped routes treat `null` as 4xx; non-scoped routes + * accept it. + * + * Throw only for unexpected failures — the tenant-store is down, the + * subdomain DNS lookup timed out, the JWT verifier choked on JWKS. + * "Could not determine tenant" is `null`, not an exception. + */ + resolve(request: Request): Promise; +} diff --git a/apps/backend/tenant-context-port.decision-log.md b/apps/backend/tenant-context-port.decision-log.md new file mode 100644 index 000000000..ef937c2d8 --- /dev/null +++ b/apps/backend/tenant-context-port.decision-log.md @@ -0,0 +1,95 @@ +### Title: TenantContextPort — multi-tenant identity seam for the reference backend + +### Proposed by: Kacper Cierzniewski + +### Proposed: 21.05.2026 — Landed: 03.06.2026 (`fa5999dd`) + +> This is the **decision** (why this shape, what was rejected, what it does and does not protect). The **how-to-wire-it** lives in [`multi-tenancy.md`](./multi-tenancy.md) — that document tracks the code and is the source of truth for current signatures and per-seam status. If a snippet here ever disagrees with the code, the code wins. + +## Context + +The reference backend (`apps/backend` + `apps/execution-worker`) ships **single-tenant**. Real multi-tenant consumers need a tenant identifier to flow from the incoming HTTP request, through workflow submission, into worker activities, and back into event emission and SSE streams. + +Without an explicit seam, multi-tenancy degrades into a cross-cutting hack: a `getTenantFromRequest(req)` helper called in 17 routes, a `tenant_id` quietly threaded through every executor, a `WHERE tenant_id = ?` clause every consumer remembers to add or forgets. Forgetting it once is the canonical multi-tenancy bug — one tenant reading another's data. + +## Decision + +Introduce `TenantContextPort` (`apps/backend/src/tenant/`): an interface with a single method `resolve(request) → TenantContext | null`, plus a default `NoopTenantContextPort` that resolves to `null` on every request. A consumer makes the backend multi-tenant by swapping one line (the port instance), not by re-implementing the wiring. + +The port is small on purpose; the **wiring around it (the five seams) is the actual contract**, documented in [`multi-tenancy.md`](./multi-tenancy.md). Two structural commitments are load-bearing and stated here because the code does not enforce them: + +- **The port lives in `apps/backend`, not `packages/execution-core`.** `execution-core` stays tenant-agnostic. If a consumer's executors need the tenant at runtime, it rides the existing opaque `ExecutionContext.variables` bag; the reference never injects it. +- **`resolve()` takes the raw `Request`**, so a tenant adapter never depends on auth having run. The two seams compose without coupling. + +## Responsibility boundary: TenantContextPort vs AuthPort + +The backend has two identity seams and they own different things. **Getting this split wrong is how the cross-tenant bug sneaks back in**, so it is stated explicitly — the code cannot express it. + +- **AuthPort owns per-resource authorization, including tenant ownership.** Every scoped route calls `assertAuthorized(c, action, resource)` before touching data. In a multi-tenant deployment "may act on this resource" _means_ "belongs to the caller's tenant". This is the single, systematic place resource scoping lives. Sprinkling `if (row.tenantId !== caller.tenantId)` into every route is the shotgun anti-pattern this design exists to avoid. +- **TenantContextPort owns identity propagation** — resolve the tenant once at the HTTP boundary, carry it onto the execution row and (denormalised) event rows, so reads, the worker, and RLS can all scope by it. **Propagation is not isolation.** + +The SSE stream cross-check (seam 4) is the **one deliberate exception** — defence-in-depth, not the general guard. `GET /:id/stream` cannot use a bearer token (EventSource sends no `Authorization` header), so its auth falls back to weaker query-param/cookie schemes; the tenant resolved independently by `TenantContextPort` gives a second check on exactly that weak path. `GET /:id` and `DELETE /:id` carry no such cross-check on purpose — they rely on `AuthPort` plus RLS. + +**Consequence a consumer must internalise:** implementing `TenantContextPort` alone does **not** isolate tenants. Isolation is `AuthPort` (app layer) plus RLS (DB layer). The seam map is the full job. + +## Threat model + +What this design defends against, and what it deliberately does not — so a reviewer can tell an accepted risk from an oversight. + +- **Defended:** a tenant reading or streaming another tenant's executions/events through the API, once a real `AuthPort` is plugged in (app layer) and RLS is enabled (DB layer). Enumeration of foreign execution ids is closed by the 404-not-403 choice on the stream boundary (a recognisable 403 would confirm the id exists in another tenant; an indistinguishable 404 does not). +- **Out of scope (accepted, not overlooked):** + - **Untenanted rows (`tenant_id IS NULL`) are globally visible at the app layer.** The null-tolerant cross-check required for the single-tenant reference means these rows are not isolated until RLS is on. Acceptable because the reference is single-tenant; a multi-tenant consumer must enable RLS and backfill. + - **An insider or process with direct Postgres access.** RLS raises the bar but the app trusts its DB session; this is not a defence against a compromised database role. + - **Side-channel / timing inference.** Response-time differences between tenanted paths are not equalised. + +## Alternative options considered + +- **Bake tenancy into `execution-core`.** Rejected. The runner ships to consumers who may not be multi-tenant; a `tenantId` on `BaseNode`/`ExecutionContext` would force every adapter to know about tenants. The variables-bag transport keeps execution-core ignorant and the field optional. +- **A single inline `getTenant(request)` helper.** Rejected. A helper called across 17 routes is exactly the cross-cutting hack to avoid. A port plus middleware adds one indirection but survives swap-in scenarios (subdomain → JWT, multi-region tenant store) without touching every route. +- **Thread `tenantId` through the workflow to the event INSERT** (instead of the seam-3 subquery). Rejected. Re-introduces a per-call-site parameter the worker must carry and could forget, coupling the otherwise tenant-agnostic worker to tenancy. The subquery costs one indexed lookup per event and ties the event's tenant to its parent at insert time. +- **Per-tenant Temporal namespace.** Stronger isolation than RLS but multiplies operational cost and couples the application to Temporal. Out of scope; the port composes with it if a consumer chooses it. +- **Header-based vs subdomain-based resolution.** Not our call. The port stays resolution-method-agnostic. + +## Consequences + +- **Pros** + - Reference stays single-tenant with the no-op default; cloning and running it needs zero tenancy ceremony. + - Swap-one-line multi-tenancy for seams 1–4 — wiring is shipped, not left as an exercise; the consumer replaces the port instance and enables seam 5. + - A five-seam map tells consumers exactly where to wire and which seam owns what. + - No `execution-core` change; the worker stays tenant-agnostic. +- **Cons** + - **Identity propagation is not isolation** — the headline foot-gun if the responsibility boundary is skimmed. + - **Untenanted rows are app-layer visible** until RLS is on (see threat model). + - Event tenant-tagging depends on the `executions.tenant_id` subquery — one indexed lookup per event insert, traded for a worker that stays fully tenant-agnostic. + +## Verification status + +What is enforced by a test versus what is a contract only on paper. Honesty here matters more than completeness — an untested seam is a claim, not a guarantee. + +| Seam | Behaviour under test | Status | +| -------------------------------------------- | --------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- | +| 1 — middleware `resolve` / `requireTenant` | resolves once onto context; noop yields `null` | ✅ `tenant/middleware.test.ts`, `tenant/noop-tenant-context-port.test.ts` | +| 2 — execute stamps tenant on the row | `tenantId` written to the execution row, `null` in single-tenant mode, never into `variables` | ✅ `routes/workflows.test.ts` | +| 4 — SSE stream cross-check | 404-not-403 on mismatch (no existence leak); streams on match; no-op when either side `null` | ✅ `routes/executions.test.ts` | +| 3 — worker inherits `tenant_id` via subquery | event row's `tenant_id` matches its parent execution | ⚠️ **Not tested.** `execution-worker` has no test exercising `src/database.ts`. Paper contract — see follow-ups. | +| 5 — Postgres RLS | — | Not shipped; documented pattern in [`multi-tenancy.md`](./multi-tenancy.md). | + +## Revisit triggers + +This decision stands until one of these fires: + +- A real multi-tenant consumer proves the `variables`-bag transport for tenant-in-executor is too type-unsafe → reconsider a typed `tenantId` on `ExecutionContext` (reopens "bake into execution-core"). +- The reference itself goes multi-tenant, making globally-visible untenanted rows unacceptable → ship RLS plus a `NOT NULL` backfill, flipping seam 5 from pattern to code. +- The per-event subquery shows up in profiling at scale → revisit threading tenant through the worker. +- A consumer needs stronger isolation than RLS → open a namespace-per-tenant decision log. + +## Follow-ups + +- **Add a worker-side test for seam 3** (the subquery inheritance) — close the paper-contract gap flagged above. +- **AuthPort integration recipe**: the JWT-claim pattern, including parsing the token once in a combined middleware shared by both ports. +- **Ship a tested RLS recipe** under `apps/backend/migrations/recipes/multi-tenant.sql` (policy + a `withTenantScope` transaction helper), gated behind an env flag during `pnpm db:migrate`. Until it exists and is tested, seam 5 stays a documented pattern, not shipped code. +- **Tenant in `node_failed` payloads**: stamp tenant on event payloads so a sink can pivot logs by tenant without joining back to `executions`. + +## Status + +Accepted. Port + `NoopTenantContextPort` default + seams 1–4 wired as no-ops in the reference backend (seam 3 untested — see Verification status); seam 5 (RLS) documented as a consumer-enabled pattern in [`multi-tenancy.md`](./multi-tenancy.md). diff --git a/apps/execution-worker/src/database.ts b/apps/execution-worker/src/database.ts index b116186e9..8b2096172 100644 --- a/apps/execution-worker/src/database.ts +++ b/apps/execution-worker/src/database.ts @@ -8,7 +8,7 @@ const sql = postgres(env.DATABASE_URL); export const database = { async emitExecutionEvent(executionId: string, type: string, payload?: unknown, nodeId?: string) { await sql` - INSERT INTO execution_events (id, execution_id, sequence, timestamp, type, node_id, path_id, payload_json, created_at) + INSERT INTO execution_events (id, execution_id, sequence, timestamp, type, node_id, path_id, payload_json, tenant_id, created_at) VALUES ( gen_random_uuid(), ${executionId}, @@ -18,6 +18,7 @@ export const database = { ${nodeId ?? null}, ${null}, ${payload ? JSON.stringify(payload) : null}::jsonb, + (SELECT tenant_id FROM executions WHERE id = ${executionId}), now() ) `;