From fa5999dd76002fae3c8d5a3b804c1f883a2d3c48 Mon Sep 17 00:00:00 2001 From: Kacper Cierzniewski Date: Wed, 3 Jun 2026 13:58:19 +0200 Subject: [PATCH 1/2] feat(backend): introduce TenantContextPort seam for multi-tenant identity --- apps/backend/README.md | 2 + apps/backend/drizzle/0001_odd_iron_lad.sql | 4 + apps/backend/drizzle/meta/0001_snapshot.json | 359 ++++++++++++++++++ apps/backend/drizzle/meta/_journal.json | 7 + apps/backend/src/auth/middleware.ts | 9 +- apps/backend/src/db/schema.ts | 13 + apps/backend/src/events/drain-events.test.ts | 1 + .../src/events/serialized-drainer.test.ts | 1 + apps/backend/src/routes/executions.test.ts | 73 ++++ apps/backend/src/routes/executions.ts | 32 +- apps/backend/src/routes/workflows.test.ts | 65 ++++ apps/backend/src/routes/workflows.ts | 19 +- apps/backend/src/server.ts | 14 +- apps/backend/src/tenant/index.ts | 4 + apps/backend/src/tenant/middleware.test.ts | 120 ++++++ apps/backend/src/tenant/middleware.ts | 51 +++ .../tenant/noop-tenant-context-port.test.ts | 34 ++ .../src/tenant/noop-tenant-context-port.ts | 20 + .../backend/src/tenant/tenant-context-port.ts | 47 +++ .../tenant-context-port.decision-log.md | 212 +++++++++++ apps/execution-worker/src/database.ts | 3 +- 21 files changed, 1080 insertions(+), 10 deletions(-) create mode 100644 apps/backend/drizzle/0001_odd_iron_lad.sql create mode 100644 apps/backend/drizzle/meta/0001_snapshot.json create mode 100644 apps/backend/src/tenant/index.ts create mode 100644 apps/backend/src/tenant/middleware.test.ts create mode 100644 apps/backend/src/tenant/middleware.ts create mode 100644 apps/backend/src/tenant/noop-tenant-context-port.test.ts create mode 100644 apps/backend/src/tenant/noop-tenant-context-port.ts create mode 100644 apps/backend/src/tenant/tenant-context-port.ts create mode 100644 apps/backend/tenant-context-port.decision-log.md diff --git a/apps/backend/README.md b/apps/backend/README.md index fdb815078..c5cd592c9 100644 --- a/apps/backend/README.md +++ b/apps/backend/README.md @@ -2,6 +2,8 @@ > ⚠️ **Reference implementation, local development only.** > No real authentication. The bundled `AllowAllAuthPort` permits every caller and every action (see `src/auth/`), and the constructor refuses to start without the explicit `WB_AUTH_PORT=allow-all` opt-in so a forgotten env var fails loudly. No tenant isolation. The HTTP server and the docker-compose services bind to `127.0.0.1` by default. Do not expose to the internet or shared networks without first plugging in a real `AuthPort`, see [`auth-port.decision-log.md`](./auth-port.decision-log.md) for the seam, default, and a JWT adapter sketch. +> +> Seams for consumers to plug in: [`AuthPort`](./auth-port.decision-log.md) for authn/authz, [`TenantContextPort`](./tenant-context-port.decision-log.md) for multi-tenant identity propagation. > **Note:** setup is in [root README "Path B. Run the full stack demo"](../../README.md#path-b-run-the-full-stack-demo). This file documents the backend's internals, not how to start it. diff --git a/apps/backend/drizzle/0001_odd_iron_lad.sql b/apps/backend/drizzle/0001_odd_iron_lad.sql new file mode 100644 index 000000000..efb4a351a --- /dev/null +++ b/apps/backend/drizzle/0001_odd_iron_lad.sql @@ -0,0 +1,4 @@ +ALTER TABLE "execution_events" ADD COLUMN "tenant_id" text;--> statement-breakpoint +ALTER TABLE "executions" ADD COLUMN "tenant_id" text;--> statement-breakpoint +CREATE INDEX "execution_events_tenant_id_idx" ON "execution_events" USING btree ("tenant_id");--> statement-breakpoint +CREATE INDEX "executions_tenant_id_idx" ON "executions" USING btree ("tenant_id"); \ No newline at end of file diff --git a/apps/backend/drizzle/meta/0001_snapshot.json b/apps/backend/drizzle/meta/0001_snapshot.json new file mode 100644 index 000000000..665f42c31 --- /dev/null +++ b/apps/backend/drizzle/meta/0001_snapshot.json @@ -0,0 +1,359 @@ +{ + "id": "a831f5bf-efaf-47d7-8a84-9845bdf309de", + "prevId": "6d12f5a1-dc40-4f2c-b339-9613ed78fce7", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.execution_events": { + "name": "execution_events", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "execution_id": { + "name": "execution_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "sequence": { + "name": "sequence", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "timestamp": { + "name": "timestamp", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "path_id": { + "name": "path_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "payload_json": { + "name": "payload_json", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "tenant_id": { + "name": "tenant_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "execution_events_execution_sequence_idx": { + "name": "execution_events_execution_sequence_idx", + "columns": [ + { + "expression": "execution_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "sequence", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "execution_events_execution_id_idx": { + "name": "execution_events_execution_id_idx", + "columns": [ + { + "expression": "execution_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "execution_events_tenant_id_idx": { + "name": "execution_events_tenant_id_idx", + "columns": [ + { + "expression": "tenant_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "execution_events_execution_id_executions_id_fk": { + "name": "execution_events_execution_id_executions_id_fk", + "tableFrom": "execution_events", + "tableTo": "executions", + "columnsFrom": ["execution_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.executions": { + "name": "executions", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "workflow_id": { + "name": "workflow_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "source_version": { + "name": "source_version", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "workflow_snapshot_json": { + "name": "workflow_snapshot_json", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'pending'" + }, + "trigger_payload_json": { + "name": "trigger_payload_json", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "tenant_id": { + "name": "tenant_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "started_at": { + "name": "started_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "finished_at": { + "name": "finished_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "error_message": { + "name": "error_message", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "executions_workflow_id_idx": { + "name": "executions_workflow_id_idx", + "columns": [ + { + "expression": "workflow_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "executions_status_idx": { + "name": "executions_status_idx", + "columns": [ + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "executions_tenant_id_idx": { + "name": "executions_tenant_id_idx", + "columns": [ + { + "expression": "tenant_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "executions_workflow_id_workflows_id_fk": { + "name": "executions_workflow_id_workflows_id_fk", + "tableFrom": "executions", + "tableTo": "workflows", + "columnsFrom": ["workflow_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.workflows": { + "name": "workflows", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "draft_json": { + "name": "draft_json", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "published_json": { + "name": "published_json", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "published_at": { + "name": "published_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/apps/backend/drizzle/meta/_journal.json b/apps/backend/drizzle/meta/_journal.json index af89d3820..503fd982b 100644 --- a/apps/backend/drizzle/meta/_journal.json +++ b/apps/backend/drizzle/meta/_journal.json @@ -8,6 +8,13 @@ "when": 1776110826965, "tag": "0000_steady_quicksilver", "breakpoints": true + }, + { + "idx": 1, + "version": "7", + "when": 1780490055890, + "tag": "0001_odd_iron_lad", + "breakpoints": true } ] } diff --git a/apps/backend/src/auth/middleware.ts b/apps/backend/src/auth/middleware.ts index 2264b059f..9b481472c 100644 --- a/apps/backend/src/auth/middleware.ts +++ b/apps/backend/src/auth/middleware.ts @@ -37,8 +37,13 @@ export function createAuthMiddleware(authPort: AuthPort): MiddlewareHandler<{ Va * // proceed - if we got here, the caller is authorized * ``` */ -export type AssertAuthorized = ( - c: Context<{ Variables: AuthVariables }>, +// Generic over the context's `Variables` (constrained to extend AuthVariables) +// so a route can compose auth with other per-request seams, e.g. run under +// `AuthVariables & TenantVariables`. Hono's `Context.set` is invariant in +// `Variables`, so a fixed `Context<{ Variables: AuthVariables }>` parameter +// would reject any widened context. Do not narrow this back to a concrete type. +export type AssertAuthorized = ( + c: Context<{ Variables: V }>, action: AuthAction, resource: AuthResource, ) => Promise; diff --git a/apps/backend/src/db/schema.ts b/apps/backend/src/db/schema.ts index 38dfe95a6..4151f83e0 100644 --- a/apps/backend/src/db/schema.ts +++ b/apps/backend/src/db/schema.ts @@ -21,6 +21,12 @@ export const executions = pgTable( workflowSnapshotJson: jsonb('workflow_snapshot_json').notNull(), status: text('status').notNull().default('pending'), triggerPayloadJson: jsonb('trigger_payload_json'), + // Nullable: single-tenant deployments leave it NULL on every row. + // Multi-tenant consumers populate via TenantContextPort at /execute time. + // `text`, not `uuid`: TenantContext.tenantId is an opaque string (UUID or + // slug), so the column must accept whatever the consumer's port returns. + // See apps/backend/tenant-context-port.decision-log.md for the wiring. + tenantId: text('tenant_id'), startedAt: timestamp('started_at', { withTimezone: true }), finishedAt: timestamp('finished_at', { withTimezone: true }), errorMessage: text('error_message'), @@ -30,6 +36,7 @@ export const executions = pgTable( (table) => [ index('executions_workflow_id_idx').on(table.workflowId), index('executions_status_idx').on(table.status), + index('executions_tenant_id_idx').on(table.tenantId), ], ); @@ -46,10 +53,16 @@ export const executionEvents = pgTable( nodeId: text('node_id'), pathId: text('path_id'), payloadJson: jsonb('payload_json'), + // Denormalised from `executions.tenant_id` so analytics queries can scope + // events by tenant without joining. Worker writes inherit it from the + // parent execution via subquery at INSERT time (see worker database.ts). + // `text` to match executions.tenant_id. NULL in single-tenant mode. + tenantId: text('tenant_id'), createdAt: timestamp('created_at', { withTimezone: true }).notNull().defaultNow(), }, (table) => [ uniqueIndex('execution_events_execution_sequence_idx').on(table.executionId, table.sequence), index('execution_events_execution_id_idx').on(table.executionId), + index('execution_events_tenant_id_idx').on(table.tenantId), ], ); diff --git a/apps/backend/src/events/drain-events.test.ts b/apps/backend/src/events/drain-events.test.ts index 0d928134b..b365c7fd7 100644 --- a/apps/backend/src/events/drain-events.test.ts +++ b/apps/backend/src/events/drain-events.test.ts @@ -14,6 +14,7 @@ function makeEvent(executionId: string, sequence: number, type = 'node_completed nodeId: null, pathId: null, payloadJson: null, + tenantId: null, createdAt: stamp, }; } diff --git a/apps/backend/src/events/serialized-drainer.test.ts b/apps/backend/src/events/serialized-drainer.test.ts index 79ef8bde0..5503f3977 100644 --- a/apps/backend/src/events/serialized-drainer.test.ts +++ b/apps/backend/src/events/serialized-drainer.test.ts @@ -15,6 +15,7 @@ function makeEvent(executionId: string, sequence: number, type = 'node_completed nodeId: null, pathId: null, payloadJson: null, + tenantId: null, createdAt: stamp, }; } diff --git a/apps/backend/src/routes/executions.test.ts b/apps/backend/src/routes/executions.test.ts index 0b560f442..0c702df36 100644 --- a/apps/backend/src/routes/executions.test.ts +++ b/apps/backend/src/routes/executions.test.ts @@ -10,6 +10,7 @@ import { createAuthMiddleware, makeAssertAuthorized, } from '../auth'; +import { type TenantContext, type TenantVariables, createTenantMiddleware } from '../tenant'; import { createExecutionsRoutes } from './executions'; // ---- module mocks ----------------------------------------------------------- @@ -88,6 +89,17 @@ function denyAll(): AuthPort { }; } +// Same wiring as buildApp, plus the tenant middleware so the route's tenant +// cross-check has a `c.var.tenant` to read. `tenant` is what the configured +// TenantContextPort resolves to (null = single-tenant reference default). +function buildAppWithTenant(port: AuthPort, tenant: TenantContext | null) { + const app = new Hono<{ Variables: AuthVariables & TenantVariables }>(); + app.use('*', createAuthMiddleware(port)); + app.use('*', createTenantMiddleware({ resolve: vi.fn(async () => tenant) })); + app.route('/api/executions', createExecutionsRoutes(makeAssertAuthorized(port))); + return app; +} + // Terminal status so the SSE handler returns after the snapshot and the test // does not have to chase a long-lived stream connection. const terminalExecution = { @@ -188,3 +200,64 @@ describe('createExecutionsRoutes - deny short-circuits before DB or engine work' expect(subscribeMock).not.toHaveBeenCalled(); }); }); + +// ---- tenant cross-check on the SSE stream ---------------------------------- +// +// The stream is the one execution-row endpoint that enforces tenant isolation +// directly, as defence-in-depth for EventSource's weaker auth (it cannot send +// an Authorization header). Resource-level scoping of GET/:id and DELETE/:id +// is the AuthPort's job - see tenant-context-port.decision-log.md. The check +// is a no-op when either side is null, which keeps the single-tenant reference +// (NoopTenantContextPort resolves null) behaving exactly as before. + +const tenantedExecution = { ...terminalExecution, tenantId: 'acme' }; + +const allowStream = () => allowAll(vi.fn(async () => true)); + +function programStream(execution: unknown) { + // 1st select: the executions row. 2nd: the events catch-up query. + databaseMock.select.mockReturnValueOnce(chainResolving([execution])); + databaseMock.select.mockReturnValue(chainResolving([])); +} + +describe('createExecutionsRoutes - stream tenant cross-check', () => { + it('404 (not 403) when caller tenant differs - no cross-tenant existence leak', async () => { + const app = buildAppWithTenant(allowStream(), { tenantId: 'other' }); + programStream(tenantedExecution); + + const response = await app.request('/api/executions/e-1/stream'); + + // Byte-identical to the not-found branch: a foreign execution must be + // indistinguishable from one that does not exist, or the id is enumerable. + expect(response.status).toBe(404); + expect(await response.json()).toEqual({ code: 'execution_not_found', message: 'Execution not found' }); + expect(subscribeMock).not.toHaveBeenCalled(); + }); + + it('streams when caller tenant matches the execution tenant', async () => { + const app = buildAppWithTenant(allowStream(), { tenantId: 'acme' }); + programStream(tenantedExecution); + + const response = await app.request('/api/executions/e-1/stream'); + + expect(response.status).toBe(200); + }); + + it('no-op when caller has no tenant - single-tenant default - even if the execution is tenanted', async () => { + const app = buildAppWithTenant(allowStream(), null); + programStream(tenantedExecution); + + const response = await app.request('/api/executions/e-1/stream'); + + expect(response.status).toBe(200); + }); + + it('no-op when the execution has no tenant even if the caller is tenanted', async () => { + const app = buildAppWithTenant(allowStream(), { tenantId: 'acme' }); + programStream(terminalExecution); // terminalExecution carries no tenantId + + const response = await app.request('/api/executions/e-1/stream'); + + expect(response.status).toBe(200); + }); +}); diff --git a/apps/backend/src/routes/executions.ts b/apps/backend/src/routes/executions.ts index 2d82a69de..0934f645b 100644 --- a/apps/backend/src/routes/executions.ts +++ b/apps/backend/src/routes/executions.ts @@ -11,13 +11,16 @@ import { subscribe } from '../events/execution-event-bus'; import { type ExecutionEventRow, fetchEventsAfter } from '../events/fetch-events-after'; import { createSerializedDrainer } from '../events/serialized-drainer'; import { logger as backendLogger } from '../logger'; +import type { TenantVariables } from '../tenant'; const logger = backendLogger.child({ component: 'executions-route' }); const TERMINAL_STATUSES = new Set(['completed', 'failed', 'cancelled']); -export function createExecutionsRoutes(assertAuthorized: AssertAuthorized): Hono<{ Variables: AuthVariables }> { - const routes = new Hono<{ Variables: AuthVariables }>(); +export function createExecutionsRoutes( + assertAuthorized: AssertAuthorized, +): Hono<{ Variables: AuthVariables & TenantVariables }> { + const routes = new Hono<{ Variables: AuthVariables & TenantVariables }>(); routes.get('/:id', async (c) => { const executionId = c.req.param('id'); @@ -57,6 +60,31 @@ export function createExecutionsRoutes(assertAuthorized: AssertAuthorized): Hono return c.json({ code: 'execution_not_found', message: 'Execution not found' }, 404); } + // Tenant cross-check, scoped to the stream on purpose. This is NOT the + // general per-resource tenant guard - resource-level scoping of GET/:id + // and DELETE/:id is the AuthPort's job (it receives { kind: 'execution', + // executionId } and a real adapter checks ownership). The stream gets an + // extra, independent check because EventSource cannot send an + // Authorization header, so its auth falls back to weaker query-param / + // cookie schemes (see the comment above this handler). The tenant resolved + // by TenantContextPort - which can come from a subdomain or cookie - is a + // second line of defence on exactly that weak path. + // + // No-op when either side is null: a tenant-less caller (single-tenant + // reference, NoopTenantContextPort) or an untenanted execution row passes + // through unchanged. Untenanted rows are therefore globally visible at the + // app layer; Postgres RLS (decision-log seam 5) is the systematic backstop + // for deployments that need rows to be invisible across tenants by default. + // + // On mismatch we return 404, byte-identical to the not-found branch above, + // not 403. A distinct "belongs to another tenant" response would confirm + // the id exists in some other tenant, letting a caller enumerate foreign + // executions. Indistinguishable-from-absent is the only safe answer here. + const tenant = c.var.tenant; + if (tenant && execution.tenantId && execution.tenantId !== tenant.tenantId) { + return c.json({ code: 'execution_not_found', message: 'Execution not found' }, 404); + } + return streamSSE(c, async (stream) => { // Catch-up snapshot. Reuses the same incremental query (afterSequence=0) // that powers live drains — one query shape across the route, not two. diff --git a/apps/backend/src/routes/workflows.test.ts b/apps/backend/src/routes/workflows.test.ts index 7a5dd2dbe..5e6bacb54 100644 --- a/apps/backend/src/routes/workflows.test.ts +++ b/apps/backend/src/routes/workflows.test.ts @@ -10,6 +10,7 @@ import { createAuthMiddleware, makeAssertAuthorized, } from '../auth'; +import { type TenantContext, type TenantVariables, createTenantMiddleware } from '../tenant'; import { createWorkflowsRoutes } from './workflows'; // ---- module mocks ----------------------------------------------------------- @@ -94,6 +95,31 @@ function denyAll(): AuthPort { }; } +// buildApp plus the tenant middleware, so the execute route can read +// `c.var.tenant`. `tenant` is what the configured TenantContextPort resolves +// to (null = single-tenant reference default). +function buildAppWithTenant(port: AuthPort, tenant: TenantContext | null) { + const app = new Hono<{ Variables: AuthVariables & TenantVariables }>(); + app.use('*', createAuthMiddleware(port)); + app.use('*', createTenantMiddleware({ resolve: vi.fn(async () => tenant) })); + app.route('/api/workflows', createWorkflowsRoutes(makeAssertAuthorized(port))); + return app; +} + +// Capture what the execute route passes to `database.insert(...).values(...)`. +// The chainable proxy used elsewhere discards call arguments, so the row-stamp +// assertion needs a mock that records the values object. +function captureExecuteInsert(): { values?: Record } { + const captured: { values?: Record } = {}; + databaseMock.insert.mockImplementation(() => ({ + values: (v: Record) => { + captured.values = v; + return { returning: () => chainResolving([{ id: 'e-1', status: 'pending' }]) }; + }, + })); + return captured; +} + const fakeWorkflow = { id: 'w-1', name: 'demo', @@ -203,3 +229,42 @@ describe('createWorkflowsRoutes - deny short-circuits before any DB access', () expect(engineMock.cancel).not.toHaveBeenCalled(); }); }); + +// ---- tenant propagation on execute ----------------------------------------- +// +// The resolved tenant is stamped onto the executions row (the worker's event +// subquery reads it back from there). It is deliberately NOT placed in the +// engine `variables` bag: no reference executor reads it, so `variables` stays +// empty regardless of whether a tenant was resolved. + +function executeRequest(app: ReturnType) { + return app.request('/api/workflows/w-1/execute', { + method: 'POST', + body: JSON.stringify({ sourceVersion: 'draft' }), + headers: { 'content-type': 'application/json' }, + }); +} + +describe('createWorkflowsRoutes - execute propagates tenant identity', () => { + it('stamps the resolved tenantId onto the execution row, not into variables', async () => { + const app = buildAppWithTenant(allowAll(vi.fn(async () => true)), { tenantId: 'acme' }); + databaseMock.select.mockReturnValue(chainResolving([fakeWorkflow])); + const insert = captureExecuteInsert(); + + await executeRequest(app); + + expect(insert.values?.tenantId).toBe('acme'); + expect(engineMock.submit).toHaveBeenCalledWith(expect.objectContaining({ variables: {} })); + }); + + it('writes null tenantId in single-tenant mode', async () => { + const app = buildAppWithTenant(allowAll(vi.fn(async () => true)), null); + databaseMock.select.mockReturnValue(chainResolving([fakeWorkflow])); + const insert = captureExecuteInsert(); + + await executeRequest(app); + + expect(insert.values?.tenantId).toBeNull(); + expect(engineMock.submit).toHaveBeenCalledWith(expect.objectContaining({ variables: {} })); + }); +}); diff --git a/apps/backend/src/routes/workflows.ts b/apps/backend/src/routes/workflows.ts index e08ff7e27..20d632f81 100644 --- a/apps/backend/src/routes/workflows.ts +++ b/apps/backend/src/routes/workflows.ts @@ -9,6 +9,7 @@ import { mapToExecutionModel } from '../domain/mapper/from-integration-data'; import { workflowSnapshotSchema } from '../domain/mapper/snapshot-schema'; import { getWorkflowEngine } from '../engine'; import { logger as backendLogger } from '../logger'; +import type { TenantVariables } from '../tenant'; const logger = backendLogger.child({ component: 'workflows-route' }); @@ -34,8 +35,10 @@ function formatValidationDetails(error: z.ZodError) { })); } -export function createWorkflowsRoutes(assertAuthorized: AssertAuthorized): Hono<{ Variables: AuthVariables }> { - const routes = new Hono<{ Variables: AuthVariables }>(); +export function createWorkflowsRoutes( + assertAuthorized: AssertAuthorized, +): Hono<{ Variables: AuthVariables & TenantVariables }> { + const routes = new Hono<{ Variables: AuthVariables & TenantVariables }>(); routes.post('/', async (c) => { await assertAuthorized(c, 'workflows:create', { kind: 'workflows' }); @@ -209,6 +212,17 @@ export function createWorkflowsRoutes(assertAuthorized: AssertAuthorized): Hono< ); } + // Propagate tenant identity from the HTTP boundary onto the execution row. + // The worker reads it back via subquery for event tagging (see worker + // database.ts), so it never has to know tenancy exists. Null in + // single-tenant mode (the reference default). + // + // We deliberately do NOT stuff tenantId into the engine `variables` bag: + // no reference node executor reads it, so shipping it would be dead + // transport. Consumers whose executors need tenant at runtime add it to + // `variables` themselves - see seam 2 in tenant-context-port.decision-log.md. + const tenantId = c.var.tenant?.tenantId ?? null; + const [execution] = await database .insert(executions) .values({ @@ -217,6 +231,7 @@ export function createWorkflowsRoutes(assertAuthorized: AssertAuthorized): Hono< workflowSnapshotJson: snapshotJson, status: 'pending', triggerPayloadJson: body.triggerPayload ?? null, + tenantId, }) .returning(); diff --git a/apps/backend/src/server.ts b/apps/backend/src/server.ts index 61a36f952..351dc9a81 100644 --- a/apps/backend/src/server.ts +++ b/apps/backend/src/server.ts @@ -16,6 +16,7 @@ import { env } from './env'; import { logger } from './logger'; import { createExecutionsRoutes } from './routes/executions'; import { createWorkflowsRoutes } from './routes/workflows'; +import { NoopTenantContextPort, type TenantContextPort, type TenantVariables, createTenantMiddleware } from './tenant'; // Permissive default for local development. The constructor itself emits a // loud startup warning and refuses to boot unless `WB_AUTH_PORT=allow-all` is @@ -25,7 +26,12 @@ const authPort: AuthPort = new AllowAllAuthPort(); const assertAuthorized = makeAssertAuthorized(authPort); -const app = new Hono<{ Variables: AuthVariables }>(); +// Default single-tenant: every request lands with `c.var.tenant === null`. +// Multi-tenant consumers swap in their own implementation (subdomain, JWT +// claim, header, …) — see `apps/backend/tenant-context-port.decision-log.md`. +const tenantPort: TenantContextPort = new NoopTenantContextPort(); + +const app = new Hono<{ Variables: AuthVariables & TenantVariables }>(); app.use('/*', cors()); // Reject request bodies larger than 1 MB to prevent memory exhaustion @@ -46,11 +52,13 @@ app.onError((error, c) => { return c.json({ code: 'internal_error', message: 'Internal server error' }, 500); }); -// Health check is intentionally registered before the auth middleware so it -// stays accessible to monitoring tools that don't carry credentials. +// Health check is intentionally registered before the auth and tenant +// middleware so it stays accessible to monitoring tools that don't carry +// credentials or tenant headers. app.get('/api/health', (c) => c.json({ status: 'ok' })); app.use('/api/*', createAuthMiddleware(authPort)); +app.use('/api/*', createTenantMiddleware(tenantPort)); app.route('/api/workflows', createWorkflowsRoutes(assertAuthorized)); app.route('/api/executions', createExecutionsRoutes(assertAuthorized)); diff --git a/apps/backend/src/tenant/index.ts b/apps/backend/src/tenant/index.ts new file mode 100644 index 000000000..6b9f3f43e --- /dev/null +++ b/apps/backend/src/tenant/index.ts @@ -0,0 +1,4 @@ +export { createTenantMiddleware, requireTenant } from './middleware'; +export type { TenantVariables } from './middleware'; +export { NoopTenantContextPort } from './noop-tenant-context-port'; +export type { TenantContext, TenantContextPort } from './tenant-context-port'; diff --git a/apps/backend/src/tenant/middleware.test.ts b/apps/backend/src/tenant/middleware.test.ts new file mode 100644 index 000000000..2d7a13741 --- /dev/null +++ b/apps/backend/src/tenant/middleware.test.ts @@ -0,0 +1,120 @@ +import { Hono } from 'hono'; +import { describe, expect, it, vi } from 'vitest'; + +// Import via the barrel — same path a third-party adapter author would use. +import { + type TenantContext, + type TenantContextPort, + type TenantVariables, + createTenantMiddleware, + requireTenant, +} from '.'; + +function fakePort(overrides: Partial = {}): TenantContextPort { + return { + resolve: vi.fn(async () => null), + ...overrides, + }; +} + +function makeApp(port: TenantContextPort): Hono<{ Variables: TenantVariables }> { + const app = new Hono<{ Variables: TenantVariables }>(); + app.use('*', createTenantMiddleware(port)); + + app.get('/probe', (c) => c.json({ tenant: c.var.tenant })); + + app.get('/scoped/:id', (c) => { + const tenant = requireTenant(c); + if (tenant instanceof Response) return tenant; + return c.json({ id: c.req.param('id'), tenantId: tenant.tenantId }); + }); + + return app; +} + +describe('createTenantMiddleware', () => { + it('stashes the resolved tenant on the request context', async () => { + const tenant: TenantContext = { tenantId: 'acme' }; + const port = fakePort({ resolve: vi.fn(async () => tenant) }); + + const response = await makeApp(port).request('/probe'); + + expect(response.status).toBe(200); + expect(await response.json()).toEqual({ tenant }); + expect(port.resolve).toHaveBeenCalledTimes(1); + }); + + it('stashes null when the port returns null — single-tenant default', async () => { + const port = fakePort({ resolve: vi.fn(async () => null) }); + + const response = await makeApp(port).request('/probe'); + + expect(response.status).toBe(200); + expect(await response.json()).toEqual({ tenant: null }); + }); + + it('forwards the raw Request object to the port — adapter sees full url + headers', async () => { + const seen: Request[] = []; + const port = fakePort({ + resolve: vi.fn(async (request: Request) => { + seen.push(request); + return null; + }), + }); + + await makeApp(port).request('/probe', { headers: { 'x-tenant-id': 'beta' } }); + + expect(seen).toHaveLength(1); + expect(seen[0]?.headers.get('x-tenant-id')).toBe('beta'); + }); + + it('resolves exactly once per request even with multiple handlers downstream', async () => { + const port = fakePort({ resolve: vi.fn(async () => ({ tenantId: 'acme' })) }); + + const app = new Hono<{ Variables: TenantVariables }>(); + app.use('*', createTenantMiddleware(port)); + app.get('/multi', async (c) => { + const a = c.var.tenant; + const b = c.var.tenant; + return c.json({ same: a === b, tenantId: a?.tenantId }); + }); + + const response = await app.request('/multi'); + + expect(await response.json()).toEqual({ same: true, tenantId: 'acme' }); + expect(port.resolve).toHaveBeenCalledTimes(1); + }); + + it('propagates an exception from the port — not silently coerced to null', async () => { + // Throw is reserved for unexpected failures (tenant store down, DNS timeout). + // Routes must NOT treat that as anonymous; Hono surfaces it as 500. + const port = fakePort({ + resolve: vi.fn(async () => { + throw new Error('tenant store unreachable'); + }), + }); + + const response = await makeApp(port).request('/probe'); + expect(response.status).toBe(500); + }); +}); + +describe('requireTenant', () => { + it('returns the resolved TenantContext when present', async () => { + const port = fakePort({ resolve: vi.fn(async () => ({ tenantId: 'acme' })) }); + + const response = await makeApp(port).request('/scoped/widget-1'); + + expect(response.status).toBe(200); + expect(await response.json()).toEqual({ id: 'widget-1', tenantId: 'acme' }); + }); + + it('returns a 400 Response when no tenant is on the context', async () => { + const port = fakePort({ resolve: vi.fn(async () => null) }); + + const response = await makeApp(port).request('/scoped/widget-1'); + + expect(response.status).toBe(400); + expect(await response.json()).toEqual({ code: 'tenant_required', message: 'Tenant context required' }); + }); +}); diff --git a/apps/backend/src/tenant/middleware.ts b/apps/backend/src/tenant/middleware.ts new file mode 100644 index 000000000..3a1259880 --- /dev/null +++ b/apps/backend/src/tenant/middleware.ts @@ -0,0 +1,51 @@ +import type { Context, MiddlewareHandler } from 'hono'; + +import type { TenantContext, TenantContextPort } from './tenant-context-port'; + +/** + * Stash for the resolved tenant on a Hono request context. `null` means the + * configured {@link TenantContextPort} did not identify a tenant for this + * request (e.g. health check, single-tenant reference mode). Routes that + * require a tenant use {@link requireTenant}. + */ +export type TenantVariables = { + tenant: TenantContext | null; +}; + +/** + * Identify the tenant once per request and stash it on the Hono context. + * Routes downstream read `c.var.tenant`. With the default + * `NoopTenantContextPort`, every request lands with `tenant === null` — + * the reference backend is single-tenant — and downstream code treats that + * as the standard case. + */ +export function createTenantMiddleware(port: TenantContextPort): MiddlewareHandler<{ Variables: TenantVariables }> { + return async (c, next) => { + c.set('tenant', await port.resolve(c.req.raw)); + await next(); + }; +} + +/** + * Convenience for routes that REQUIRE a tenant. Returns the `TenantContext` + * to use, or a 400 `Response` if no tenant was resolved. Use the early-return + * idiom: + * + * ```ts + * const tenant = requireTenant(c); + * if (tenant instanceof Response) return tenant; + * // tenant is TenantContext from here on + * ``` + * + * In the reference (single-tenant) setup with `NoopTenantContextPort`, this + * helper always short-circuits to 400 — which is fine, because no route in + * the reference backend currently calls it. Multi-tenant consumers call it + * from routes that should reject tenant-less callers. + */ +export function requireTenant(c: Context<{ Variables: TenantVariables }>): TenantContext | Response { + const tenant = c.var.tenant; + if (!tenant) { + return c.json({ code: 'tenant_required', message: 'Tenant context required' }, 400); + } + return tenant; +} diff --git a/apps/backend/src/tenant/noop-tenant-context-port.test.ts b/apps/backend/src/tenant/noop-tenant-context-port.test.ts new file mode 100644 index 000000000..8fbe311f7 --- /dev/null +++ b/apps/backend/src/tenant/noop-tenant-context-port.test.ts @@ -0,0 +1,34 @@ +import { describe, expect, it } from 'vitest'; + +// Imported via the barrel — the same path a third-party adapter author would +// use. Pins that the public surface re-exports both the no-op impl and the +// interface in one place; knip flags if either drifts out of the barrel. +import { NoopTenantContextPort, type TenantContextPort } from '.'; + +describe('NoopTenantContextPort', () => { + // Typed as the interface so the test exercises the contract callers see + // rather than the impl's narrowed signature. + const port: TenantContextPort = new NoopTenantContextPort(); + + it('resolves to null — the reference backend is single-tenant', async () => { + const request = new Request('http://localhost/api/workflows'); + await expect(port.resolve(request)).resolves.toBeNull(); + }); + + it('does not throw for any request shape — tenancy is an opt-in seam', async () => { + // Subdomain hint, header hint, POST without body — none of these should + // make the no-op resolver care. Real adapters interpret one of them. + const requests = [ + new Request('http://localhost/'), + new Request('http://acme.example.com/api/x', { headers: { 'x-tenant-id': 'acme' } }), + new Request('http://localhost/api/health', { method: 'POST' }), + new Request('http://localhost/api/y', { + headers: { authorization: 'Bearer eyJhbGc...' }, + }), + ]; + + for (const request of requests) { + await expect(port.resolve(request)).resolves.toBeNull(); + } + }); +}); diff --git a/apps/backend/src/tenant/noop-tenant-context-port.ts b/apps/backend/src/tenant/noop-tenant-context-port.ts new file mode 100644 index 000000000..1df1bfba0 --- /dev/null +++ b/apps/backend/src/tenant/noop-tenant-context-port.ts @@ -0,0 +1,20 @@ +import type { TenantContext, TenantContextPort } from './tenant-context-port'; + +/** + * Default {@link TenantContextPort} for the single-tenant reference backend: + * resolves to `null` on every request. The reference backend operates as a + * single logical tenant, so attaching tenant context would be ceremony with + * no enforcement. + * + * Multi-tenant consumers swap this for a real adapter that reads from a + * subdomain (`https://acme.example.com/api/…`), a JWT claim, a header, a + * session store, or whatever fits their identity model. See + * `apps/backend/tenant-context-port.decision-log.md` for the integration + * pattern (five seams: HTTP middleware, route propagation, persistence, + * SSE filter, Postgres RLS) the port plugs into. + */ +export class NoopTenantContextPort implements TenantContextPort { + async resolve(): Promise { + return null; + } +} diff --git a/apps/backend/src/tenant/tenant-context-port.ts b/apps/backend/src/tenant/tenant-context-port.ts new file mode 100644 index 000000000..0d38c616a --- /dev/null +++ b/apps/backend/src/tenant/tenant-context-port.ts @@ -0,0 +1,47 @@ +/** + * TenantContextPort — multi-tenant identity seam for the reference backend. + * + * The reference backend ships **single-tenant**. Real multi-tenant + * deployments need to propagate a tenant identifier from the incoming HTTP + * request, through workflow submission, into worker activities, and back + * into event emission and SSE streams. This port surfaces the seam without + * prescribing **how** a tenant is resolved — adapters can pick subdomain, + * `X-Tenant-Id` header, JWT claim, session lookup, anything. + * + * Why a port and not a one-off helper: tenant identity has to flow through + * five different boundaries (HTTP, route handlers, workflow engine, worker + * activities, persistence). Without an explicit seam, multi-tenancy ends up + * as a cross-cutting hack scattered across the codebase. A typo at any + * boundary is a security incident — one tenant reading another's data. + * + * Implementations must not throw for "no tenant" — return `null`. Routes + * decide whether anonymous-tenant traffic is allowed (rejecting `null` on + * scoped routes, accepting it on health checks). + */ + +export type TenantContext = { + /** + * Stable tenant identifier, an opaque string (UUID or slug). Persisted as + * `text` in `executions.tenant_id`, so any string the adapter returns round + * trips. Consumers that need richer per-tenant data (plan tier, region, + * flags) extend this type in their own adapter. + */ + readonly tenantId: string; +}; + +export interface TenantContextPort { + /** + * Resolve the tenant for an incoming request. + * + * - Return a {@link TenantContext} when the request carries enough signal + * to identify a tenant. + * - Return `null` for tenant-less requests (e.g. `/api/health`, public + * landing pages). Scoped routes treat `null` as 4xx; non-scoped routes + * accept it. + * + * Throw only for unexpected failures — the tenant-store is down, the + * subdomain DNS lookup timed out, the JWT verifier choked on JWKS. + * "Could not determine tenant" is `null`, not an exception. + */ + resolve(request: Request): Promise; +} diff --git a/apps/backend/tenant-context-port.decision-log.md b/apps/backend/tenant-context-port.decision-log.md new file mode 100644 index 000000000..7990d8a0e --- /dev/null +++ b/apps/backend/tenant-context-port.decision-log.md @@ -0,0 +1,212 @@ +### Title: TenantContextPort — multi-tenant identity seam for the reference backend + +### Proposed by: Kacper Cierzniewski + +### Date: 21.05.2026 + +## Context + +The reference backend (`apps/backend` + `apps/execution-worker`) ships **single-tenant**. Real-world multi-tenant deployments (the typical enterprise consumer) need to propagate a tenant identifier from the incoming HTTP request, through workflow submission, into worker activities, and back into event emission and SSE streams. + +Without an explicit seam, multi-tenancy ends up as a cross-cutting hack scattered across the codebase: a `getTenantFromRequest(req)` helper called in 17 routes, a hand-rolled middleware, a `tenant_id` parameter quietly threaded through every executor, a `WHERE tenant_id = ?` clause every consumer remembers to add or forgets. Mistakes here are exactly the canonical multi-tenancy bug, one tenant reading another's data. + +## Decision + +Introduce `TenantContextPort` (`apps/backend/src/tenant/`), an interface with a single method `resolve(request)` that returns `TenantContext | null`, plus a default `NoopTenantContextPort` that resolves to `null` on every request. + +**What actually ships (not docs, runtime code):** the reference backend wires four of the five seams below as no-ops. With `NoopTenantContextPort` the tenant is `null` on every request, so every wired seam degrades to its current single-tenant behaviour. A consumer makes the backend multi-tenant by swapping one line (the port instance), not by re-implementing the wiring. Seam 5 (Postgres RLS) is documented as a pattern only, because no sensible default policy exists across deployments (see its section). + +The port: + +- Lives in `apps/backend`, not in `packages/execution-core`. `execution-core` stays tenant-agnostic. If a consumer's node executors need tenant at runtime, the identifier is available to inject via the existing `ExecutionContext.variables` shape; the reference itself does not inject it (see seam 2). +- Ships with a `NoopTenantContextPort` default so the reference backend stays single-tenant out of the box, no enforcement, no ceremony. +- Is meant to be replaced by the consumer for any multi-tenant deployment. + +The port itself is small on purpose. The wiring around it (the seams) is the actual contract, and it is where the value is. + +## Responsibility boundary: TenantContextPort vs AuthPort + +The backend has two identity seams and they own different things. Getting this split wrong is how the cross-tenant bug sneaks back in, so it is stated explicitly. + +- **AuthPort owns per-resource authorization, including tenant ownership.** Every scoped route calls `assertAuthorized(c, action, { kind: 'execution', executionId })` before touching data. A real multi-tenant `AuthPort` checks that the caller may act on that specific resource, which in a multi-tenant deployment means "belongs to the caller's tenant". This is the single, systematic place resource scoping lives. Sprinkling `if (row.tenantId !== caller.tenantId)` into every route is the shotgun anti-pattern this design exists to avoid. +- **TenantContextPort owns identity propagation**, resolving the tenant once at the HTTP boundary and carrying it onto the execution row and (denormalised) event rows, so reads, the worker, and RLS can all scope by it. +- **The SSE stream cross-check is the one deliberate exception**, and it is defence-in-depth, not the general guard. `GET /:id/stream` cannot rely on a bearer token because EventSource cannot send an `Authorization` header, so its auth falls back to weaker query-param or cookie schemes. The tenant resolved by `TenantContextPort` (subdomain, cookie) gives an independent second check on exactly that weak path. `GET /:id` and `DELETE /:id` carry no such cross-check on purpose, they rely on `AuthPort` plus RLS. + +Consequence a consumer must internalise: implementing `TenantContextPort` alone does **not** isolate tenants. It propagates identity. Isolation is `AuthPort` (app layer) plus RLS (DB layer). The seam map below is the full job. + +## Integration patterns + +Seams 1-4 below are shipped and wired in the reference (no-op under `NoopTenantContextPort`). Seam 5 is a documented pattern the consumer enables. + +### Seam 1: HTTP middleware — resolve once per request (shipped) + +`createTenantMiddleware` calls `tenantPort.resolve(c.req.raw)` once at the boundary and stashes the result on the request context. `requireTenant` is a shipped convenience for routes that must reject tenant-less callers. No reference route calls `requireTenant` today (the reference is single-tenant, so it would always 400); it ships and is tested because multi-tenant consumers call it from their scoped routes, and shipping plus testing it is cheaper than every consumer re-deriving the same early-return idiom. + +```ts +// apps/backend/src/tenant/middleware.ts (shipped) +export function createTenantMiddleware(port: TenantContextPort): MiddlewareHandler<{ Variables: TenantVariables }> { + return async (c, next) => { + c.set('tenant', await port.resolve(c.req.raw)); + await next(); + }; +} + +export function requireTenant(c: Context<{ Variables: TenantVariables }>): TenantContext | Response { + const tenant = c.var.tenant; + if (!tenant) { + return c.json({ code: 'tenant_required', message: 'Tenant context required' }, 400); + } + return tenant; +} +``` + +Wired in `server.ts` after the auth middleware (so route handlers run under `AuthVariables & TenantVariables`): + +```ts +const tenantPort: TenantContextPort = new NoopTenantContextPort(); // swap for a real adapter +app.use('/api/*', createAuthMiddleware(authPort)); +app.use('/api/*', createTenantMiddleware(tenantPort)); +``` + +### Seam 2: route — propagate tenant into the execution (shipped) + +`POST /workflows/:id/execute` reads the resolved tenant and stamps it onto the execution row at submission time. `null` in single-tenant mode. The worker reads it back from that row via subquery for event tagging (seam 3), so it never needs the value threaded through the run. + +```ts +// apps/backend/src/routes/workflows.ts (shipped) +const tenantId = c.var.tenant?.tenantId ?? null; + +await database.insert(executions).values({ /* … */, tenantId }).returning(); + +await getWorkflowEngine().submit({ + workflowId, + executionId: execution.id, + definition, + triggerPayload: body.triggerPayload ?? {}, + variables: {}, // reference does NOT inject tenantId here, see below + global: {}, +}); +``` + +**Consumer extension (not shipped):** if a consumer's node executors need the tenant at runtime, inject it into the engine `variables` bag (`variables: tenantId ? { tenantId } : {}`) and read it inside the executor as `context.variables.tenantId`. `execution-core` never reads the field, it is opaque transport. The reference omits this because no reference executor consumes it, shipping it would be dead transport. + +### Seam 3: persistence — every row carries `tenant_id` (shipped) + +A migration adds a **nullable** `text` `tenant_id` to `executions` and `execution_events` plus an index on each. Nullable on purpose: the reference and any pre-existing rows leave it `NULL`, and a `NOT NULL` column would refuse to apply against existing data without a backfill story the reference cannot assume. `text`, not `uuid`: `TenantContext.tenantId` is an opaque string (UUID or slug), so the column must store whatever the consumer's port returns. A consumer who knows their tenants are UUIDs can tighten it. + +```sql +-- apps/backend/drizzle/0001_odd_iron_lad.sql (shipped) +ALTER TABLE "execution_events" ADD COLUMN "tenant_id" text; +ALTER TABLE "executions" ADD COLUMN "tenant_id" text; +CREATE INDEX "execution_events_tenant_id_idx" ON "execution_events" ("tenant_id"); +CREATE INDEX "executions_tenant_id_idx" ON "executions" ("tenant_id"); +``` + +The worker inherits `tenant_id` for each event row from the parent execution via a correlated subquery at INSERT time, rather than threading `tenantId` through the workflow and every `emitEvent` call. This keeps the worker and the Temporal workflow fully tenant-agnostic (no new parameter, no per-call-site leak) and makes the event's tenant DB-guaranteed to match its parent: + +```ts +// apps/execution-worker/src/database.ts (shipped) +await sql` + INSERT INTO execution_events (..., tenant_id, ...) + VALUES (..., (SELECT tenant_id FROM executions WHERE id = ${executionId}), ...) +`; +``` + +The cost is one indexed primary-key lookup per event insert. The benefit is that the worker never has to know tenancy exists. + +### Seam 4: SSE stream filter — tenant of caller must match tenant of execution (shipped) + +Before subscribing, `GET /:id/stream` checks the caller's tenant against the tenant stamped on the execution row. As noted in the responsibility-boundary section, this is defence-in-depth for EventSource's weaker auth, not the general per-resource guard. + +```ts +// apps/backend/src/routes/executions.ts (shipped) +const tenant = c.var.tenant; +if (tenant && execution.tenantId && execution.tenantId !== tenant.tenantId) { + // 404, byte-identical to the not-found branch, not 403. A distinct + // "belongs to another tenant" response would confirm the id exists in + // another tenant and make foreign executions enumerable. + return c.json({ code: 'execution_not_found', message: 'Execution not found' }, 404); +} +``` + +The mismatch response is deliberately a 404 identical to not-found. Returning a recognisable 403 would leak existence across tenants, the caller could probe ids and learn which executions live in other tenants. Indistinguishable-from-absent is the only safe answer on the isolation boundary. + +The `tenant && execution.tenantId &&` guards make the check a no-op when either side is `null`. That keeps the single-tenant reference behaving exactly as before, but it also means **untenanted rows (`tenant_id IS NULL`) are globally visible at the app layer**. For deployments where that is unacceptable, seam 5 (RLS) is the systematic fix; app-level checks alone are not enough. + +### Seam 5: Postgres Row-Level Security — defence in depth (documented pattern, not shipped) + +App-level scoping (`AuthPort` plus seams 2-4) is the primary enforcement. RLS is the safety net for the day a programmer forgets a `WHERE tenant_id` somewhere. It is **not** wired in the reference: enabling RLS under `NoopTenantContextPort` would compare a `NULL` session variable to a `NULL` column and silently return zero rows for every query, and the "right" policy shape (admin bypass? parent-tenant visibility? separate DB per tenant?) is consumer-specific. So it ships as a pattern, not code. + +```sql +-- consumer migration +ALTER TABLE executions ENABLE ROW LEVEL SECURITY; +ALTER TABLE execution_events ENABLE ROW LEVEL SECURITY; + +CREATE POLICY executions_tenant_isolation ON executions + USING (tenant_id = current_setting('app.current_tenant', true)); + +CREATE POLICY events_tenant_isolation ON execution_events + USING (tenant_id = current_setting('app.current_tenant', true)); +``` + +Each HTTP request opens a transaction and sets the tenant: + +```ts +// apps/backend/src/db/with-tenant.ts (consumer-authored) +export async function withTenantScope(tenantId: string, run: (tx: typeof database) => Promise): Promise { + return await database.transaction(async (tx) => { + await tx.execute(sql`SELECT set_config('app.current_tenant', ${tenantId}, true)`); + return await run(tx); + }); +} +``` + +Notes: + +- Migrations and superuser ops need a role with `BYPASSRLS`. +- `current_setting('app.current_tenant', true)` returns `NULL` when not set; policies should treat `NULL` as "no rows visible". This is exactly why RLS cannot be on by default in the single-tenant reference. +- The `pg_notify` payload must also carry tenant if the SSE dispatcher is to fan out per tenant without reloading the row. + +## Alternative options considered + +- **Bake tenancy into `execution-core`.** Rejected. The runner ships to consumers who may not be multi-tenant. Adding a `tenantId` field to `BaseNode` or `ExecutionContext` would force every adapter to know about tenants. The variables-bag transport keeps execution-core ignorant and the field optional. +- **Single inline `getTenant(request)` helper.** Rejected. A helper plus calls scattered across 17 routes is exactly the cross-cutting hack we want to avoid. A port plus middleware adds one indirection but survives swap-in scenarios (subdomain to JWT migration, multi-region tenant store changes) without touching every route. +- **Thread `tenantId` through the workflow to the event INSERT** (instead of the subquery in seam 3). Rejected. It re-introduces a per-call-site parameter the worker would have to carry and could forget, and couples the otherwise tenant-agnostic worker to tenancy. The subquery costs one indexed lookup per event and guarantees consistency with the parent row. +- **Per-tenant Temporal namespace.** Stronger isolation than RLS but multiplies operational cost and couples the application to Temporal. Out of scope. The port composes with namespace-per-tenant if a consumer chooses it. +- **Header-based vs subdomain-based tenant resolution.** Not our call. The port stays method-of-resolution-agnostic. + +## Consequences + +- **Pros** + - **Reference stays single-tenant** with the no-op default; cloning and running the reference needs zero tenancy ceremony. + - **Swap-one-line multi-tenancy for seams 1-4.** Wiring is shipped, not left as an exercise; the consumer replaces the port instance and implements seam 5. + - **The five-seam map.** Consumers know exactly where to wire and why, and which seam owns what (see responsibility boundary). + - **Independent of `AuthPort`.** `resolve()` takes the raw `Request`, so a tenant adapter never depends on auth having run, the two seams compose without coupling. A consumer whose tenant lives in a JWT claim parses it inside their own adapter. If they want to avoid parsing the token twice they can share it in their own combined middleware, but the reference does not couple the ports to do it for them. + - **No execution-core change.** `execution-core` ships to non-multi-tenant consumers unchanged. The worker stays tenant-agnostic via the subquery. + +- **Cons** + - **Identity propagation is not isolation.** Implementing the port does not make the backend safe to expose to multiple tenants; that needs a real `AuthPort` and (recommended) RLS. The seam map and responsibility boundary spell this out, but it is a foot-gun if skimmed. + - **Untenanted rows are app-layer visible.** The null-tolerant cross-check (required for single-tenant) means `tenant_id IS NULL` rows are not isolated until RLS is on. + - **Tenant tagging of events depends on the `executions.tenant_id` subquery.** One indexed lookup per event insert, traded for a worker that stays fully tenant-agnostic. A consumer who needs tenant inside executors opts into the `variables` transport (seam 2) and accepts its type-unsafe string convention. + +## Default: NoopTenantContextPort + +```ts +import { NoopTenantContextPort, type TenantContextPort } from './tenant'; + +const tenantPort: TenantContextPort = new NoopTenantContextPort(); +// resolves to null on every request — reference backend stays single-tenant +``` + +The reference routes consult the port through the wired seams, but with the no-op default every check degrades to its prior single-tenant behaviour. Switching to a real port is a consumer-side concern: implement the interface, point the `tenantPort` line at it, and enable seam 5. + +## Follow-ups + +- **AuthPort integration recipe**: document the JWT-claim pattern, including how a consumer can parse the token once in a combined middleware and share the result with both ports if they want to avoid double parsing. The ports stay request-only and independent by default. +- **Shipping a recipe RLS migration**: optionally provide the seam-5 policy as a starter under `apps/backend/migrations/recipes/multi-tenant.sql`, gated behind an env flag during `pnpm db:migrate`. +- **Temporal namespace-per-tenant**: separate decision log when a consumer needs stronger isolation than RLS provides. +- **Tenant context in `node_failed` payloads**: stamp tenant on event payloads so a sink can pivot logs by tenant without joining back to the executions table. + +## Status + +Accepted. Port plus `NoopTenantContextPort` default plus seams 1-4 wired as no-ops in the reference backend; seam 5 (RLS) documented as a consumer-enabled pattern. diff --git a/apps/execution-worker/src/database.ts b/apps/execution-worker/src/database.ts index b116186e9..8b2096172 100644 --- a/apps/execution-worker/src/database.ts +++ b/apps/execution-worker/src/database.ts @@ -8,7 +8,7 @@ const sql = postgres(env.DATABASE_URL); export const database = { async emitExecutionEvent(executionId: string, type: string, payload?: unknown, nodeId?: string) { await sql` - INSERT INTO execution_events (id, execution_id, sequence, timestamp, type, node_id, path_id, payload_json, created_at) + INSERT INTO execution_events (id, execution_id, sequence, timestamp, type, node_id, path_id, payload_json, tenant_id, created_at) VALUES ( gen_random_uuid(), ${executionId}, @@ -18,6 +18,7 @@ export const database = { ${nodeId ?? null}, ${null}, ${payload ? JSON.stringify(payload) : null}::jsonb, + (SELECT tenant_id FROM executions WHERE id = ${executionId}), now() ) `; From 8802bf8a256aa788c4b9fce04746a04ed4ded652 Mon Sep 17 00:00:00 2001 From: Kacper Cierzniewski Date: Tue, 9 Jun 2026 09:56:12 +0200 Subject: [PATCH 2/2] docs(backend): split TenantContextPort ADR from multi-tenancy wiring guide --- apps/backend/README.md | 2 +- apps/backend/multi-tenancy.md | 94 ++++++++ .../tenant-context-port.decision-log.md | 227 +++++------------- 3 files changed, 150 insertions(+), 173 deletions(-) create mode 100644 apps/backend/multi-tenancy.md diff --git a/apps/backend/README.md b/apps/backend/README.md index c5cd592c9..b430cac55 100644 --- a/apps/backend/README.md +++ b/apps/backend/README.md @@ -3,7 +3,7 @@ > ⚠️ **Reference implementation, local development only.** > No real authentication. The bundled `AllowAllAuthPort` permits every caller and every action (see `src/auth/`), and the constructor refuses to start without the explicit `WB_AUTH_PORT=allow-all` opt-in so a forgotten env var fails loudly. No tenant isolation. The HTTP server and the docker-compose services bind to `127.0.0.1` by default. Do not expose to the internet or shared networks without first plugging in a real `AuthPort`, see [`auth-port.decision-log.md`](./auth-port.decision-log.md) for the seam, default, and a JWT adapter sketch. > -> Seams for consumers to plug in: [`AuthPort`](./auth-port.decision-log.md) for authn/authz, [`TenantContextPort`](./tenant-context-port.decision-log.md) for multi-tenant identity propagation. +> Seams for consumers to plug in: [`AuthPort`](./auth-port.decision-log.md) for authn/authz, [`TenantContextPort`](./tenant-context-port.decision-log.md) for multi-tenant identity propagation (wiring guide: [`multi-tenancy.md`](./multi-tenancy.md)). > **Note:** setup is in [root README "Path B. Run the full stack demo"](../../README.md#path-b-run-the-full-stack-demo). This file documents the backend's internals, not how to start it. diff --git a/apps/backend/multi-tenancy.md b/apps/backend/multi-tenancy.md new file mode 100644 index 000000000..df69d486e --- /dev/null +++ b/apps/backend/multi-tenancy.md @@ -0,0 +1,94 @@ +# Multi-tenancy: wiring the TenantContextPort seams + +How to make the reference backend multi-tenant. For **why** it is shaped this way — the responsibility boundary, threat model, and rejected alternatives — read [`tenant-context-port.decision-log.md`](./tenant-context-port.decision-log.md) first. That document is stable; this one tracks the code, so signatures and file paths here are the live truth. + +> **The headline rule:** implementing `TenantContextPort` propagates tenant _identity_. It does **not** isolate tenants. Isolation is `AuthPort` (app layer, see [`auth-port.decision-log.md`](./auth-port.decision-log.md)) + RLS (DB layer, seam 5). All five seams are the job. + +## Seam status + +| Seam | What it does | State | Code | Tested | +| ---- | -------------------------------------------------------------------- | ----------------------------------- | --------------------------------------- | ------- | +| 1 | Resolve tenant once per request, stash on context | shipped (no-op default) | `src/tenant/middleware.ts` | ✅ | +| 2 | Stamp `tenantId` onto the execution row at submit | shipped (no-op default) | `src/routes/workflows.ts` | ✅ | +| 3 | Worker inherits `tenant_id` for each event from its parent execution | shipped (no-op default) | `apps/execution-worker/src/database.ts` | ⚠️ none | +| 4 | SSE stream cross-check (defence-in-depth) | shipped (no-op default) | `src/routes/executions.ts` | ✅ | +| 5 | Postgres Row-Level Security | **documented pattern, not shipped** | — | — | + +"No-op default" means: under `NoopTenantContextPort` the tenant is `null` on every request, every seam degrades to single-tenant behaviour, and the reference runs with zero tenancy ceremony. Swap the port instance to turn seams 1–4 on; enable seam 5 yourself. + +## Make it multi-tenant + +Implement the port and point one line at it: + +```ts +// src/server.ts — swap the no-op for your adapter +const tenantPort: TenantContextPort = new NoopTenantContextPort(); // ← your adapter here +app.use('/api/*', createAuthMiddleware(authPort)); +app.use('/api/*', createTenantMiddleware(tenantPort)); // after auth: handlers run under AuthVariables & TenantVariables +``` + +Your adapter implements one method; how it resolves the tenant (subdomain, cookie, JWT claim) is your call: + +```ts +interface TenantContextPort { + resolve(request: Request): Promise; +} +``` + +## The seams + +### Seam 1 — HTTP middleware: resolve once per request + +`createTenantMiddleware(port)` calls `port.resolve(c.req.raw)` once at the boundary and stashes the result under `c.var.tenant`. `requireTenant(c)` is a shipped convenience that returns a `400 tenant_required` for tenant-less callers — no reference route calls it (the single-tenant reference would always 400), but it ships tested so multi-tenant consumers do not re-derive the early-return idiom in every scoped route. + +→ `src/tenant/middleware.ts` · `src/tenant/tenant-context-port.ts` + +### Seam 2 — route: propagate tenant into the execution + +`POST /workflows/:id/execute` reads `c.var.tenant?.tenantId ?? null` and stamps it onto the execution row at submission. The worker reads it back from that row (seam 3), so the tenant is **not** threaded through the run. + +The reference deliberately does **not** inject the tenant into the engine `variables` bag — no reference executor consumes it, so that would be dead transport. If _your_ executors need the tenant at runtime, inject `variables: tenantId ? { tenantId } : {}` and read `context.variables.tenantId` inside the executor. `execution-core` never reads the field; it is opaque transport (and an untyped string convention — accept that trade-off). + +→ `src/routes/workflows.ts` + +### Seam 3 — persistence: every row carries `tenant_id` + +Migration `0001_odd_iron_lad` adds a **nullable** `text` `tenant_id` to `executions` and `execution_events`, each indexed. + +- **Nullable** on purpose: a `NOT NULL` column will not apply against existing rows without a backfill story the reference cannot assume. +- **`text`, not `uuid`**: `TenantContext.tenantId` is an opaque string (UUID or slug). A consumer who knows their ids are UUIDs can tighten it. + +The worker inherits each event's `tenant_id` from its parent execution via a correlated subquery **at INSERT time** (`SELECT tenant_id FROM executions WHERE id = …`), rather than threading the value through the workflow and every `emitEvent` call. This keeps the worker and the Temporal workflow tenant-agnostic. Cost: one indexed primary-key lookup per event insert. The match holds _at insert time_ — it is not a maintained invariant, and **there is no test exercising it yet** (see the decision log's Verification status and follow-ups). + +→ `apps/execution-worker/src/database.ts` · `drizzle/0001_odd_iron_lad.sql` + +### Seam 4 — SSE stream cross-check (defence-in-depth) + +Before subscribing, `GET /:id/stream` checks the caller's tenant against the tenant on the execution row. This is the one execution-row endpoint with a tenant guard, because EventSource's auth is weaker (no `Authorization` header) — see the decision log's responsibility boundary for why the other routes deliberately do not have it. + +On mismatch the response is a **404 byte-identical to not-found, not a 403** — a recognisable 403 would confirm the id exists in another tenant and make foreign executions enumerable. The check is a no-op when either side is `null`, preserving single-tenant behaviour (which is also why untenanted rows stay app-layer visible — the threat-model trade-off). + +→ `src/routes/executions.ts` + +### Seam 5 — Postgres Row-Level Security (you enable this) + +App-level scoping (`AuthPort` + seams 2–4) is the primary enforcement. RLS is the safety net for the day someone forgets a `WHERE tenant_id`. It is **not** shipped: under the no-op default a `NULL` session variable compared to a `NULL` column silently returns zero rows for every query, and the right policy (admin bypass? parent-tenant visibility? DB-per-tenant?) is consumer-specific. + +```sql +-- consumer migration +ALTER TABLE executions ENABLE ROW LEVEL SECURITY; +ALTER TABLE execution_events ENABLE ROW LEVEL SECURITY; + +CREATE POLICY executions_tenant_isolation ON executions + USING (tenant_id = current_setting('app.current_tenant', true)); +CREATE POLICY events_tenant_isolation ON execution_events + USING (tenant_id = current_setting('app.current_tenant', true)); +``` + +Each request opens a transaction and sets `app.current_tenant` before querying (a `withTenantScope(tenantId, run)` helper wrapping `database.transaction` + `SELECT set_config(...)`). This is **not yet shipped or tested** — treat it as a sketch until the recipe lands (decision-log follow-up); do not paste it into production untested. + +Gotchas the code cannot teach you: + +- **`current_setting('app.current_tenant', true)` returns `NULL` when unset, and `NULL = NULL` matches nothing** — a request that forgets to set the tenant sees zero rows, not all rows. This is the exact reason RLS cannot be on by default in the single-tenant reference. +- Migrations and superuser ops need a role with `BYPASSRLS`. +- If the SSE dispatcher should fan out per tenant without reloading the row, the `pg_notify` payload must also carry the tenant. diff --git a/apps/backend/tenant-context-port.decision-log.md b/apps/backend/tenant-context-port.decision-log.md index 7990d8a0e..ef937c2d8 100644 --- a/apps/backend/tenant-context-port.decision-log.md +++ b/apps/backend/tenant-context-port.decision-log.md @@ -2,211 +2,94 @@ ### Proposed by: Kacper Cierzniewski -### Date: 21.05.2026 +### Proposed: 21.05.2026 — Landed: 03.06.2026 (`fa5999dd`) + +> This is the **decision** (why this shape, what was rejected, what it does and does not protect). The **how-to-wire-it** lives in [`multi-tenancy.md`](./multi-tenancy.md) — that document tracks the code and is the source of truth for current signatures and per-seam status. If a snippet here ever disagrees with the code, the code wins. ## Context -The reference backend (`apps/backend` + `apps/execution-worker`) ships **single-tenant**. Real-world multi-tenant deployments (the typical enterprise consumer) need to propagate a tenant identifier from the incoming HTTP request, through workflow submission, into worker activities, and back into event emission and SSE streams. +The reference backend (`apps/backend` + `apps/execution-worker`) ships **single-tenant**. Real multi-tenant consumers need a tenant identifier to flow from the incoming HTTP request, through workflow submission, into worker activities, and back into event emission and SSE streams. -Without an explicit seam, multi-tenancy ends up as a cross-cutting hack scattered across the codebase: a `getTenantFromRequest(req)` helper called in 17 routes, a hand-rolled middleware, a `tenant_id` parameter quietly threaded through every executor, a `WHERE tenant_id = ?` clause every consumer remembers to add or forgets. Mistakes here are exactly the canonical multi-tenancy bug, one tenant reading another's data. +Without an explicit seam, multi-tenancy degrades into a cross-cutting hack: a `getTenantFromRequest(req)` helper called in 17 routes, a `tenant_id` quietly threaded through every executor, a `WHERE tenant_id = ?` clause every consumer remembers to add or forgets. Forgetting it once is the canonical multi-tenancy bug — one tenant reading another's data. ## Decision -Introduce `TenantContextPort` (`apps/backend/src/tenant/`), an interface with a single method `resolve(request)` that returns `TenantContext | null`, plus a default `NoopTenantContextPort` that resolves to `null` on every request. - -**What actually ships (not docs, runtime code):** the reference backend wires four of the five seams below as no-ops. With `NoopTenantContextPort` the tenant is `null` on every request, so every wired seam degrades to its current single-tenant behaviour. A consumer makes the backend multi-tenant by swapping one line (the port instance), not by re-implementing the wiring. Seam 5 (Postgres RLS) is documented as a pattern only, because no sensible default policy exists across deployments (see its section). - -The port: +Introduce `TenantContextPort` (`apps/backend/src/tenant/`): an interface with a single method `resolve(request) → TenantContext | null`, plus a default `NoopTenantContextPort` that resolves to `null` on every request. A consumer makes the backend multi-tenant by swapping one line (the port instance), not by re-implementing the wiring. -- Lives in `apps/backend`, not in `packages/execution-core`. `execution-core` stays tenant-agnostic. If a consumer's node executors need tenant at runtime, the identifier is available to inject via the existing `ExecutionContext.variables` shape; the reference itself does not inject it (see seam 2). -- Ships with a `NoopTenantContextPort` default so the reference backend stays single-tenant out of the box, no enforcement, no ceremony. -- Is meant to be replaced by the consumer for any multi-tenant deployment. +The port is small on purpose; the **wiring around it (the five seams) is the actual contract**, documented in [`multi-tenancy.md`](./multi-tenancy.md). Two structural commitments are load-bearing and stated here because the code does not enforce them: -The port itself is small on purpose. The wiring around it (the seams) is the actual contract, and it is where the value is. +- **The port lives in `apps/backend`, not `packages/execution-core`.** `execution-core` stays tenant-agnostic. If a consumer's executors need the tenant at runtime, it rides the existing opaque `ExecutionContext.variables` bag; the reference never injects it. +- **`resolve()` takes the raw `Request`**, so a tenant adapter never depends on auth having run. The two seams compose without coupling. ## Responsibility boundary: TenantContextPort vs AuthPort -The backend has two identity seams and they own different things. Getting this split wrong is how the cross-tenant bug sneaks back in, so it is stated explicitly. - -- **AuthPort owns per-resource authorization, including tenant ownership.** Every scoped route calls `assertAuthorized(c, action, { kind: 'execution', executionId })` before touching data. A real multi-tenant `AuthPort` checks that the caller may act on that specific resource, which in a multi-tenant deployment means "belongs to the caller's tenant". This is the single, systematic place resource scoping lives. Sprinkling `if (row.tenantId !== caller.tenantId)` into every route is the shotgun anti-pattern this design exists to avoid. -- **TenantContextPort owns identity propagation**, resolving the tenant once at the HTTP boundary and carrying it onto the execution row and (denormalised) event rows, so reads, the worker, and RLS can all scope by it. -- **The SSE stream cross-check is the one deliberate exception**, and it is defence-in-depth, not the general guard. `GET /:id/stream` cannot rely on a bearer token because EventSource cannot send an `Authorization` header, so its auth falls back to weaker query-param or cookie schemes. The tenant resolved by `TenantContextPort` (subdomain, cookie) gives an independent second check on exactly that weak path. `GET /:id` and `DELETE /:id` carry no such cross-check on purpose, they rely on `AuthPort` plus RLS. - -Consequence a consumer must internalise: implementing `TenantContextPort` alone does **not** isolate tenants. It propagates identity. Isolation is `AuthPort` (app layer) plus RLS (DB layer). The seam map below is the full job. - -## Integration patterns - -Seams 1-4 below are shipped and wired in the reference (no-op under `NoopTenantContextPort`). Seam 5 is a documented pattern the consumer enables. - -### Seam 1: HTTP middleware — resolve once per request (shipped) - -`createTenantMiddleware` calls `tenantPort.resolve(c.req.raw)` once at the boundary and stashes the result on the request context. `requireTenant` is a shipped convenience for routes that must reject tenant-less callers. No reference route calls `requireTenant` today (the reference is single-tenant, so it would always 400); it ships and is tested because multi-tenant consumers call it from their scoped routes, and shipping plus testing it is cheaper than every consumer re-deriving the same early-return idiom. - -```ts -// apps/backend/src/tenant/middleware.ts (shipped) -export function createTenantMiddleware(port: TenantContextPort): MiddlewareHandler<{ Variables: TenantVariables }> { - return async (c, next) => { - c.set('tenant', await port.resolve(c.req.raw)); - await next(); - }; -} - -export function requireTenant(c: Context<{ Variables: TenantVariables }>): TenantContext | Response { - const tenant = c.var.tenant; - if (!tenant) { - return c.json({ code: 'tenant_required', message: 'Tenant context required' }, 400); - } - return tenant; -} -``` - -Wired in `server.ts` after the auth middleware (so route handlers run under `AuthVariables & TenantVariables`): - -```ts -const tenantPort: TenantContextPort = new NoopTenantContextPort(); // swap for a real adapter -app.use('/api/*', createAuthMiddleware(authPort)); -app.use('/api/*', createTenantMiddleware(tenantPort)); -``` - -### Seam 2: route — propagate tenant into the execution (shipped) - -`POST /workflows/:id/execute` reads the resolved tenant and stamps it onto the execution row at submission time. `null` in single-tenant mode. The worker reads it back from that row via subquery for event tagging (seam 3), so it never needs the value threaded through the run. - -```ts -// apps/backend/src/routes/workflows.ts (shipped) -const tenantId = c.var.tenant?.tenantId ?? null; - -await database.insert(executions).values({ /* … */, tenantId }).returning(); - -await getWorkflowEngine().submit({ - workflowId, - executionId: execution.id, - definition, - triggerPayload: body.triggerPayload ?? {}, - variables: {}, // reference does NOT inject tenantId here, see below - global: {}, -}); -``` +The backend has two identity seams and they own different things. **Getting this split wrong is how the cross-tenant bug sneaks back in**, so it is stated explicitly — the code cannot express it. -**Consumer extension (not shipped):** if a consumer's node executors need the tenant at runtime, inject it into the engine `variables` bag (`variables: tenantId ? { tenantId } : {}`) and read it inside the executor as `context.variables.tenantId`. `execution-core` never reads the field, it is opaque transport. The reference omits this because no reference executor consumes it, shipping it would be dead transport. +- **AuthPort owns per-resource authorization, including tenant ownership.** Every scoped route calls `assertAuthorized(c, action, resource)` before touching data. In a multi-tenant deployment "may act on this resource" _means_ "belongs to the caller's tenant". This is the single, systematic place resource scoping lives. Sprinkling `if (row.tenantId !== caller.tenantId)` into every route is the shotgun anti-pattern this design exists to avoid. +- **TenantContextPort owns identity propagation** — resolve the tenant once at the HTTP boundary, carry it onto the execution row and (denormalised) event rows, so reads, the worker, and RLS can all scope by it. **Propagation is not isolation.** -### Seam 3: persistence — every row carries `tenant_id` (shipped) +The SSE stream cross-check (seam 4) is the **one deliberate exception** — defence-in-depth, not the general guard. `GET /:id/stream` cannot use a bearer token (EventSource sends no `Authorization` header), so its auth falls back to weaker query-param/cookie schemes; the tenant resolved independently by `TenantContextPort` gives a second check on exactly that weak path. `GET /:id` and `DELETE /:id` carry no such cross-check on purpose — they rely on `AuthPort` plus RLS. -A migration adds a **nullable** `text` `tenant_id` to `executions` and `execution_events` plus an index on each. Nullable on purpose: the reference and any pre-existing rows leave it `NULL`, and a `NOT NULL` column would refuse to apply against existing data without a backfill story the reference cannot assume. `text`, not `uuid`: `TenantContext.tenantId` is an opaque string (UUID or slug), so the column must store whatever the consumer's port returns. A consumer who knows their tenants are UUIDs can tighten it. +**Consequence a consumer must internalise:** implementing `TenantContextPort` alone does **not** isolate tenants. Isolation is `AuthPort` (app layer) plus RLS (DB layer). The seam map is the full job. -```sql --- apps/backend/drizzle/0001_odd_iron_lad.sql (shipped) -ALTER TABLE "execution_events" ADD COLUMN "tenant_id" text; -ALTER TABLE "executions" ADD COLUMN "tenant_id" text; -CREATE INDEX "execution_events_tenant_id_idx" ON "execution_events" ("tenant_id"); -CREATE INDEX "executions_tenant_id_idx" ON "executions" ("tenant_id"); -``` +## Threat model -The worker inherits `tenant_id` for each event row from the parent execution via a correlated subquery at INSERT time, rather than threading `tenantId` through the workflow and every `emitEvent` call. This keeps the worker and the Temporal workflow fully tenant-agnostic (no new parameter, no per-call-site leak) and makes the event's tenant DB-guaranteed to match its parent: +What this design defends against, and what it deliberately does not — so a reviewer can tell an accepted risk from an oversight. -```ts -// apps/execution-worker/src/database.ts (shipped) -await sql` - INSERT INTO execution_events (..., tenant_id, ...) - VALUES (..., (SELECT tenant_id FROM executions WHERE id = ${executionId}), ...) -`; -``` - -The cost is one indexed primary-key lookup per event insert. The benefit is that the worker never has to know tenancy exists. - -### Seam 4: SSE stream filter — tenant of caller must match tenant of execution (shipped) - -Before subscribing, `GET /:id/stream` checks the caller's tenant against the tenant stamped on the execution row. As noted in the responsibility-boundary section, this is defence-in-depth for EventSource's weaker auth, not the general per-resource guard. - -```ts -// apps/backend/src/routes/executions.ts (shipped) -const tenant = c.var.tenant; -if (tenant && execution.tenantId && execution.tenantId !== tenant.tenantId) { - // 404, byte-identical to the not-found branch, not 403. A distinct - // "belongs to another tenant" response would confirm the id exists in - // another tenant and make foreign executions enumerable. - return c.json({ code: 'execution_not_found', message: 'Execution not found' }, 404); -} -``` - -The mismatch response is deliberately a 404 identical to not-found. Returning a recognisable 403 would leak existence across tenants, the caller could probe ids and learn which executions live in other tenants. Indistinguishable-from-absent is the only safe answer on the isolation boundary. - -The `tenant && execution.tenantId &&` guards make the check a no-op when either side is `null`. That keeps the single-tenant reference behaving exactly as before, but it also means **untenanted rows (`tenant_id IS NULL`) are globally visible at the app layer**. For deployments where that is unacceptable, seam 5 (RLS) is the systematic fix; app-level checks alone are not enough. - -### Seam 5: Postgres Row-Level Security — defence in depth (documented pattern, not shipped) - -App-level scoping (`AuthPort` plus seams 2-4) is the primary enforcement. RLS is the safety net for the day a programmer forgets a `WHERE tenant_id` somewhere. It is **not** wired in the reference: enabling RLS under `NoopTenantContextPort` would compare a `NULL` session variable to a `NULL` column and silently return zero rows for every query, and the "right" policy shape (admin bypass? parent-tenant visibility? separate DB per tenant?) is consumer-specific. So it ships as a pattern, not code. - -```sql --- consumer migration -ALTER TABLE executions ENABLE ROW LEVEL SECURITY; -ALTER TABLE execution_events ENABLE ROW LEVEL SECURITY; - -CREATE POLICY executions_tenant_isolation ON executions - USING (tenant_id = current_setting('app.current_tenant', true)); - -CREATE POLICY events_tenant_isolation ON execution_events - USING (tenant_id = current_setting('app.current_tenant', true)); -``` - -Each HTTP request opens a transaction and sets the tenant: - -```ts -// apps/backend/src/db/with-tenant.ts (consumer-authored) -export async function withTenantScope(tenantId: string, run: (tx: typeof database) => Promise): Promise { - return await database.transaction(async (tx) => { - await tx.execute(sql`SELECT set_config('app.current_tenant', ${tenantId}, true)`); - return await run(tx); - }); -} -``` - -Notes: - -- Migrations and superuser ops need a role with `BYPASSRLS`. -- `current_setting('app.current_tenant', true)` returns `NULL` when not set; policies should treat `NULL` as "no rows visible". This is exactly why RLS cannot be on by default in the single-tenant reference. -- The `pg_notify` payload must also carry tenant if the SSE dispatcher is to fan out per tenant without reloading the row. +- **Defended:** a tenant reading or streaming another tenant's executions/events through the API, once a real `AuthPort` is plugged in (app layer) and RLS is enabled (DB layer). Enumeration of foreign execution ids is closed by the 404-not-403 choice on the stream boundary (a recognisable 403 would confirm the id exists in another tenant; an indistinguishable 404 does not). +- **Out of scope (accepted, not overlooked):** + - **Untenanted rows (`tenant_id IS NULL`) are globally visible at the app layer.** The null-tolerant cross-check required for the single-tenant reference means these rows are not isolated until RLS is on. Acceptable because the reference is single-tenant; a multi-tenant consumer must enable RLS and backfill. + - **An insider or process with direct Postgres access.** RLS raises the bar but the app trusts its DB session; this is not a defence against a compromised database role. + - **Side-channel / timing inference.** Response-time differences between tenanted paths are not equalised. ## Alternative options considered -- **Bake tenancy into `execution-core`.** Rejected. The runner ships to consumers who may not be multi-tenant. Adding a `tenantId` field to `BaseNode` or `ExecutionContext` would force every adapter to know about tenants. The variables-bag transport keeps execution-core ignorant and the field optional. -- **Single inline `getTenant(request)` helper.** Rejected. A helper plus calls scattered across 17 routes is exactly the cross-cutting hack we want to avoid. A port plus middleware adds one indirection but survives swap-in scenarios (subdomain to JWT migration, multi-region tenant store changes) without touching every route. -- **Thread `tenantId` through the workflow to the event INSERT** (instead of the subquery in seam 3). Rejected. It re-introduces a per-call-site parameter the worker would have to carry and could forget, and couples the otherwise tenant-agnostic worker to tenancy. The subquery costs one indexed lookup per event and guarantees consistency with the parent row. -- **Per-tenant Temporal namespace.** Stronger isolation than RLS but multiplies operational cost and couples the application to Temporal. Out of scope. The port composes with namespace-per-tenant if a consumer chooses it. -- **Header-based vs subdomain-based tenant resolution.** Not our call. The port stays method-of-resolution-agnostic. +- **Bake tenancy into `execution-core`.** Rejected. The runner ships to consumers who may not be multi-tenant; a `tenantId` on `BaseNode`/`ExecutionContext` would force every adapter to know about tenants. The variables-bag transport keeps execution-core ignorant and the field optional. +- **A single inline `getTenant(request)` helper.** Rejected. A helper called across 17 routes is exactly the cross-cutting hack to avoid. A port plus middleware adds one indirection but survives swap-in scenarios (subdomain → JWT, multi-region tenant store) without touching every route. +- **Thread `tenantId` through the workflow to the event INSERT** (instead of the seam-3 subquery). Rejected. Re-introduces a per-call-site parameter the worker must carry and could forget, coupling the otherwise tenant-agnostic worker to tenancy. The subquery costs one indexed lookup per event and ties the event's tenant to its parent at insert time. +- **Per-tenant Temporal namespace.** Stronger isolation than RLS but multiplies operational cost and couples the application to Temporal. Out of scope; the port composes with it if a consumer chooses it. +- **Header-based vs subdomain-based resolution.** Not our call. The port stays resolution-method-agnostic. ## Consequences - **Pros** - - **Reference stays single-tenant** with the no-op default; cloning and running the reference needs zero tenancy ceremony. - - **Swap-one-line multi-tenancy for seams 1-4.** Wiring is shipped, not left as an exercise; the consumer replaces the port instance and implements seam 5. - - **The five-seam map.** Consumers know exactly where to wire and why, and which seam owns what (see responsibility boundary). - - **Independent of `AuthPort`.** `resolve()` takes the raw `Request`, so a tenant adapter never depends on auth having run, the two seams compose without coupling. A consumer whose tenant lives in a JWT claim parses it inside their own adapter. If they want to avoid parsing the token twice they can share it in their own combined middleware, but the reference does not couple the ports to do it for them. - - **No execution-core change.** `execution-core` ships to non-multi-tenant consumers unchanged. The worker stays tenant-agnostic via the subquery. - + - Reference stays single-tenant with the no-op default; cloning and running it needs zero tenancy ceremony. + - Swap-one-line multi-tenancy for seams 1–4 — wiring is shipped, not left as an exercise; the consumer replaces the port instance and enables seam 5. + - A five-seam map tells consumers exactly where to wire and which seam owns what. + - No `execution-core` change; the worker stays tenant-agnostic. - **Cons** - - **Identity propagation is not isolation.** Implementing the port does not make the backend safe to expose to multiple tenants; that needs a real `AuthPort` and (recommended) RLS. The seam map and responsibility boundary spell this out, but it is a foot-gun if skimmed. - - **Untenanted rows are app-layer visible.** The null-tolerant cross-check (required for single-tenant) means `tenant_id IS NULL` rows are not isolated until RLS is on. - - **Tenant tagging of events depends on the `executions.tenant_id` subquery.** One indexed lookup per event insert, traded for a worker that stays fully tenant-agnostic. A consumer who needs tenant inside executors opts into the `variables` transport (seam 2) and accepts its type-unsafe string convention. + - **Identity propagation is not isolation** — the headline foot-gun if the responsibility boundary is skimmed. + - **Untenanted rows are app-layer visible** until RLS is on (see threat model). + - Event tenant-tagging depends on the `executions.tenant_id` subquery — one indexed lookup per event insert, traded for a worker that stays fully tenant-agnostic. + +## Verification status + +What is enforced by a test versus what is a contract only on paper. Honesty here matters more than completeness — an untested seam is a claim, not a guarantee. -## Default: NoopTenantContextPort +| Seam | Behaviour under test | Status | +| -------------------------------------------- | --------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- | +| 1 — middleware `resolve` / `requireTenant` | resolves once onto context; noop yields `null` | ✅ `tenant/middleware.test.ts`, `tenant/noop-tenant-context-port.test.ts` | +| 2 — execute stamps tenant on the row | `tenantId` written to the execution row, `null` in single-tenant mode, never into `variables` | ✅ `routes/workflows.test.ts` | +| 4 — SSE stream cross-check | 404-not-403 on mismatch (no existence leak); streams on match; no-op when either side `null` | ✅ `routes/executions.test.ts` | +| 3 — worker inherits `tenant_id` via subquery | event row's `tenant_id` matches its parent execution | ⚠️ **Not tested.** `execution-worker` has no test exercising `src/database.ts`. Paper contract — see follow-ups. | +| 5 — Postgres RLS | — | Not shipped; documented pattern in [`multi-tenancy.md`](./multi-tenancy.md). | -```ts -import { NoopTenantContextPort, type TenantContextPort } from './tenant'; +## Revisit triggers -const tenantPort: TenantContextPort = new NoopTenantContextPort(); -// resolves to null on every request — reference backend stays single-tenant -``` +This decision stands until one of these fires: -The reference routes consult the port through the wired seams, but with the no-op default every check degrades to its prior single-tenant behaviour. Switching to a real port is a consumer-side concern: implement the interface, point the `tenantPort` line at it, and enable seam 5. +- A real multi-tenant consumer proves the `variables`-bag transport for tenant-in-executor is too type-unsafe → reconsider a typed `tenantId` on `ExecutionContext` (reopens "bake into execution-core"). +- The reference itself goes multi-tenant, making globally-visible untenanted rows unacceptable → ship RLS plus a `NOT NULL` backfill, flipping seam 5 from pattern to code. +- The per-event subquery shows up in profiling at scale → revisit threading tenant through the worker. +- A consumer needs stronger isolation than RLS → open a namespace-per-tenant decision log. ## Follow-ups -- **AuthPort integration recipe**: document the JWT-claim pattern, including how a consumer can parse the token once in a combined middleware and share the result with both ports if they want to avoid double parsing. The ports stay request-only and independent by default. -- **Shipping a recipe RLS migration**: optionally provide the seam-5 policy as a starter under `apps/backend/migrations/recipes/multi-tenant.sql`, gated behind an env flag during `pnpm db:migrate`. -- **Temporal namespace-per-tenant**: separate decision log when a consumer needs stronger isolation than RLS provides. -- **Tenant context in `node_failed` payloads**: stamp tenant on event payloads so a sink can pivot logs by tenant without joining back to the executions table. +- **Add a worker-side test for seam 3** (the subquery inheritance) — close the paper-contract gap flagged above. +- **AuthPort integration recipe**: the JWT-claim pattern, including parsing the token once in a combined middleware shared by both ports. +- **Ship a tested RLS recipe** under `apps/backend/migrations/recipes/multi-tenant.sql` (policy + a `withTenantScope` transaction helper), gated behind an env flag during `pnpm db:migrate`. Until it exists and is tested, seam 5 stays a documented pattern, not shipped code. +- **Tenant in `node_failed` payloads**: stamp tenant on event payloads so a sink can pivot logs by tenant without joining back to `executions`. ## Status -Accepted. Port plus `NoopTenantContextPort` default plus seams 1-4 wired as no-ops in the reference backend; seam 5 (RLS) documented as a consumer-enabled pattern. +Accepted. Port + `NoopTenantContextPort` default + seams 1–4 wired as no-ops in the reference backend (seam 3 untested — see Verification status); seam 5 (RLS) documented as a consumer-enabled pattern in [`multi-tenancy.md`](./multi-tenancy.md).