From 73888e1e090970c42576cc38ea20e05ef151e92c Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Wed, 1 Jul 2026 16:20:01 +0100 Subject: [PATCH 01/14] feat(run-ops): webapp db topology, flags, migration helpers + store/engine wiring Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/webapp/CLAUDE.md | 4 +- apps/webapp/app/db.server.ts | 305 +++++++- apps/webapp/app/entry.server.tsx | 13 +- apps/webapp/app/env.server.ts | 75 +- .../app/models/runtimeEnvironment.server.ts | 21 +- apps/webapp/app/v3/engineVersion.server.ts | 5 + .../app/v3/eventRepository/index.server.ts | 31 +- apps/webapp/app/v3/featureFlags.ts | 7 + apps/webapp/app/v3/runEngine.server.ts | 9 + .../webapp/app/v3/runEngineHandlers.server.ts | 462 +++++------- .../app/v3/runEngineHandlersShared.server.ts | 225 ++++++ .../controlPlaneCache.server.ts | 180 +++++ .../controlPlaneResolver.server.ts | 446 ++++++++++++ .../runOpsMigration/crossSeamGuard.server.ts | 102 +++ .../distinctDbSentinel.server.ts | 55 ++ .../knownMigratedFilter.server.test.ts | 133 ++++ .../knownMigratedFilter.server.ts | 138 ++++ .../mintBatchFriendlyId.server.test.ts | 168 +++++ .../mintBatchFriendlyId.server.ts | 54 ++ .../readThrough.server.test.ts | 216 ++++++ .../v3/runOpsMigration/readThrough.server.ts | 114 +++ .../resolveInheritedMintKind.server.test.ts | 64 ++ .../resolveInheritedMintKind.server.ts | 21 + .../runEngineControlPlaneResolver.server.ts | 97 +++ .../runOpsCascadeCleanup.server.ts | 275 +++++++ .../runOpsMintKind.flipLatency.test.ts | 75 ++ .../runOpsMintKind.server.test.ts | 61 ++ .../runOpsMigration/runOpsMintKind.server.ts | 84 +++ .../v3/runOpsMigration/runOpsSplitReadGate.ts | 14 + .../v3/runOpsMigration/splitMode.server.ts | 61 ++ apps/webapp/app/v3/runOpsMigration/types.ts | 25 + .../v3/runOpsMigration/unblockRouteCatalog.ts | 96 +++ apps/webapp/app/v3/runStore.server.test.ts | 267 +++++++ apps/webapp/app/v3/runStore.server.ts | 126 +++- .../app/v3/taskRunHeartbeatFailed.server.ts | 27 +- 
apps/webapp/package.json | 2 + ...findEnvironmentFromRun.readthrough.test.ts | 135 ++++ ...teLoaders.controlPlane.readthrough.test.ts | 164 +++++ apps/webapp/test/runEngineHandlers.test.ts | 674 ++++++++++++++++++ apps/webapp/test/runOpsCrossSeamGuard.test.ts | 134 ++++ apps/webapp/test/runOpsDbTopology.test.ts | 121 ++++ apps/webapp/test/runOpsMintCutover.test.ts | 193 +++++ apps/webapp/test/runOpsSplitMode.test.ts | 95 +++ apps/webapp/test/runOpsSplitReadGate.test.ts | 75 ++ .../services.controlPlane.readthrough.test.ts | 115 +++ ...ilLoaders.controlPlane.readthrough.test.ts | 176 +++++ .../controlPlaneRepoint.server.test.ts | 213 ++++++ .../controlPlaneResolver.server.test.ts | 609 ++++++++++++++++ ...nEngineControlPlaneResolver.server.test.ts | 194 +++++ apps/webapp/vitest.config.ts | 13 +- 50 files changed, 6640 insertions(+), 329 deletions(-) create mode 100644 apps/webapp/app/v3/runEngineHandlersShared.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/distinctDbSentinel.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.test.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/readThrough.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts create mode 100644 
apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/runOpsCascadeCleanup.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/runOpsMintKind.flipLatency.test.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.test.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/runOpsSplitReadGate.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/splitMode.server.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/types.ts create mode 100644 apps/webapp/app/v3/runOpsMigration/unblockRouteCatalog.ts create mode 100644 apps/webapp/app/v3/runStore.server.test.ts create mode 100644 apps/webapp/test/findEnvironmentFromRun.readthrough.test.ts create mode 100644 apps/webapp/test/routeLoaders.controlPlane.readthrough.test.ts create mode 100644 apps/webapp/test/runEngineHandlers.test.ts create mode 100644 apps/webapp/test/runOpsCrossSeamGuard.test.ts create mode 100644 apps/webapp/test/runOpsDbTopology.test.ts create mode 100644 apps/webapp/test/runOpsMintCutover.test.ts create mode 100644 apps/webapp/test/runOpsSplitMode.test.ts create mode 100644 apps/webapp/test/runOpsSplitReadGate.test.ts create mode 100644 apps/webapp/test/services.controlPlane.readthrough.test.ts create mode 100644 apps/webapp/test/shape1RunDetailLoaders.controlPlane.readthrough.test.ts create mode 100644 apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts create mode 100644 apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts create mode 100644 apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts diff --git a/apps/webapp/CLAUDE.md b/apps/webapp/CLAUDE.md index a4de6ab57b7..68efaffd41e 100644 --- a/apps/webapp/CLAUDE.md +++ b/apps/webapp/CLAUDE.md @@ -75,8 +75,8 @@ const signal = getRequestAbortSignal(); Access via `env` export from 
`app/env.server.ts`. **Never use `process.env` directly.** For testable code, **never import env.server.ts** in test files. Pass configuration as options instead: -- `realtimeClient.server.ts` (testable service, takes config as constructor arg) -- `realtimeClientGlobal.server.ts` (creates singleton with env config) +- `realtime/nativeRealtimeClient.server.ts` (testable service, takes config as constructor arg) +- `realtime/nativeRealtimeClientInstance.server.ts` (creates singleton with env config) ## Run Engine 2.0 diff --git a/apps/webapp/app/db.server.ts b/apps/webapp/app/db.server.ts index 09c30b92568..179f9976205 100644 --- a/apps/webapp/app/db.server.ts +++ b/apps/webapp/app/db.server.ts @@ -7,6 +7,7 @@ import { type PrismaTransactionClient, type PrismaTransactionOptions, } from "@trigger.dev/database"; +import { RunOpsPrismaClient } from "@internal/run-ops-database"; import invariant from "tiny-invariant"; import { z } from "zod"; import { env } from "./env.server"; @@ -18,6 +19,8 @@ import { logTransactionInfrastructureError, } from "./utils/prismaErrors"; import { singleton } from "./utils/singleton"; +import { isSplitEnabled } from "./v3/runOpsMigration/splitMode.server"; +import { computeRunOpsSplitReadEnabled } from "./v3/runOpsMigration/runOpsSplitReadGate"; import { DATASOURCE_CONTEXT_KEY, startActiveSpan } from "./v3/tracer.server"; import type { Span } from "@opentelemetry/api"; import { context, trace } from "@opentelemetry/api"; @@ -130,6 +133,32 @@ function tagDatasource(datasource: "writer" | "replica", }) as unknown as T; } +// Same extension as tagDatasource but typed for RunOpsPrismaClient (different +// generated package — does not extend @trigger.dev/database.PrismaClient). 
+function tagDatasourceRunOps( + datasource: "writer" | "replica", + client: RunOpsPrismaClient +): RunOpsPrismaClient { + return client.$extends({ + name: "datasource-tagger", + query: { + $allOperations: ({ query, args }) => { + trace.getActiveSpan()?.setAttribute("db.datasource", datasource); + return context.with( + context.active().setValue(DATASOURCE_CONTEXT_KEY, datasource), + async () => await query(args) + ); + }, + }, + }) as unknown as RunOpsPrismaClient; +} + +// Same wrapper as captureInfrastructureErrors, bridged via double cast because +// that helper is constrained to T extends @trigger.dev/database.PrismaClient. +function captureInfraErrorsRunOps(client: RunOpsPrismaClient): RunOpsPrismaClient { + return captureInfrastructureErrors(client as unknown as PrismaClient) as unknown as RunOpsPrismaClient; +} + export const prisma = singleton("prisma", () => captureInfrastructureErrors(tagDatasource("writer", getClient())) ); @@ -139,11 +168,146 @@ export const $replica: PrismaReplicaClient = singleton("replica", () => { return replica ? captureInfrastructureErrors(tagDatasource("replica", replica)) : prisma; }); +export type RunOpsClients = { writer: PrismaClient; replica: PrismaReplicaClient }; +export type NewRunOpsClients = { writer: RunOpsPrismaClient; replica: RunOpsPrismaClient }; +export type RunOpsTopology = { + newRunOps: NewRunOpsClients; + legacyRunOps: RunOpsClients; + controlPlane: RunOpsClients; +}; +export type SelectRunOpsTopologyConfig = { + splitEnabled: boolean; + legacyUrl?: string; + newUrl?: string; + newReplicaUrl?: string; +}; +export type RunOpsClientBuilders = { + controlPlane: RunOpsClients; + buildNewWriter: (url: string, clientType: string) => RunOpsPrismaClient; + buildNewReplica: (url: string, clientType: string) => RunOpsPrismaClient; +}; + +// Pure run-ops client selector. No env, no isSplitEnabled() — those +// belong in the env-bound singleton (see runOpsTopology below). 
The builder +// callbacks are the only side-effecting boundary, so split-OFF (the default) +// calls NEITHER and opens no second connection. +export function selectRunOpsTopology( + config: SelectRunOpsTopologyConfig, + builders: RunOpsClientBuilders +): RunOpsTopology { + const { controlPlane } = builders; + + const cpFallback: NewRunOpsClients = { + writer: controlPlane.writer as unknown as RunOpsPrismaClient, + replica: controlPlane.replica as unknown as RunOpsPrismaClient, + }; + + if (!config.splitEnabled) { + return { newRunOps: cpFallback, legacyRunOps: controlPlane, controlPlane }; + } + + if (!config.legacyUrl || !config.newUrl) { + return { newRunOps: cpFallback, legacyRunOps: controlPlane, controlPlane }; + } + + const legacyRunOps = controlPlane; + + const newWriter = builders.buildNewWriter(config.newUrl, "run-ops-new-writer"); + const newReplica: RunOpsPrismaClient = config.newReplicaUrl + ? builders.buildNewReplica(config.newReplicaUrl, "run-ops-new-reader") + : newWriter; + + return { + newRunOps: { writer: newWriter, replica: newReplica }, + legacyRunOps, + controlPlane, + }; +} + +// The env-bound run-ops topology singleton. The split decision uses +// a cheap synchronous env predicate (governs whether a second pool is opened); +// the async distinct-DB sentinel is enforced separately at boot via +// assertRunOpsSplitSentinel(). Because the builder callbacks only run when +// splitEnabled is true, single-DB reuses prisma/$replica by reference and opens +// nothing new. The builders apply the SAME wrapper pair the control-plane +// singletons use (captureInfrastructureErrors(tagDatasource(role, raw))). 
+const runOpsTopology: RunOpsTopology = singleton("runOpsTopology", () => { + const newUrl = env.TASK_RUN_DATABASE_URL; + const splitEnabled = !!newUrl && !!env.TASK_RUN_LEGACY_DATABASE_URL; + + return selectRunOpsTopology( + { + splitEnabled, + legacyUrl: env.TASK_RUN_LEGACY_DATABASE_URL, + newUrl, + newReplicaUrl: env.TASK_RUN_DATABASE_READ_REPLICA_URL, + }, + { + controlPlane: { writer: prisma, replica: $replica }, + buildNewWriter: (url, clientType) => + captureInfraErrorsRunOps( + tagDatasourceRunOps("writer", buildRunOpsWriterClient({ url, clientType })) + ), + buildNewReplica: (url, clientType) => + captureInfraErrorsRunOps( + tagDatasourceRunOps("replica", buildRunOpsReplicaClient({ url, clientType })) + ), + } + ); +}); + +// Typed as RunOpsPrismaClient for the run-store boundary. +export const runOpsNewPrismaClient: RunOpsPrismaClient = runOpsTopology.newRunOps.writer; +export const runOpsNewReplicaClient: RunOpsPrismaClient = runOpsTopology.newRunOps.replica; +// Legacy-typed aliases kept for the remaining consumers that still expect PrismaClient / +// PrismaReplicaClient (idempotency residency, read-through, handlers, cascade cleanup). 
+export const runOpsNewPrisma: PrismaClient = runOpsTopology.newRunOps.writer as unknown as PrismaClient; +export const runOpsNewReplica: PrismaReplicaClient = runOpsTopology.newRunOps.replica as unknown as PrismaReplicaClient; +export const runOpsLegacyPrisma: PrismaClient = runOpsTopology.legacyRunOps.writer; +export const runOpsLegacyReplica: PrismaReplicaClient = runOpsTopology.legacyRunOps.replica; + +export const runOpsSplitReadEnabled: boolean = computeRunOpsSplitReadEnabled({ + newReplica: runOpsNewReplicaClient, + controlPlaneWriter: prisma, + controlPlaneReplica: $replica, + hasNewUrl: !!env.TASK_RUN_DATABASE_URL, + hasLegacyUrl: !!env.TASK_RUN_LEGACY_DATABASE_URL, +}); + +// Boot-time interlock: if the flag is on but the distinct-DB sentinel does not +// confirm two physically-distinct run-ops DBs, refuse to enable split (data-loss +// interlock). Async, so it cannot live in the synchronous singleton factory — +// call it from the eager-boot path before any run-ops routing is wired. +export async function assertRunOpsSplitSentinel(): Promise { + if (!env.RUN_OPS_SPLIT_ENABLED) return; + const ok = await isSplitEnabled(); + if (!ok) { + throw new Error( + "RUN_OPS_SPLIT_ENABLED is on but the distinct-DB sentinel did not confirm two physically-distinct run-ops DBs; refusing to enable split (data-loss interlock)." + ); + } +} + function getClient() { - const { DATABASE_URL } = process.env; - invariant(typeof DATABASE_URL === "string", "DATABASE_URL env var not set"); + // Control-plane datasource repoint: prefer the dedicated control-plane DSN, falling back to + // DATABASE_URL so self-host / single-DB installs boot byte-identical when CONTROL_PLANE_DATABASE_URL is unset. + const url = env.CONTROL_PLANE_DATABASE_URL ?? 
env.DATABASE_URL; + invariant(typeof url === "string", "neither CONTROL_PLANE_DATABASE_URL nor DATABASE_URL is set"); - const databaseUrl = extendQueryParams(DATABASE_URL, { + return buildWriterClient({ url, clientType: "writer" }); +} + +// Generalized writer builder shared by the control-plane client and the run-ops +// clients. Returns a RAW, untagged, un-wrapped PrismaClient — the +// caller applies tagDatasource + captureInfrastructureErrors. +export function buildWriterClient({ + url, + clientType, +}: { + url: string; + clientType: string; +}): PrismaClient { + const databaseUrl = extendQueryParams(url, { connection_limit: env.DATABASE_CONNECTION_LIMIT.toString(), pool_timeout: env.DATABASE_POOL_TIMEOUT.toString(), connection_timeout: env.DATABASE_CONNECTION_TIMEOUT.toString(), @@ -215,7 +379,7 @@ function getClient() { if (process.env.PRISMA_LOG_TO_STDOUT !== "1") { client.$on("info", (log) => { logger.info("PrismaClient info", { - clientType: "writer", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -226,7 +390,7 @@ function getClient() { client.$on("warn", (log) => { logger.warn("PrismaClient warn", { - clientType: "writer", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -237,7 +401,7 @@ function getClient() { client.$on("error", (log) => { logger.error("PrismaClient error", { - clientType: "writer", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -268,12 +432,29 @@ function getClient() { } function getReplicaClient() { - if (!env.DATABASE_READ_REPLICA_URL) { + // Control-plane replica repoint: prefer the dedicated control-plane replica, falling back to + // DATABASE_READ_REPLICA_URL. Early-return undefined only when BOTH are unset, so $replica keeps + // falling back to prisma exactly as today when no replica is configured. + const url = env.CONTROL_PLANE_DATABASE_READ_REPLICA_URL ?? 
env.DATABASE_READ_REPLICA_URL; + if (!url) { console.log(`🔌 No database replica, using the regular client`); return; } - const replicaUrl = extendQueryParams(env.DATABASE_READ_REPLICA_URL, { + return buildReplicaClient({ url, clientType: "reader" }); +} + +// Generalized replica builder shared by the control-plane replica and the run-ops +// replicas. Returns a RAW, untagged, un-wrapped PrismaClient — the +// caller applies tagDatasource + captureInfrastructureErrors. +export function buildReplicaClient({ + url, + clientType, +}: { + url: string; + clientType: string; +}): PrismaClient { + const replicaUrl = extendQueryParams(url, { connection_limit: env.DATABASE_CONNECTION_LIMIT.toString(), pool_timeout: env.DATABASE_POOL_TIMEOUT.toString(), connection_timeout: env.DATABASE_CONNECTION_TIMEOUT.toString(), @@ -345,7 +526,7 @@ function getReplicaClient() { if (process.env.PRISMA_LOG_TO_STDOUT !== "1") { replicaClient.$on("info", (log) => { logger.info("PrismaClient info", { - clientType: "reader", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -356,7 +537,7 @@ function getReplicaClient() { replicaClient.$on("warn", (log) => { logger.warn("PrismaClient warn", { - clientType: "reader", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -367,7 +548,7 @@ function getReplicaClient() { replicaClient.$on("error", (log) => { logger.error("PrismaClient error", { - clientType: "reader", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -396,6 +577,108 @@ function getReplicaClient() { return replicaClient; } +function buildRunOpsWriterClient({ + url, + clientType, +}: { + url: string; + clientType: string; +}): RunOpsPrismaClient { + const databaseUrl = extendQueryParams(url, { + connection_limit: env.DATABASE_CONNECTION_LIMIT.toString(), + pool_timeout: env.DATABASE_POOL_TIMEOUT.toString(), + connection_timeout: env.DATABASE_CONNECTION_TIMEOUT.toString(), + application_name: env.SERVICE_NAME, + }); + + 
console.log(`🔌 setting up run-ops prisma client to ${redactUrlSecrets(databaseUrl)}`); + + const client = new RunOpsPrismaClient({ + datasources: { db: { url: databaseUrl.href } }, + log: [ + { emit: "event", level: "error" }, + { emit: "event", level: "info" }, + { emit: "event", level: "warn" }, + ...((process.env.VERBOSE_PRISMA_LOGS === "1" || + process.env.VERY_SLOW_QUERY_THRESHOLD_MS !== undefined + ? [{ emit: "event", level: "query" }] + : []) as { emit: "event"; level: "query" }[]), + ], + }); + + if (process.env.PRISMA_LOG_TO_STDOUT !== "1") { + client.$on("info", (log) => logger.info("RunOpsPrismaClient info", { clientType, event: log })); + client.$on("warn", (log) => logger.warn("RunOpsPrismaClient warn", { clientType, event: log })); + client.$on("error", (log) => + logger.error("RunOpsPrismaClient error", { clientType, event: log, ignoreError: true }) + ); + } + + client.$on("query", (log) => queryPerformanceMonitor.onQuery("writer", log)); + + const connectPromise = client.$connect(); + if (env.NODE_ENV === "test") { + connectPromise.catch((error) => { + logger.warn("Failed to eagerly connect run-ops prisma client (writer)", { error }); + }); + } + + console.log(`🔌 run-ops prisma client connected`); + + return client; +} + +function buildRunOpsReplicaClient({ + url, + clientType, +}: { + url: string; + clientType: string; +}): RunOpsPrismaClient { + const replicaUrl = extendQueryParams(url, { + connection_limit: env.DATABASE_CONNECTION_LIMIT.toString(), + pool_timeout: env.DATABASE_POOL_TIMEOUT.toString(), + connection_timeout: env.DATABASE_CONNECTION_TIMEOUT.toString(), + application_name: env.SERVICE_NAME, + }); + + console.log(`🔌 setting up run-ops read replica connection to ${redactUrlSecrets(replicaUrl)}`); + + const client = new RunOpsPrismaClient({ + datasources: { db: { url: replicaUrl.href } }, + log: [ + { emit: "event", level: "error" }, + { emit: "event", level: "info" }, + { emit: "event", level: "warn" }, + 
...((process.env.VERBOSE_PRISMA_LOGS === "1" || + process.env.VERY_SLOW_QUERY_THRESHOLD_MS !== undefined + ? [{ emit: "event", level: "query" }] + : []) as { emit: "event"; level: "query" }[]), + ], + }); + + if (process.env.PRISMA_LOG_TO_STDOUT !== "1") { + client.$on("info", (log) => logger.info("RunOpsPrismaClient info", { clientType, event: log })); + client.$on("warn", (log) => logger.warn("RunOpsPrismaClient warn", { clientType, event: log })); + client.$on("error", (log) => + logger.error("RunOpsPrismaClient error", { clientType, event: log }) + ); + } + + client.$on("query", (log) => queryPerformanceMonitor.onQuery("replica", log)); + + const connectPromise = client.$connect(); + if (env.NODE_ENV === "test") { + connectPromise.catch((error) => { + logger.warn("Failed to eagerly connect run-ops prisma client (replica)", { error }); + }); + } + + console.log(`🔌 run-ops read replica connected`); + + return client; +} + function extendQueryParams(hrefOrUrl: string | URL, queryParams: Record) { const url = new URL(hrefOrUrl); const query = url.searchParams; diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index ab1941ef0bb..091f2f28ccf 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -15,7 +15,7 @@ import { bootstrap } from "./bootstrap"; import { LocaleContextProvider } from "./components/primitives/LocaleProvider"; import type { OperatingSystemPlatform } from "./components/primitives/OperatingSystemProvider"; import { OperatingSystemContextProvider } from "./components/primitives/OperatingSystemProvider"; -import { Prisma } from "./db.server"; +import { assertRunOpsSplitSentinel, Prisma } from "./db.server"; import { env } from "./env.server"; import { eventLoopMonitor } from "./eventLoopMonitor.server"; import { logger } from "./services/logger.server"; @@ -271,6 +271,17 @@ process.on("uncaughtException", (error, origin) => { process.exit(1); }); +// Boot-time run-ops split interlock. 
Async, so it runs as a +// fire-and-forget at startup; a flag-on-but-sentinel-fails misconfig crashes +// the process loudly before any run-ops routing is wired. +singleton("AssertRunOpsSplitSentinel", () => { + assertRunOpsSplitSentinel().catch((error) => { + logger.error("Run-ops split sentinel assertion failed; refusing to start", { error }); + process.exit(1); + }); + return true; +}); + singleton("RunEngineEventBusHandlers", registerRunEngineEventBusHandlers); singleton("SetupBatchQueueCallbacks", setupBatchQueueCallbacks); // Attach the realtime run-changed publish delegations to the engine event bus. diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 367e9a3362d..130ca2c1494 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -97,8 +97,8 @@ const EnvironmentSchema = z DATABASE_CONNECTION_LIMIT: z.coerce.number().int().default(10), DATABASE_POOL_TIMEOUT: z.coerce.number().int().default(60), DATABASE_CONNECTION_TIMEOUT: z.coerce.number().int().default(20), - // Dashboard-agent conversation store. Cloud points this at the dedicated - // PlanetScale database; when unset it falls back to DATABASE_URL (OSS), where + // Dashboard-agent conversation store. Cloud points this at a dedicated + // database; when unset it falls back to DATABASE_URL (OSS), where // the tables live in the isolated `trigger_dashboard_agent` schema. DASHBOARD_AGENT_DATABASE_URL: z.string().optional(), // The secret key (tr_*) for the runtime environment the dashboard-agent task @@ -128,6 +128,54 @@ const EnvironmentSchema = z "DIRECT_URL is invalid, for details please check the additional output above this message." ), DATABASE_READ_REPLICA_URL: z.string().optional(), + // --- Run-ops DB split — Cloud-only scaling concern; OFF by default. --- + // Explicit positive opt-in. Split behavior is unreachable unless this is true + // AND the distinct-DB sentinel confirms the two URLs are physically distinct DBs. 
+ RUN_OPS_SPLIT_ENABLED: BoolEnv.default(false), + // Canonical URL for the dedicated run-ops DB. Takes precedence over TASK_RUN_DATABASE_URL. + RUN_OPS_DATABASE_URL: z + .string() + .refine(isValidDatabaseUrl, "RUN_OPS_DATABASE_URL is invalid") + .optional(), + // The NEW dedicated run-ops DB writer. Optional so single-DB installs never set it. + TASK_RUN_DATABASE_URL: z + .string() + .refine(isValidDatabaseUrl, "TASK_RUN_DATABASE_URL is invalid") + .optional(), + // The NEW run-ops DB unpooled/direct endpoint (Prisma migrate/introspection; + // connection poolers break advisory locks). Consumed by the migrations. + TASK_RUN_DATABASE_DIRECT_URL: z + .string() + .refine(isValidDatabaseUrl, "TASK_RUN_DATABASE_DIRECT_URL is invalid") + .optional(), + // The LEGACY run-ops DB (the control-plane DB during the transition). When unset, legacy + // run-ops reuses the existing DATABASE_URL (legacy run-ops == control-plane DB initially). + TASK_RUN_LEGACY_DATABASE_URL: z + .string() + .refine(isValidDatabaseUrl, "TASK_RUN_LEGACY_DATABASE_URL is invalid") + .optional(), + // The NEW dedicated run-ops DB read replica. Optional; self-host never sets it. + // Refined (unlike the unrefined control-plane DATABASE_READ_REPLICA_URL) so a malformed run-ops + // replica URL fails boot loudly rather than silently degrading — do not align it down to the CP shape. + TASK_RUN_DATABASE_READ_REPLICA_URL: z + .string() + .refine(isValidDatabaseUrl, "TASK_RUN_DATABASE_READ_REPLICA_URL is invalid") + .optional(), + // --- Control-plane datasource repoint. Additive-only. --- + // Optional control-plane DB. Unset (self-host/single-DB) -> getClient()/getReplicaClient() fall back to + // DATABASE_URL/DATABASE_READ_REPLICA_URL, so boot is byte-identical. When set, these point at the + // dedicated control-plane DSN; moving off the shared DB is an ops config change, not a code edit. 
+ CONTROL_PLANE_DATABASE_URL: z + .string() + .refine( + (v) => v === undefined || isValidDatabaseUrl(v), + "CONTROL_PLANE_DATABASE_URL is invalid" + ) + .optional(), + CONTROL_PLANE_DATABASE_READ_REPLICA_URL: z.string().optional(), + // Control-plane cache relax knobs. Unset -> defaults (DEFAULT_CP_CACHE_TTL_MS / _MAX_ENTRIES). + CONTROL_PLANE_CACHE_TTL_MS: z.coerce.number().int().optional(), + CONTROL_PLANE_CACHE_MAX_ENTRIES: z.coerce.number().int().optional(), SESSION_SECRET: z.string(), MAGIC_LINK_SECRET: z.string(), ENCRYPTION_KEY: z @@ -1673,6 +1721,29 @@ const EnvironmentSchema = z RUN_REPLICATION_DISABLE_PAYLOAD_INSERT: z.string().default("0"), RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING: z.string().default("0"), + // --- Run-ops DB split — second replication source (the NEW dedicated run-ops DB). --- + // Cloud-only; only consulted when isSplitEnabled() is true. Self-host never sets these. + // The NEW source's connection URL is TASK_RUN_DATABASE_URL; these add + // the NEW source's replication slot/publication and an explicit per-source enable so it can be + // brought up independently of the legacy source during the transition. + RUN_REPLICATION_NEW_SLOT_NAME: z.string().default("task_runs_to_clickhouse_v2"), + RUN_REPLICATION_NEW_PUBLICATION_NAME: z + .string() + .default("task_runs_to_clickhouse_v2_publication"), + RUN_REPLICATION_NEW_ENABLED: z.string().default("0"), + // Origin generations packed into _version via composeTaskRunVersion. + // Legacy DB = 0, new dedicated run-ops DB = 1. Exposed as env so the mapping is auditable + // per-deploy, but DEFAULTS encode the canonical legacy=0 / new=1 contract. + RUN_REPLICATION_LEGACY_ORIGIN_GENERATION: z.coerce.number().int().default(0), + RUN_REPLICATION_NEW_ORIGIN_GENERATION: z.coerce.number().int().default(1), + + // Run-ops KSUID mint cutover — per-env, canary-first, OFF by default. 
+ // Even when on, an env mints KSUID only if its per-org runOpsMintKsuid flag is + // "ksuid" AND isSplitEnabled() is true. Cache mirrors REALTIME_BACKEND_FLAG_CACHE_*. + RUN_OPS_MINT_KSUID_ENABLED: BoolEnv.default(false), + RUN_OPS_MINT_FLAG_CACHE_TTL_MS: z.coerce.number().int().default(30_000), + RUN_OPS_MINT_FLAG_CACHE_MAX_ENTRIES: z.coerce.number().int().default(10_000), + // Session replication (Postgres → ClickHouse sessions_v1). Shares Redis // with the runs replicator for leader locking but has its own slot and // publication so the two consume independently. diff --git a/apps/webapp/app/models/runtimeEnvironment.server.ts b/apps/webapp/app/models/runtimeEnvironment.server.ts index efcfdc524fa..5e6974cb0f1 100644 --- a/apps/webapp/app/models/runtimeEnvironment.server.ts +++ b/apps/webapp/app/models/runtimeEnvironment.server.ts @@ -2,6 +2,7 @@ import type { AuthenticatedEnvironment } from "@internal/run-engine"; import type { Prisma, PrismaClientOrTransaction, RuntimeEnvironment } from "@trigger.dev/database"; import { $replica, prisma } from "~/db.server"; import { runStore } from "~/v3/runStore.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { logger } from "~/services/logger.server"; import { getUsername } from "~/utils/username"; import { isDefaultDevBranch, sanitizeBranchName } from "@trigger.dev/core/v3/utils/gitBranch"; @@ -271,24 +272,32 @@ export async function findEnvironmentFromRun( runId: string, tx?: PrismaClientOrTransaction ): Promise { - // The include (no select) already pulls every taskRun scalar, so runTags/batchId - // ride along for free — no extra query for the realtime publish to send a full record. + // Run-ops scalars (runTags/batchId/runtimeEnvironmentId) from the run store; the env half is + // resolved via the control-plane resolver so the run-ops DB can split without a cross-DB join. 
const taskRun = await runStore.findRun( { id: runId, }, { - include: { - runtimeEnvironment: { include: authIncludeBase }, + select: { + runTags: true, + batchId: true, + runtimeEnvironmentId: true, }, }, tx ?? $replica ); - if (!taskRun?.runtimeEnvironment) { + if (!taskRun) { + return null; + } + const environment = await controlPlaneResolver.resolveAuthenticatedEnv( + taskRun.runtimeEnvironmentId + ); + if (!environment) { return null; } return { - environment: toAuthenticated(taskRun.runtimeEnvironment), + environment, runTags: taskRun.runTags, batchId: taskRun.batchId, }; diff --git a/apps/webapp/app/v3/engineVersion.server.ts b/apps/webapp/app/v3/engineVersion.server.ts index 0d0c6ecfdbf..32eca6fb882 100644 --- a/apps/webapp/app/v3/engineVersion.server.ts +++ b/apps/webapp/app/v3/engineVersion.server.ts @@ -5,6 +5,11 @@ import { getCurrentWorkerDeploymentEngineVersion, } from "./models/workerDeployment.server"; +// Co-locate the per-env run-ops residency/mint decision next to the +// engine-version decision. determineEngineVersion is intentionally left untouched so its +// read-only callers (presenters, admin routes, pauseQueue) never pay the mint flag read. 
+export { resolveRunIdMintKind, type RunIdMintKind } from "./runOpsMigration/runOpsMintKind.server"; + type Environment = { id: string; type: RuntimeEnvironmentType; diff --git a/apps/webapp/app/v3/eventRepository/index.server.ts b/apps/webapp/app/v3/eventRepository/index.server.ts index 614424a1993..d89d241bf22 100644 --- a/apps/webapp/app/v3/eventRepository/index.server.ts +++ b/apps/webapp/app/v3/eventRepository/index.server.ts @@ -3,6 +3,7 @@ import { eventRepository } from "./eventRepository.server"; import { type IEventRepository, type TraceEventOptions } from "./eventRepository.types"; import { prisma } from "~/db.server"; import { runStore } from "../runStore.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { logger } from "~/services/logger.server"; import { FEATURE_FLAG } from "../featureFlags"; import { flag } from "../featureFlags.server"; @@ -261,7 +262,7 @@ async function recordRunEvent( } async function findRunForEventCreation(runId: string) { - return runStore.findRun( + const foundRun = await runStore.findRun( { id: runId, }, @@ -271,21 +272,23 @@ async function findRunForEventCreation(runId: string) { taskIdentifier: true, traceContext: true, taskEventStore: true, - runtimeEnvironment: { - select: { - id: true, - type: true, - organizationId: true, - projectId: true, - project: { - select: { - externalRef: true, - }, - }, - }, - }, + runtimeEnvironmentId: true, }, }, prisma ); + + if (!foundRun) { + return null; + } + + const environment = await controlPlaneResolver.resolveAuthenticatedEnv( + foundRun.runtimeEnvironmentId + ); + + if (!environment) { + return null; + } + + return { ...foundRun, runtimeEnvironment: environment }; } diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts index 46434bebf30..4617179eda1 100644 --- a/apps/webapp/app/v3/featureFlags.ts +++ b/apps/webapp/app/v3/featureFlags.ts @@ -17,6 +17,8 @@ export const FEATURE_FLAG = { 
computeMigrationFreePercentage: "computeMigrationFreePercentage", computeMigrationPaidPercentage: "computeMigrationPaidPercentage", computeMigrationRequireTemplate: "computeMigrationRequireTemplate", + devBranchesEnabled: "devBranchesEnabled", + runOpsMintKsuid: "runOpsMintKsuid", } as const; export const FeatureFlagCatalog = { @@ -47,6 +49,11 @@ export const FeatureFlagCatalog = { // When on, migrated orgs build their compute template in required mode at deploy // (fails the deploy on error) instead of shadow. Strict boolean (see above). [FEATURE_FLAG.computeMigrationRequireTemplate]: z.boolean(), + // Per-org access to development branches. Off unless enabled for the org. + [FEATURE_FLAG.devBranchesEnabled]: z.coerce.boolean(), + // Per-org KSUID mint cutover. Defaults to "cuid"; only honored when + // RUN_OPS_MINT_KSUID_ENABLED is on AND isSplitEnabled() is true. + [FEATURE_FLAG.runOpsMintKsuid]: z.enum(["cuid", "ksuid"]), }; export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts index 3f9cd603b07..4d9e263d6be 100644 --- a/apps/webapp/app/v3/runEngine.server.ts +++ b/apps/webapp/app/v3/runEngine.server.ts @@ -8,6 +8,9 @@ import { defaultMachine, getCurrentPlan } from "~/services/platform.v3.server"; import { singleton } from "~/utils/singleton"; import { allMachines } from "./machinePresets.server"; import { runEnginePendingVersionLookup } from "./runEnginePendingVersionLookup.server"; +import { pickRunOpsStoreForCompletion } from "./runOpsMigration/crossSeamGuard.server"; +import { runEngineControlPlaneResolver } from "./runOpsMigration/runEngineControlPlaneResolver.server"; +import { runStore } from "./runStore.server"; import { meter, tracer } from "./tracer.server"; export const engine = singleton("RunEngine", createRunEngine); @@ -18,6 +21,12 @@ function createRunEngine() { const engine = new RunEngine({ prisma, readOnlyPrisma: $replica, + crossSeamGuard: 
pickRunOpsStoreForCompletion, + // Inject the shared run-store singleton so the engine and the webapp presenters/ + // services route through ONE store. When split is off this is the same passthrough + // PostgresRunStore the engine would have defaulted to, so behavior is unchanged. + store: runStore, + controlPlaneResolver: runEngineControlPlaneResolver, logLevel: env.RUN_ENGINE_WORKER_LOG_LEVEL, treatProductionExecutionStallsAsOOM: env.RUN_ENGINE_TREAT_PRODUCTION_EXECUTION_STALLS_AS_OOM === "1", diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index 6e99898cdfe..021b04d5822 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -4,7 +4,14 @@ import { tryCatch } from "@trigger.dev/core/utils"; import { createJsonErrorObject, sanitizeError, TaskRunErrorCodes } from "@trigger.dev/core/v3"; import { RunId } from "@trigger.dev/core/v3/isomorphic"; import type { BatchTaskRunStatus, Prisma } from "@trigger.dev/database"; -import { $replica, prisma } from "~/db.server"; +import { + $replica, + prisma, + runOpsLegacyPrisma, + runOpsNewPrisma, + runOpsNewReplica, + runOpsLegacyReplica, +} from "~/db.server"; import { env } from "~/env.server"; import { findEnvironmentById, findEnvironmentFromRun } from "~/models/runtimeEnvironment.server"; import { TriggerFailedTaskService } from "~/runEngine/services/triggerFailedTask.server"; @@ -22,39 +29,52 @@ import { getEventRepositoryForStore, recordRunDebugLog } from "./eventRepository import { roomFromFriendlyRunId, socketIo } from "./handleSocketIo.server"; import { engine } from "./runEngine.server"; import { runStore } from "./runStore.server"; +import { isSplitEnabled } from "~/v3/runOpsMigration/splitMode.server"; import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server"; +import { + handleBatchCompletion, + QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE, + readRunForEvent, + 
readRunForEventOrThrow, + type EventReadDeps, +} from "./runEngineHandlersShared.server"; export function registerRunEngineEventBusHandlers() { + // Resolve the split-mode gate ONCE at registration scope (never per-event). + const splitEnabledPromise = isSplitEnabled(); + const eventReadDeps = async (): Promise => ({ + store: runStore, + newReplica: runOpsNewReplica, + legacyReplica: runOpsLegacyReplica, + splitEnabled: await splitEnabledPromise, + }); engine.eventBus.on("runSucceeded", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( - { - id: run.id, - }, + readRunForEventOrThrow( + run.id, + environment.id, { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // Piggyback the realtime run-changed publish on this existing read so the - // per-env channel carries the membership keys (no separate query). No-op when - // the native backend is disabled. - runTags: true, - batchId: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read so the + // per-env channel carries the membership keys (no separate query). No-op when + // the native backend is disabled. 
+ runTags: true, + batchId: true, }, - $replica + await eventReadDeps() ) ); @@ -113,33 +133,30 @@ export function registerRunEngineEventBusHandlers() { const exception = createExceptionPropertiesFromError(sanitizedError); const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( + readRunForEventOrThrow( + run.id, + environment.id, { - id: run.id, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, - { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // Piggyback the realtime run-changed publish on this existing read (no-op when - // the native backend is disabled). - runTags: true, - batchId: true, - }, - }, - $replica + await eventReadDeps() ) ); @@ -185,33 +202,33 @@ export function registerRunEngineEventBusHandlers() { const exception = createExceptionPropertiesFromError(sanitizedError); const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( - { - id: run.id, - }, + readRunForEventOrThrow( + run.id, + // runAttemptFailed carries no environment param; the env is derived from + // the read row afterwards. environmentId is informational for read-through + // (residency is keyed on runId), so an empty value is safe here. 
+ "", { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // Piggyback the realtime run-changed publish on this existing read (no-op when - // the native backend is disabled). - runTags: true, - batchId: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, - $replica + await eventReadDeps() ) ); @@ -273,29 +290,28 @@ export function registerRunEngineEventBusHandlers() { return; } + const deps = await eventReadDeps(); + const [cachedRunError, cachedRun] = await tryCatch( - runStore.findRunOrThrow( + readRunForEventOrThrow( + cachedRunId ?? 
"", + "", { - id: cachedRunId, - }, - { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, }, - $replica + deps ) ); @@ -308,29 +324,26 @@ export function registerRunEngineEventBusHandlers() { } const [blockedRunError, blockedRun] = await tryCatch( - runStore.findRun( - { - id: blockedRunId, - }, + readRunForEvent( + blockedRunId, + "", { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, }, - $replica + deps ) ); @@ -387,33 +400,30 @@ export function registerRunEngineEventBusHandlers() { } const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( - { - id: run.id, - }, + readRunForEventOrThrow( + run.id, + environment.id, { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // 
Piggyback the realtime run-changed publish on this existing read (no-op when - // the native backend is disabled). - runTags: true, - batchId: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, - $replica + await eventReadDeps() ) ); @@ -456,33 +466,30 @@ export function registerRunEngineEventBusHandlers() { engine.eventBus.on("runCancelled", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( + readRunForEventOrThrow( + run.id, + environment.id, { - id: run.id, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, - { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // Piggyback the realtime run-changed publish on this existing read (no-op when - // the native backend is disabled). 
- runTags: true, - batchId: true, - }, - }, - $replica + await eventReadDeps() ) ); @@ -771,15 +778,6 @@ export function registerRunEngineEventBusHandlers() { }); } -/** - * errorCode returned by the batch process-item callback when the trigger was - * rejected because the environment's queue is at its maximum size. The - * BatchQueue (via `skipRetries`) short-circuits retries for this code, and the - * batch completion callback collapses per-item errors into a single aggregate - * `BatchTaskRunError` row instead of writing one per item. - */ -const QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE = "QUEUE_SIZE_LIMIT_EXCEEDED"; - /** * Set up the BatchQueue processing callbacks. * These handle creating runs from batch items and completing batches. @@ -790,6 +788,9 @@ const QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE = "QUEUE_SIZE_LIMIT_EXCEEDED"; * - The run engine will download from R2 when the task executes */ export function setupBatchQueueCallbacks() { + // Resolve the split-mode gate ONCE at registration scope (never per-callback). + const splitEnabledPromise = isSplitEnabled(); + // Item processing callback - creates a run for each batch item engine.setBatchProcessItemCallback( async ({ batchId, friendlyId, itemIndex, item, meta, attempt, isFinalAttempt }) => { @@ -1035,104 +1036,17 @@ export function setupBatchQueueCallbacks() { } ); - // Batch completion callback - updates Postgres with results + // Batch completion callback - updates Postgres with results. The source callback + // is a thin wrapper that resolves the split-mode gate and supplies the run-ops + // handles; the body lives in handleBatchCompletion for testability. 
engine.setBatchCompletionCallback(async (result: CompleteBatchResult) => { - const { batchId, runIds, successfulRunCount, failedRunCount, failures } = result; - - // Determine final status - let status: BatchTaskRunStatus; - if (failedRunCount > 0 && successfulRunCount === 0) { - status = "ABORTED"; - } else if (failedRunCount > 0) { - status = "PARTIAL_FAILED"; - } else { - status = "PENDING"; // All runs created, waiting for completion - } - - try { - // Use a transaction to ensure atomicity of batch update and error record creation - // skipDuplicates handles idempotency when callback is retried (relies on unique constraint) - await prisma.$transaction(async (tx) => { - // Update BatchTaskRun - await tx.batchTaskRun.update({ - where: { id: batchId }, - data: { - status, - runIds, - successfulRunCount, - failedRunCount, - completedAt: status === "ABORTED" ? new Date() : undefined, - processingCompletedAt: new Date(), - }, - }); - - // Create error records if there were failures. - // - // Fast-path for queue-size-limit overload: when every failure is the - // same QUEUE_SIZE_LIMIT_EXCEEDED error, collapse them into a single - // aggregate row instead of writing one per item. This keeps the DB - // write volume bounded to O(batches) instead of O(items) when a noisy - // tenant fills their queue and all of their batches start bouncing. - if (failures.length > 0) { - const allQueueSizeLimit = failures.every( - (f) => f.errorCode === QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE - ); - - if (allQueueSizeLimit) { - const sample = failures[0]!; - await tx.batchTaskRunError.createMany({ - data: [ - { - batchTaskRunId: batchId, - // Use the first item's index as a stable anchor for the - // (batchTaskRunId, index) unique constraint so callback - // retries remain idempotent. 
- index: sample.index, - taskIdentifier: sample.taskIdentifier, - payload: sample.payload, - options: sample.options as Prisma.InputJsonValue | undefined, - error: `${sample.error} (${failures.length} items in this batch failed with the same error)`, - errorCode: sample.errorCode, - }, - ], - skipDuplicates: true, - }); - } else { - await tx.batchTaskRunError.createMany({ - data: failures.map((failure) => ({ - batchTaskRunId: batchId, - index: failure.index, - taskIdentifier: failure.taskIdentifier, - payload: failure.payload, - options: failure.options as Prisma.InputJsonValue | undefined, - error: failure.error, - errorCode: failure.errorCode, - })), - skipDuplicates: true, - }); - } - } - }); - - // Try to complete the batch (handles waitpoint completion if all runs are done) - if (status !== "ABORTED") { - await engine.tryCompleteBatch({ batchId }); - } - - logger.info("Batch completion handled", { - batchId, - status, - successfulRunCount, - failedRunCount, - }); - } catch (error) { - logger.error("Failed to handle batch completion", { - batchId, - error: error instanceof Error ? 
error.message : String(error), - }); - // Re-throw to preserve Redis data for retry (BatchQueue expects errors to propagate) - throw error; - } + await handleBatchCompletion(result, { + splitEnabled: await splitEnabledPromise, + newReplica: runOpsNewReplica, + newWriter: runOpsNewPrisma, + legacyWriter: runOpsLegacyPrisma, + tryCompleteBatch: (batchId) => engine.tryCompleteBatch({ batchId }), + }); }); logger.info("BatchQueue callbacks configured"); diff --git a/apps/webapp/app/v3/runEngineHandlersShared.server.ts b/apps/webapp/app/v3/runEngineHandlersShared.server.ts new file mode 100644 index 00000000000..4f648fefc19 --- /dev/null +++ b/apps/webapp/app/v3/runEngineHandlersShared.server.ts @@ -0,0 +1,225 @@ +/** + * Pure, store-routing helpers extracted from runEngineHandlers.server.ts so they + * are testable without constructing the engine (importing that module pulls in the + * whole webapp service graph). The handlers wire the production defaults; tests + * inject per-container stores/replicas, so these helpers never import db.server. + */ +import type { CompleteBatchResult } from "@internal/run-engine"; +import type { RunStore } from "@internal/run-store"; +import { BatchTaskRunStatus, Prisma } from "@trigger.dev/database"; +import type { PrismaClient, PrismaReplicaClient } from "~/db.server"; +import { logger } from "~/services/logger.server"; +import { readThroughRun } from "~/v3/runOpsMigration/readThrough.server"; + +export type EventReadDeps = { + store: RunStore; + newReplica: PrismaReplicaClient; + legacyReplica: PrismaReplicaClient; + splitEnabled: boolean; + // Pure boundaries forwarded to read-through; production leaves them undefined + // so the read-through layer uses its own wired defaults. Tests inject fakes. + isKnownMigrated?: (runId: string) => Promise; + isPastRetention?: (runId: string) => boolean; +}; + +/** + * Resolve a TaskRun for an event-bus enrichment read through the run-ops + * read-through layer. 
The store stays the read mechanism (the + * closures call `store.findRun(...)`); read-through only chooses which replica. + * Returns null when not-found / past-retention. Passthrough in single-DB. + */ +export async function readRunForEvent( + runId: string, + environmentId: string, + select: S, + deps: EventReadDeps +): Promise | null> { + const result = await readThroughRun>({ + runId, + environmentId, + readNew: (client) => deps.store.findRun({ id: runId }, { select }, client), + readLegacy: (replica) => deps.store.findRun({ id: runId }, { select }, replica), + deps: { + newClient: deps.newReplica, + legacyReplica: deps.legacyReplica, + splitEnabled: deps.splitEnabled, + isKnownMigrated: deps.isKnownMigrated, + isPastRetention: deps.isPastRetention, + }, + }); + + return result.source === "not-found" || result.source === "past-retention" ? null : result.value; +} + +/** + * Reproduces the `findRunOrThrow` not-found-as-error semantics the 6 throwing + * read sites rely on (a missing run throws, which their `tryCatch` turns into + * the existing error-log + early-return — never a silent no-op). + */ +export async function readRunForEventOrThrow( + runId: string, + environmentId: string, + select: S, + deps: EventReadDeps +): Promise> { + const run = await readRunForEvent(runId, environmentId, select, deps); + if (!run) { + throw new Error("Task run not found"); + } + return run; +} + +/** + * Resolve which run-ops writer physically owns the `BatchTaskRun` row for + * `batchId` by probing where the row lives, so the batch-completion txn commits + * on a single run-ops DB. Length classification is INVALID here: a batch id may + * be a ksuid (cut-over orgs) or a cuid (and cuid-shaped ids can be backfilled + * onto NEW), so id-shape does not reliably indicate the row's actual residency. + * The existence probe is the correct signal. 
+ */ +export async function resolveBatchRunOpsWriter( + batchId: string, + deps: { + newReplica: PrismaReplicaClient; + newWriter: PrismaClient; + legacyWriter: PrismaClient; + } +): Promise { + const onNew = await deps.newReplica.batchTaskRun.findFirst({ + where: { id: batchId }, + select: { id: true }, + }); + return onNew ? deps.newWriter : deps.legacyWriter; +} + +/** + * errorCode returned by the batch process-item callback when the trigger was + * rejected because the environment's queue is at its maximum size. The + * BatchQueue (via `skipRetries`) short-circuits retries for this code, and the + * batch completion callback collapses per-item errors into a single aggregate + * `BatchTaskRunError` row instead of writing one per item. + */ +export const QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE = "QUEUE_SIZE_LIMIT_EXCEEDED"; + +export type BatchCompletionDeps = { + splitEnabled: boolean; + newReplica: PrismaReplicaClient; + newWriter: PrismaClient; + legacyWriter: PrismaClient; + tryCompleteBatch: (batchId: string) => Promise; +}; + +/** + * Routes the batch-completion transaction (BatchTaskRun update + BatchTaskRunError + * createMany — both run-ops tables) onto the run-ops writer that physically owns + * the BatchTaskRun row for `batchId`, so the whole txn commits on a single DB. The + * transaction body is unchanged from before the split; only the client changes. + */ +export async function handleBatchCompletion( + result: CompleteBatchResult, + deps: BatchCompletionDeps +) { + const { batchId, runIds, successfulRunCount, failedRunCount, failures } = result; + + // Determine final status + let status: BatchTaskRunStatus; + if (failedRunCount > 0 && successfulRunCount === 0) { + status = "ABORTED"; + } else if (failedRunCount > 0) { + status = "PARTIAL_FAILED"; + } else { + status = "PENDING"; // All runs created, waiting for completion + } + + // Always probe residency — never special-case on splitEnabled (see commit msg). 
+ const runOpsWriter = await resolveBatchRunOpsWriter(batchId, { + newReplica: deps.newReplica, + newWriter: deps.newWriter, + legacyWriter: deps.legacyWriter, + }); + + try { + // Use a transaction to ensure atomicity of batch update and error record creation + // skipDuplicates handles idempotency when callback is retried (relies on unique constraint) + await runOpsWriter.$transaction(async (tx) => { + // Update BatchTaskRun + await tx.batchTaskRun.update({ + where: { id: batchId }, + data: { + status, + runIds, + successfulRunCount, + failedRunCount, + completedAt: status === "ABORTED" ? new Date() : undefined, + processingCompletedAt: new Date(), + }, + }); + + // Create error records if there were failures. + // + // Fast-path for queue-size-limit overload: when every failure is the + // same QUEUE_SIZE_LIMIT_EXCEEDED error, collapse them into a single + // aggregate row instead of writing one per item. This keeps the DB + // write volume bounded to O(batches) instead of O(items) when a noisy + // tenant fills their queue and all of their batches start bouncing. + if (failures.length > 0) { + const allQueueSizeLimit = failures.every( + (f) => f.errorCode === QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE + ); + + if (allQueueSizeLimit) { + const sample = failures[0]!; + await tx.batchTaskRunError.createMany({ + data: [ + { + batchTaskRunId: batchId, + // Use the first item's index as a stable anchor for the + // (batchTaskRunId, index) unique constraint so callback + // retries remain idempotent. 
+ index: sample.index, + taskIdentifier: sample.taskIdentifier, + payload: sample.payload, + options: sample.options as Prisma.InputJsonValue | undefined, + error: `${sample.error} (${failures.length} items in this batch failed with the same error)`, + errorCode: sample.errorCode, + }, + ], + skipDuplicates: true, + }); + } else { + await tx.batchTaskRunError.createMany({ + data: failures.map((failure) => ({ + batchTaskRunId: batchId, + index: failure.index, + taskIdentifier: failure.taskIdentifier, + payload: failure.payload, + options: failure.options as Prisma.InputJsonValue | undefined, + error: failure.error, + errorCode: failure.errorCode, + })), + skipDuplicates: true, + }); + } + } + }); + + // Try to complete the batch (handles waitpoint completion if all runs are done) + if (status !== "ABORTED") { + await deps.tryCompleteBatch(batchId); + } + + logger.info("Batch completion handled", { + batchId, + status, + successfulRunCount, + failedRunCount, + }); + } catch (error) { + logger.error("Failed to handle batch completion", { + batchId, + error: error instanceof Error ? error.message : String(error), + }); + // Re-throw to preserve Redis data for retry (BatchQueue expects errors to propagate) + throw error; + } +} diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts new file mode 100644 index 00000000000..cc1731fd547 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts @@ -0,0 +1,180 @@ +import type { + BackgroundWorker, + BackgroundWorkerTask, + Prisma, + RuntimeEnvironmentType, + TaskQueue, + WorkerDeployment, +} from "@trigger.dev/database"; +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; +import type { AuthenticatedEnvironment } from "@trigger.dev/core/v3/auth/environment"; + +/** + * Cache policy + invalidation for the cross-DB control-plane resolver. 
+ * + * One-way dependency: this module is imported by `controlPlaneResolver.server.ts`; + * it must NEVER import the resolver. The shared `Resolved*` return types live here + * so both files reference an identical definition (the resolver re-exports them for + * consumers). + * + * Invalidation note: the underlying `BoundedTtlCache` exposes no public `delete`, so + * explicit invalidation is implemented with a per-key epoch map. A write stamps the + * stored value with the key's current epoch; a read returns the value only if its + * stamped epoch still matches the current epoch, otherwise it is treated as a miss. + * `invalidate*` bumps the key's epoch, forcing the next read to miss. (If a future + * rebase gives `BoundedTtlCache` a public `delete`, prefer it and drop the epoch map.) + */ + +export const DEFAULT_CP_CACHE_TTL_MS = 30_000; +export const DEFAULT_CP_CACHE_MAX_ENTRIES = 10_000; + +export type ResolvedEnv = { + id: string; + type: RuntimeEnvironmentType; + projectId: string; + organizationId: string; + archivedAt: Date | null; + // The parent env's type, or null when this env has no parent. Alerts compute + // `parentEnvironmentType ?? type` (byte-identical to `parentEnvironment?.type ?? type`). + parentEnvironmentType: RuntimeEnvironmentType | null; + // Concurrency + nested ids the run-engine ControlPlaneResolver adapter maps to + // `ResolvedEngineEnv` (a MinimalAuthenticatedEnvironment superset). Existing app consumers + // ignore these additive fields. + maximumConcurrencyLimit: number; + concurrencyLimitBurstFactor: Prisma.Decimal; +}; + +/** Mirrors `WorkerDeploymentWithWorkerTasks` in `dequeueSystem.ts` exactly. */ +export type ResolvedWorkerVersion = { + worker: BackgroundWorker; + tasks: BackgroundWorkerTask[]; + queues: TaskQueue[]; + deployment: WorkerDeployment | null; +}; + +// The canonical authenticated-environment shape (slug/type/project/organization/orgMember/…). 
+// Re-aliased from the engine type so the cache slot cannot drift from `toAuthenticated()`'s output. +export type ResolvedAuthenticatedEnv = AuthenticatedEnvironment; + +/** + * The slim `lockedBy` (BackgroundWorkerTask) + `lockedToVersion` (BackgroundWorker, with its + * WorkerDeployment) shape — the UNION of every field webapp run sites read off these two + * cross-DB worker relations. Each field is optional because a run may be locked to a version + * but not a task (or neither); resolvers return only what exists. + */ +export type ResolvedRunLockedWorker = { + lockedBy: { + id: string; + filePath: string; + exportName: string | null; + slug: string; + machineConfig: Prisma.JsonValue | null; + worker: { + id: string; + version: string; + sdkVersion: string; + cliVersion: string; + supportsLazyAttempts: boolean; + deployment: { + friendlyId: string; + shortCode: string; + version: string; + runtime: string | null; + runtimeVersion: string | null; + git: Prisma.JsonValue | null; + } | null; + }; + } | null; + lockedToVersion: { + version: string; + sdkVersion: string; + runtime: string | null; + runtimeVersion: string | null; + supportsLazyAttempts: boolean; + } | null; +}; + +type Stamped = { value: V; epoch: number }; + +export class ControlPlaneCache { + readonly #env: BoundedTtlCache>; + readonly #version: BoundedTtlCache>; + readonly #envExists: BoundedTtlCache>; + readonly #authEnv: BoundedTtlCache>; + readonly #lockedWorker: BoundedTtlCache>; + + // Explicit invalidation: bumping a key's epoch forces the next read to miss. + readonly #epochs = new Map(); + + constructor(opts?: { ttlMs?: number; maxEntries?: number }) { + const ttl = opts?.ttlMs ?? DEFAULT_CP_CACHE_TTL_MS; + const max = opts?.maxEntries ?? 
DEFAULT_CP_CACHE_MAX_ENTRIES; + this.#env = new BoundedTtlCache(ttl, max); + this.#version = new BoundedTtlCache(ttl, max); + this.#envExists = new BoundedTtlCache(ttl, max); + this.#authEnv = new BoundedTtlCache(ttl, max); + this.#lockedWorker = new BoundedTtlCache(ttl, max); + } + + #epoch(key: string): number { + return this.#epochs.get(key) ?? 0; + } + + #read(cache: BoundedTtlCache>, key: string): V | undefined { + const entry = cache.get(key); + if (entry === undefined || entry.epoch !== this.#epoch(key)) { + return undefined; + } + return entry.value; + } + + #write(cache: BoundedTtlCache>, key: string, value: V): void { + cache.set(key, { value, epoch: this.#epoch(key) }); + } + + #bump(key: string): void { + this.#epochs.set(key, this.#epoch(key) + 1); + } + + getEnv(id: string): (ResolvedEnv | null) | undefined { + return this.#read(this.#env, `env:${id}`); + } + setEnv(id: string, value: ResolvedEnv | null): void { + this.#write(this.#env, `env:${id}`, value); + } + invalidateEnv(id: string): void { + this.#bump(`env:${id}`); + } + + // worker version: key = `${environmentId}:${backgroundWorkerId ?? 
"current"}` + getWorkerVersion(key: string): (ResolvedWorkerVersion | null) | undefined { + return this.#read(this.#version, `version:${key}`); + } + setWorkerVersion(key: string, value: ResolvedWorkerVersion | null): void { + this.#write(this.#version, `version:${key}`, value); + } + + // env existence (boolean; for the dropped-FK replacement check) + getEnvExists(id: string): boolean | undefined { + return this.#read(this.#envExists, `envExists:${id}`); + } + setEnvExists(id: string, exists: boolean): void { + this.#write(this.#envExists, `envExists:${id}`, exists); + } + + // full authenticated environment (toAuthenticated shape) + getAuthEnv(id: string): (ResolvedAuthenticatedEnv | null) | undefined { + return this.#read(this.#authEnv, `authEnv:${id}`); + } + setAuthEnv(id: string, value: ResolvedAuthenticatedEnv | null): void { + this.#write(this.#authEnv, `authEnv:${id}`, value); + } + + // run-locked worker (lockedBy + lockedToVersion); key = `${lockedById ?? "_"}:${lockedToVersionId ?? 
"_"}` + getLockedWorker(key: string): (ResolvedRunLockedWorker | null) | undefined { + return this.#read(this.#lockedWorker, `lockedWorker:${key}`); + } + setLockedWorker(key: string, value: ResolvedRunLockedWorker | null): void { + this.#write(this.#lockedWorker, `lockedWorker:${key}`, value); + } +} diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts new file mode 100644 index 00000000000..57595e6214b --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts @@ -0,0 +1,446 @@ +import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/isomorphic"; +import { RuntimeEnvironmentType } from "@trigger.dev/database"; +import type { PrismaClient, PrismaReplicaClient } from "@trigger.dev/database"; +import { prisma, $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { + ControlPlaneCache, + DEFAULT_CP_CACHE_MAX_ENTRIES, + DEFAULT_CP_CACHE_TTL_MS, + type ResolvedEnv, + type ResolvedWorkerVersion, +} from "./controlPlaneCache.server"; +import { authIncludeWithParent, toAuthenticated } from "~/models/runtimeEnvironment.server"; +import type { AuthenticatedEnvironment } from "@trigger.dev/core/v3/auth/environment"; +import type { ResolvedRunLockedWorker } from "./controlPlaneCache.server"; + +/** + * App-level control-plane resolution + cache layer. Replaces the run-ops -> control-plane + * Prisma joins (env/project/org, the pinned/current worker version + its tasks/queues, the + * TaskQueue, the TaskSchedule friendlyId mapping) with cached lookups against the + * control-plane client, so the split (cross-DB) hot path avoids a cross-WAN round-trip per + * resolution. + * + * Split ON (cloud): cache-first reads against the control-plane replica; `null` is cached as + * a confirmed absence. 
Split OFF (self-host/local/CI): plain Prisma join against the single + * control-plane client on every call, NO cache — byte-identical to today's inline join. + * + * The split gate is a SYNCHRONOUS `splitEnabled: () => boolean` injected at construction; the + * resolver never awaits the async `isSplitEnabled()` (that gate is reserved for the boot + * sentinel). Tests inject testcontainer clients + a sync predicate; only the module-level + * singleton at the bottom reads from `db.server.ts` / `env.server.ts`. + * + * Scope boundary: this unit owns ONLY control-plane resolution (env, worker version, + * env existence). The run-ops batchId friendlyId->id resolution belongs to the + * run-ops read path (the unit owning `runsRepository.server.ts`); do not duplicate it here. + */ + +export { ResolvedEnv, ResolvedWorkerVersion }; +export type { ResolvedRunLockedWorker }; + +/** Thrown by `assertEnvExists` when a referenced control-plane env does not exist. */ +export class ControlPlaneReferenceError extends Error { + constructor(message: string) { + super(message); + this.name = "ControlPlaneReferenceError"; + } +} + +export type ControlPlaneResolverOptions = { + controlPlanePrimary: PrismaClient; + controlPlaneReplica: PrismaReplicaClient; + cache: ControlPlaneCache; + splitEnabled: () => boolean; +}; + +type CpClient = PrismaClient | PrismaReplicaClient; + +function workerVersionKey( + environmentId: string, + backgroundWorkerId: string | undefined, + type: RuntimeEnvironmentType | undefined +): string { + return `${environmentId}:${backgroundWorkerId ?? "current"}:${type ?? "any"}`; +} + +function lockedWorkerKey(lockedById?: string | null, lockedToVersionId?: string | null): string { + return `${lockedById ?? "_"}:${lockedToVersionId ?? 
"_"}`; +} + +export class ControlPlaneResolver { + private readonly controlPlanePrimary: PrismaClient; + private readonly controlPlaneReplica: PrismaReplicaClient; + private readonly cache: ControlPlaneCache; + private readonly splitEnabled: () => boolean; + + constructor(opts: ControlPlaneResolverOptions) { + this.controlPlanePrimary = opts.controlPlanePrimary; + this.controlPlaneReplica = opts.controlPlaneReplica; + this.cache = opts.cache; + this.splitEnabled = opts.splitEnabled; + } + + async resolveEnv(environmentId: string): Promise { + if (!this.splitEnabled()) { + return this.#queryEnv(this.controlPlanePrimary, environmentId); + } + + const cached = this.cache.getEnv(environmentId); + if (cached !== undefined) { + return cached; + } + + const resolved = await this.#queryEnv(this.controlPlaneReplica, environmentId); + this.cache.setEnv(environmentId, resolved); + return resolved; + } + + async #queryEnv(client: CpClient, environmentId: string): Promise { + const env = await client.runtimeEnvironment.findFirst({ + where: { id: environmentId }, + select: { + id: true, + type: true, + projectId: true, + archivedAt: true, + maximumConcurrencyLimit: true, + concurrencyLimitBurstFactor: true, + project: { select: { organizationId: true } }, + parentEnvironment: { select: { type: true } }, + }, + }); + + if (!env) { + return null; + } + + return { + id: env.id, + type: env.type, + projectId: env.projectId, + organizationId: env.project.organizationId, + archivedAt: env.archivedAt, + parentEnvironmentType: env.parentEnvironment?.type ?? 
null, + maximumConcurrencyLimit: env.maximumConcurrencyLimit, + concurrencyLimitBurstFactor: env.concurrencyLimitBurstFactor, + }; + } + + async resolveAuthenticatedEnv(environmentId: string): Promise { + if (!this.splitEnabled()) { + return this.#queryAuthenticatedEnv(this.controlPlanePrimary, environmentId); + } + + const cached = this.cache.getAuthEnv(environmentId); + if (cached !== undefined) { + return cached; + } + + const resolved = await this.#queryAuthenticatedEnv(this.controlPlaneReplica, environmentId); + this.cache.setAuthEnv(environmentId, resolved); + return resolved; + } + + async #queryAuthenticatedEnv( + client: CpClient, + environmentId: string + ): Promise { + const env = await client.runtimeEnvironment.findFirst({ + where: { id: environmentId }, + include: authIncludeWithParent, + }); + + if (!env) { + return null; + } + + return toAuthenticated(env); + } + + async resolveRunLockedWorker(args: { + lockedById?: string | null; + lockedToVersionId?: string | null; + }): Promise { + const { lockedById, lockedToVersionId } = args; + + if (!this.splitEnabled()) { + return this.#queryRunLockedWorker(this.controlPlanePrimary, lockedById, lockedToVersionId); + } + + const key = lockedWorkerKey(lockedById, lockedToVersionId); + const cached = this.cache.getLockedWorker(key); + if (cached !== undefined) { + return cached; + } + + const resolved = await this.#queryRunLockedWorker( + this.controlPlaneReplica, + lockedById, + lockedToVersionId + ); + this.cache.setLockedWorker(key, resolved); + return resolved; + } + + async #queryRunLockedWorker( + client: CpClient, + lockedById?: string | null, + lockedToVersionId?: string | null + ): Promise { + const lockedByRow = lockedById + ? 
await client.backgroundWorkerTask.findFirst({ + where: { id: lockedById }, + select: { + id: true, + filePath: true, + exportName: true, + slug: true, + machineConfig: true, + worker: { + select: { + id: true, + version: true, + sdkVersion: true, + cliVersion: true, + supportsLazyAttempts: true, + deployment: { + select: { + friendlyId: true, + shortCode: true, + version: true, + runtime: true, + runtimeVersion: true, + git: true, + }, + }, + }, + }, + }, + }) + : null; + + const lockedToVersionRow = lockedToVersionId + ? await client.backgroundWorker.findFirst({ + where: { id: lockedToVersionId }, + select: { + version: true, + sdkVersion: true, + runtime: true, + runtimeVersion: true, + supportsLazyAttempts: true, + }, + }) + : null; + + return { + lockedBy: lockedByRow, + lockedToVersion: lockedToVersionRow, + }; + } + + async resolveWorkerVersion(args: { + environmentId: string; + backgroundWorkerId?: string; + /** + * When provided, the full run-engine dequeue dispatch is used (DEV resolves the most-recent + * worker; deployed resolves the promoted MANAGED deployment with the latest-v2 fallback). + * When omitted, the original app behavior applies (worker-by-id, else current promotion). 
+ */ + type?: RuntimeEnvironmentType; + }): Promise { + const { environmentId, backgroundWorkerId, type } = args; + + if (!this.splitEnabled()) { + return this.#queryWorkerVersion( + this.controlPlanePrimary, + environmentId, + backgroundWorkerId, + type + ); + } + + const key = workerVersionKey(environmentId, backgroundWorkerId, type); + const cached = this.cache.getWorkerVersion(key); + if (cached !== undefined) { + return cached; + } + + const resolved = await this.#queryWorkerVersion( + this.controlPlaneReplica, + environmentId, + backgroundWorkerId, + type + ); + this.cache.setWorkerVersion(key, resolved); + return resolved; + } + + async #queryWorkerVersion( + client: CpClient, + environmentId: string, + backgroundWorkerId?: string, + type?: RuntimeEnvironmentType + ): Promise { + // Full run-engine dequeue dispatch (mirrors dequeueSystem's four helpers) when the env type is + // known. DEVELOPMENT envs resolve by most-recent worker; deployed envs resolve the promoted + // MANAGED deployment. + if (type === "DEVELOPMENT") { + return backgroundWorkerId + ? this.#queryWorkerById(client, backgroundWorkerId) + : this.#queryMostRecentWorker(client, environmentId); + } + + if (backgroundWorkerId) { + const worker = await client.backgroundWorker.findFirst({ + where: { id: backgroundWorkerId }, + include: { deployment: true, tasks: true, queues: true }, + }); + + if (!worker) { + return null; + } + + return { + worker, + tasks: worker.tasks, + queues: worker.queues, + deployment: worker.deployment, + }; + } + + // Deployed env, no workerId: resolve the currently-promoted deployment's worker. When `type` + // is known (engine dispatch) apply the MANAGED guard + latest-v2 fallback that the run-engine + // path requires; without `type` keep the original app behavior (return the promoted worker). 
+ const promotion = await client.workerDeploymentPromotion.findFirst({ + where: { environmentId, label: CURRENT_DEPLOYMENT_LABEL }, + include: { + deployment: { + include: { worker: { include: { tasks: true, queues: true } } }, + }, + }, + }); + + if (!promotion?.deployment.worker) { + return null; + } + + if (type === undefined || promotion.deployment.type === "MANAGED") { + return { + worker: promotion.deployment.worker, + tasks: promotion.deployment.worker.tasks, + queues: promotion.deployment.worker.queues, + deployment: promotion.deployment, + }; + } + + // Engine dispatch only: the promoted deployment is not run-engine v2; fall back to the latest + // MANAGED deployment. + const latestV2Deployment = await client.workerDeployment.findFirst({ + where: { environmentId, type: "MANAGED" }, + orderBy: { id: "desc" }, + include: { worker: { include: { tasks: true, queues: true } } }, + }); + + if (!latestV2Deployment?.worker) { + return null; + } + + return { + worker: latestV2Deployment.worker, + tasks: latestV2Deployment.worker.tasks, + queues: latestV2Deployment.worker.queues, + deployment: latestV2Deployment, + }; + } + + async #queryWorkerById( + client: CpClient, + workerId: string + ): Promise { + const worker = await client.backgroundWorker.findFirst({ + where: { id: workerId }, + include: { deployment: true, tasks: true, queues: true }, + orderBy: { id: "desc" }, + }); + + if (!worker) { + return null; + } + + return { worker, tasks: worker.tasks, queues: worker.queues, deployment: worker.deployment }; + } + + async #queryMostRecentWorker( + client: CpClient, + environmentId: string + ): Promise { + const worker = await client.backgroundWorker.findFirst({ + where: { runtimeEnvironmentId: environmentId }, + include: { tasks: true, queues: true }, + orderBy: { id: "desc" }, + }); + + if (!worker) { + return null; + } + + return { worker, tasks: worker.tasks, queues: worker.queues, deployment: null }; + } + + async assertEnvExists(environmentId: string): 
Promise { + if (!this.splitEnabled()) { + const exists = await this.#queryEnvExists(this.controlPlanePrimary, environmentId); + if (!exists) { + throw new ControlPlaneReferenceError( + `Referenced environment does not exist: ${environmentId}` + ); + } + return; + } + + const cached = this.cache.getEnvExists(environmentId); + if (cached !== undefined) { + if (!cached) { + throw new ControlPlaneReferenceError( + `Referenced environment does not exist: ${environmentId}` + ); + } + return; + } + + const exists = await this.#queryEnvExists(this.controlPlaneReplica, environmentId); + this.cache.setEnvExists(environmentId, exists); + if (!exists) { + throw new ControlPlaneReferenceError( + `Referenced environment does not exist: ${environmentId}` + ); + } + } + + async #queryEnvExists(client: CpClient, environmentId: string): Promise { + const env = await client.runtimeEnvironment.findFirst({ + where: { id: environmentId }, + select: { id: true }, + }); + return env !== null; + } +} + +// Module-level singleton: wires the real control-plane clients + env split predicate. +// The control-plane writer/replica are the unchanged `prisma` / `$replica` exports. The +// split decision is a boot constant derived once from the env predicate (same one the +// run-ops topology factory uses); the async isSplitEnabled() distinct-DB sentinel is enforced +// at boot elsewhere and is never awaited on a resolver hot path. +const SPLIT_ENABLED = + env.RUN_OPS_SPLIT_ENABLED && !!env.TASK_RUN_DATABASE_URL && !!env.TASK_RUN_LEGACY_DATABASE_URL; + +export const controlPlaneResolver = new ControlPlaneResolver({ + controlPlanePrimary: prisma, + controlPlaneReplica: $replica, + // Relax the cache via config. Unset env knobs -> built-in defaults (byte-identical). + cache: new ControlPlaneCache({ + ttlMs: env.CONTROL_PLANE_CACHE_TTL_MS ?? DEFAULT_CP_CACHE_TTL_MS, + maxEntries: env.CONTROL_PLANE_CACHE_MAX_ENTRIES ?? 
DEFAULT_CP_CACHE_MAX_ENTRIES, + }), + splitEnabled: () => SPLIT_ENABLED, +}); diff --git a/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts b/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts new file mode 100644 index 00000000000..0364115eab0 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts @@ -0,0 +1,102 @@ +import { ownerEngine, UnclassifiableRunId } from "@trigger.dev/core/v3/isomorphic"; +import { isSplitEnabled } from "./splitMode.server"; +import type { + CrossSeamGuardDecision, + CrossSeamGuardInput, + RunOpsResidency, + StoreTarget, + UnblockRouteKind, +} from "./types"; + +const KNOWN_ROUTE_KINDS: ReadonlySet = new Set([ + "MANUAL", + "DATETIME", + "RESUME_TOKEN", + "IDEMPOTENCY_REUSE", + "RUN", +]); + +// There is NO default store: an unrecognised route is a loud failure. +function assertKnownRouteKind(routeKind: UnblockRouteKind): void { + if (!KNOWN_ROUTE_KINDS.has(routeKind)) { + throw new Error(`Unknown unblock routeKind: ${JSON.stringify(routeKind)}`); + } +} + +function storeForResidency(residency: RunOpsResidency): StoreTarget { + return residency === "NEW" ? "new" : "legacy"; +} + +/** + * Pin precedence (deterministic, documented order): + * 1. non-tree-owned (treeOwnerResidency === "LEGACY") + * 2. cross-tree-idempotency (isCrossTreeIdempotency === true) + * 3. legacy-parent-descendant (hasLegacyParent === true) + * Any hit overrides the store to "legacy"; the waitpoint's own residency is + * preserved on the decision so callers/metrics can see "NEW pinned to legacy". + */ +function applyPinningRules( + input: CrossSeamGuardInput +): CrossSeamGuardDecision["pinnedReason"] | undefined { + if (input.treeOwnerResidency === "LEGACY") return "non-tree-owned"; + if (input.isCrossTreeIdempotency === true) return "cross-tree-idempotency"; + if (input.hasLegacyParent === true) return "legacy-parent-descendant"; + return undefined; +} + +/** + * Pure store-selection core. 
No env import, no I/O — driven exhaustively by the + * downstream proof harness via the optional `classify` seam. + */ +export function selectStoreForWaitpoint( + input: CrossSeamGuardInput, + deps?: { classify?: (id: string) => RunOpsResidency } +): CrossSeamGuardDecision { + assertKnownRouteKind(input.routeKind); + + const classify = deps?.classify ?? ownerEngine; + + let residency: RunOpsResidency; + try { + residency = classify(input.waitpointId); + } catch (error) { + // Loud on ambiguity: rethrow with context, never catch-and-default. + if (error instanceof UnclassifiableRunId) { + throw new UnclassifiableRunId(`${input.waitpointId} (routeKind=${input.routeKind})`); + } + throw error; + } + + const pinnedReason = applyPinningRules(input); + const store: StoreTarget = pinnedReason ? "legacy" : storeForResidency(residency); + + return { + store, + residency, + routeKind: input.routeKind, + ...(pinnedReason ? { pinnedReason } : {}), + }; +} + +/** + * Pure flag-aware core. In single-DB mode "legacy" IS the single store, so we + * return it WITHOUT ever consulting the classifier (off in single-DB). When + * split is on, delegate to the pure selection core. + */ +export function computeStoreForCompletion( + input: CrossSeamGuardInput, + opts: { splitEnabled: boolean; classify?: (id: string) => RunOpsResidency } +): CrossSeamGuardDecision { + if (opts.splitEnabled === false) { + return { store: "legacy", residency: "LEGACY", routeKind: input.routeKind }; + } + return selectStoreForWaitpoint(input, { classify: opts.classify }); +} + +/** Thin server entry the waitpoint-completion consumers call. 
*/ +export async function pickRunOpsStoreForCompletion( + input: CrossSeamGuardInput +): Promise { + const splitEnabled = await isSplitEnabled(); + return computeStoreForCompletion(input, { splitEnabled }); +} diff --git a/apps/webapp/app/v3/runOpsMigration/distinctDbSentinel.server.ts b/apps/webapp/app/v3/runOpsMigration/distinctDbSentinel.server.ts new file mode 100644 index 00000000000..2c92178f82d --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/distinctDbSentinel.server.ts @@ -0,0 +1,55 @@ +import { PrismaClient } from "@trigger.dev/database"; + +type DatabaseFingerprint = { systemIdentifier: string; databaseName: string }; + +async function readDatabaseFingerprint(url: string): Promise { + const client = new PrismaClient({ datasources: { db: { url } } }); + try { + const rows = await client.$queryRawUnsafe< + Array<{ system_identifier: string; database_name: string }> + >( + "SELECT system_identifier::text AS system_identifier, current_database() AS database_name FROM pg_control_system()" + ); + const row = rows[0]; + if (!row) { + throw new Error("distinct-db sentinel: pg_control_system() returned no rows"); + } + return { systemIdentifier: row.system_identifier, databaseName: row.database_name }; + } finally { + await client.$disconnect(); + } +} + +export async function probeDistinctDatabases( + legacyUrl: string, + newUrl: string, + opts?: { logger?: { warn: (msg: string, meta?: Record) => void } } +): Promise<{ distinct: true } | { distinct: false; reason: string }> { + try { + const [legacy, next] = await Promise.all([ + readDatabaseFingerprint(legacyUrl), + readDatabaseFingerprint(newUrl), + ]); + const sameCluster = legacy.systemIdentifier === next.systemIdentifier; + const sameDb = sameCluster && legacy.databaseName === next.databaseName; + // Same-cluster-different-database policy: two databases inside the SAME cluster + // (same system identifier, different current_database()) are reported distinct: true. 
+ // That is acceptable — they are genuinely separate Postgres databases with separate + // WAL-visible state for our purposes, and the Cloud topology always uses separate + // clusters anyway. A stricter "must be a different cluster" policy would gate on + // sameCluster alone; that is flagged as an open question, not decided here. + if (sameDb) { + const reason = + "run-ops legacy and new URLs resolve to the SAME physical database " + + `(systemIdentifier=${legacy.systemIdentifier}, database=${legacy.databaseName}); ` + + "refusing to enable split — pooler/replica likely."; + opts?.logger?.warn(reason); + return { distinct: false, reason }; + } + return { distinct: true }; + } catch (error) { + const reason = `distinct-db sentinel probe failed; failing closed (single-DB). ${String(error)}`; + opts?.logger?.warn(reason, { error }); + return { distinct: false, reason }; + } +} diff --git a/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.test.ts b/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.test.ts new file mode 100644 index 00000000000..06d725a9867 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.test.ts @@ -0,0 +1,133 @@ +// Pure-core tests for the known-migrated filter. The injected `readMarker`/`probeNew` +// are PURE BOUNDARIES (the marker source and the new-DB existence predicate), not DB +// mocks — the DB-crossing proof for `probeNew` lives in readThrough.server.test.ts. 
+import { beforeEach, describe, expect, it, vi } from "vitest"; +import { containerTest } from "@internal/testcontainers"; +import { + ensureRedirectMarkerTable, + writeRedirectMarker, + isFenced, +} from "@internal/run-engine"; +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; +import { + computeKnownMigrated, + isKnownMigrated, + __resetKnownMigratedCacheForTests, +} from "./knownMigratedFilter.server"; + +describe("computeKnownMigrated", () => { + beforeEach(() => { + __resetKnownMigratedCacheForTests(); + }); + + it("(a) marker present → migrated, without probing new", async () => { + const readMarker = vi.fn(async () => true); + const probeNew = vi.fn(async () => false); + + const result = await computeKnownMigrated("run_a", { readMarker, probeNew }); + + expect(result).toBe(true); + expect(readMarker).toHaveBeenCalledTimes(1); + expect(probeNew).not.toHaveBeenCalled(); + }); + + it("(b) marker absent + new-probe hit → migrated", async () => { + const readMarker = vi.fn(async () => false); + const probeNew = vi.fn(async () => true); + + const result = await computeKnownMigrated("run_b", { readMarker, probeNew }); + + expect(result).toBe(true); + expect(readMarker).toHaveBeenCalledTimes(1); + expect(probeNew).toHaveBeenCalledTimes(1); + }); + + it("(c) marker absent + new-probe miss → NOT migrated", async () => { + const readMarker = vi.fn(async () => false); + const probeNew = vi.fn(async () => false); + + const result = await computeKnownMigrated("run_c", { readMarker, probeNew }); + + expect(result).toBe(false); + expect(readMarker).toHaveBeenCalledTimes(1); + expect(probeNew).toHaveBeenCalledTimes(1); + }); + + it("(d) a positive is memoized: second call re-invokes neither readMarker nor probeNew", async () => { + const cache = new BoundedTtlCache(60_000, 100); + const readMarker = vi.fn(async () => false); + const probeNew = vi.fn(async () => true); + + const first = await computeKnownMigrated("run_d", { + readMarker, + probeNew, + 
cache, + ttlMs: 60_000, + }); + expect(first).toBe(true); + + const second = await computeKnownMigrated("run_d", { + readMarker, + probeNew, + cache, + ttlMs: 60_000, + }); + expect(second).toBe(true); + + // The boundaries ran exactly once, on the first call only. + expect(readMarker).toHaveBeenCalledTimes(1); + expect(probeNew).toHaveBeenCalledTimes(1); + }); +}); + +describe("isKnownMigrated marker authority", () => { + beforeEach(() => { + __resetKnownMigratedCacheForTests(); + }); + + // The OLD-side redirect marker is the authority: once written, the run is "known + // migrated" WITHOUT a NEW-DB probe. `containerTest` gives a real PG to host the + // marker table; `probeNew` is forced false to prove the marker path alone decides. + containerTest( + "a written redirect marker makes a run known-migrated via isFenced (no new-probe)", + async ({ prisma }) => { + await ensureRedirectMarkerTable(prisma); + const runId = "run_marker_authority"; + + const probeNew = vi.fn(async () => false); + const readMarker = (id: string) => isFenced(prisma, id); + + // Before the marker: not fenced → not migrated → probeNew consulted (and false). + expect(await computeKnownMigrated(runId, { readMarker, probeNew })).toBe(false); + expect(probeNew).toHaveBeenCalledTimes(1); + + // Write the OLD-side marker, reset the cache, re-evaluate: now migrated by marker + // alone, and probeNew is NOT consulted again. 
+ await writeRedirectMarker(prisma, { runId, reason: "live-migration" }); + __resetKnownMigratedCacheForTests(); + probeNew.mockClear(); + + expect(await computeKnownMigrated(runId, { readMarker, probeNew })).toBe(true); + expect(probeNew).not.toHaveBeenCalled(); + } + ); + + containerTest( + "the DEFAULT readMarker consults isFenced on the legacy replica", + async ({ prisma }) => { + await ensureRedirectMarkerTable(prisma); + const runId = "run_default_marker"; + + // Inject the legacy-replica client the default adapter reads from; force probeNew + // false so only the marker can flip the result. + const probeNew = vi.fn(async () => false); + + // No `readMarker` passed → the wired default must read the marker via isFenced. + await writeRedirectMarker(prisma, { runId, reason: "live-migration" }); + expect( + await isKnownMigrated(runId, { legacyMarkerClient: prisma, probeNew }) + ).toBe(true); + expect(probeNew).not.toHaveBeenCalled(); + } + ); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.ts b/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.ts new file mode 100644 index 00000000000..4307f4197ab --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.ts @@ -0,0 +1,138 @@ +/** + * Known-migrated filter. + * + * "Known migrated" is true when a run's row has been copied to the NEW run-ops DB + * and the OLD side has been fenced. The read-through layer consults this predicate + * to AVOID re-probing the legacy read replica for runs that already live on new — + * that re-probe is exactly the read load we are shedding off the legacy DB's replica. + * + * Authority order: + * 1. Cache hit → return it. + * 2. Redirect-marker on the OLD side (`readMarker(runId)` true) → migrated. + * The marker is the authoritative "this row now lives on the new DB" signal + * written by the live-migration fencing primitive. + * 3. 
Fall back to a NEW-DB existence probe (`probeNew(runId)`) — covers + * backfilled/straggler-swept rows whose marker is gone (GC'd) or whose mere + * presence on new is the only remaining evidence. + * + * Caching policy: positives are cached aggressively (a migrated row never + * un-migrates within the retention window); negatives are NOT cached (a + * not-yet-migrated row may migrate at any moment, and re-reading legacy for it is + * still correct — the row is there until termination — so the only cost of a stale + * negative would be a brief extra probe, which we avoid by simply not caching it). + */ +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; +import { isFenced, type RedirectMarkerClient } from "@internal/run-engine"; + +type KnownMigratedDeps = { + /** Authoritative migrated-marker source: true iff the OLD side is fenced for this run. */ + readMarker?: (runId: string) => Promise; + /** Fallback NEW-DB existence probe: true iff the run already exists on the new store. */ + probeNew?: (runId: string) => Promise; + /** Bounded TTL memo for positive results. */ + cache?: BoundedTtlCache; + /** TTL (ms) used by the default module-level cache. */ + ttlMs?: number; + /** OLD/LEGACY run-ops client the default `readMarker` reads the fence from. */ + legacyMarkerClient?: RedirectMarkerClient; +}; + +/** Default positive-cache TTL: long, because a migrated row never un-migrates in the window. */ +const DEFAULT_TTL_MS = 5 * 60_000; +const DEFAULT_MAX_ENTRIES = 50_000; + +/** + * PURE testable core (no `env`/`db.server`/`process.env` import — webapp testability rule). + * Tests inject `readMarker`/`probeNew` as pure boundaries (NOT DB mocks). + */ +export async function computeKnownMigrated( + runId: string, + deps: KnownMigratedDeps +): Promise { + const cache = deps.cache; + + // We only ever store positives, so a hit is always `true`. 
+ const cached = cache?.get(runId); + if (cached !== undefined) { + return cached; + } + + // Marker present → migrated, never probe new. + if (deps.readMarker && (await deps.readMarker(runId))) { + cache?.set(runId, true); + return true; + } + + if (deps.probeNew && (await deps.probeNew(runId))) { + cache?.set(runId, true); + return true; + } + + // Not migrated. Negatives are not cached (see policy note above). + return false; +} + +let defaultCache: BoundedTtlCache | undefined; + +function getDefaultCache(ttlMs: number): BoundedTtlCache { + if (!defaultCache) { + defaultCache = new BoundedTtlCache(ttlMs, DEFAULT_MAX_ENTRIES); + } + return defaultCache; +} + +/** + * Default `readMarker` adapter. Delegates to the OLD-side fence (`isFenced`) so the + * redirect marker is the migrated authority. The legacy run-ops replica + * client is injected by the wired wrapper (`isKnownMigrated`) — the pure core never + * imports `db.server`. + */ +function makeDefaultReadMarker( + client: RedirectMarkerClient +): (runId: string) => Promise { + return (runId: string) => isFenced(client, runId); +} + +/** + * Wired wrapper. Defaults `readMarker` to the marker adapter above, `probeNew` to a + * NEW run-ops existence check, and `cache` to a module-level singleton. + * + * The `probeNew` default uses `findFirst` (NEVER `findUnique` — webapp Prisma rule) + * against the new run-ops writer handle. + */ +export async function isKnownMigrated(runId: string, deps?: KnownMigratedDeps): Promise { + const ttlMs = deps?.ttlMs ?? DEFAULT_TTL_MS; + + // Lazy default for probeNew so the db.server import stays out of the pure core and + // only resolves when the wired wrapper actually needs it. + const probeNew = + deps?.probeNew ?? 
+ (async (id: string) => { + const { runOpsNewPrisma } = await import("~/db.server"); + const row = await runOpsNewPrisma.taskRun.findFirst({ + where: { friendlyId: id }, + select: { friendlyId: true }, + }); + return row !== null; + }); + + // Resolve the OLD/LEGACY marker client (injected for tests; the legacy run-ops + // replica in production). Only needed when no explicit readMarker is provided. + let readMarker = deps?.readMarker; + if (!readMarker) { + const legacyMarkerClient = + deps?.legacyMarkerClient ?? (await import("~/db.server")).runOpsLegacyReplica; + readMarker = makeDefaultReadMarker(legacyMarkerClient); + } + + return computeKnownMigrated(runId, { + readMarker, + probeNew, + cache: deps?.cache ?? getDefaultCache(ttlMs), + ttlMs, + }); +} + +export function __resetKnownMigratedCacheForTests(): void { + defaultCache = undefined; +} diff --git a/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts new file mode 100644 index 00000000000..3f393c66075 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts @@ -0,0 +1,168 @@ +import { describe, expect, it, vi } from "vitest"; +import { batchIdForMintKind, resolveBatchMintKind } from "./mintBatchFriendlyId.server"; +import { classifyKind } from "@trigger.dev/core/v3/isomorphic"; + +describe("batchIdForMintKind (pure)", () => { + it("ksuid -> 27-char classifiable NEW batch id (no 21-char ids)", () => { + const r = batchIdForMintKind("ksuid"); + expect(r.friendlyId.startsWith("batch_")).toBe(true); + expect(r.id.length).toBe(27); + expect(classifyKind(r.id)).toBe("ksuid"); + expect(classifyKind(r.friendlyId)).toBe("ksuid"); + }); + + it("cuid -> 25-char classifiable LEGACY batch id", () => { + const r = batchIdForMintKind("cuid"); + expect(r.id.length).toBe(25); + expect(classifyKind(r.id)).toBe("cuid"); + expect(classifyKind(r.friendlyId)).toBe("cuid"); + }); + + it("never mints a 
21-char id", () => { + for (const kind of ["cuid", "ksuid"] as const) { + expect([25, 27]).toContain(batchIdForMintKind(kind).id.length); + } + }); +}); + +describe("resolveBatchMintKind", () => { + const environment = { organizationId: "org_1", id: "env_1", orgFeatureFlags: {} }; + + it("ROOT batch (no parent) resolves per-org kind via resolveRunIdMintKind", async () => { + const resolveRunIdMintKind = vi.fn().mockResolvedValue("ksuid"); + const kind = await resolveBatchMintKind({ + environment, + deps: { + resolveRunIdMintKind, + isKnownMigrated: vi.fn(), + isSplitEnabled: vi.fn(), + }, + }); + expect(kind).toBe("ksuid"); + expect(resolveRunIdMintKind).toHaveBeenCalledWith({ + organizationId: "org_1", + id: "env_1", + orgFeatureFlags: {}, + }); + }); + + it("ROOT batch on a non-cut-over org -> cuid, isKnownMigrated NOT called", async () => { + const resolveRunIdMintKind = vi.fn().mockResolvedValue("cuid"); + const isKnownMigrated = vi.fn(); + const kind = await resolveBatchMintKind({ + environment, + deps: { + resolveRunIdMintKind, + isKnownMigrated, + isSplitEnabled: vi.fn(), + }, + }); + expect(kind).toBe("cuid"); + expect(isKnownMigrated).not.toHaveBeenCalled(); + }); + + it("CHILD batch inherits a ksuid (NEW) parent by id-shape, split off, no marker read", async () => { + const parentRunFriendlyId = `run_${"a".repeat(27)}`; + const resolveRunIdMintKind = vi.fn(); + const isKnownMigrated = vi.fn(); + const isSplitEnabled = vi.fn().mockResolvedValue(false); + + const kind = await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { resolveRunIdMintKind, isKnownMigrated, isSplitEnabled }, + }); + + expect(kind).toBe("ksuid"); + expect(isKnownMigrated).not.toHaveBeenCalled(); + expect(resolveRunIdMintKind).not.toHaveBeenCalled(); + }); + + it("CHILD batch inherits a cuid (LEGACY) parent by id-shape", async () => { + const parentRunFriendlyId = `run_${"a".repeat(25)}`; + const isSplitEnabled = vi.fn().mockResolvedValue(false); + + const kind = 
await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { + resolveRunIdMintKind: vi.fn(), + isKnownMigrated: vi.fn(), + isSplitEnabled, + }, + }); + + expect(kind).toBe("cuid"); + }); + + it("CHILD batch with a legacy-by-shape parent already migrated (split on + marker) -> ksuid", async () => { + const parentRunFriendlyId = `run_${"a".repeat(25)}`; + const isSplitEnabled = vi.fn().mockResolvedValue(true); + const isKnownMigrated = vi.fn().mockResolvedValue(true); + + const kind = await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { + resolveRunIdMintKind: vi.fn(), + isKnownMigrated, + isSplitEnabled, + }, + }); + + expect(kind).toBe("ksuid"); + }); + + it("CHILD inheritance does NOT consult the marker when split is OFF (hot-path zero-IO)", async () => { + const parentRunFriendlyId = `run_${"a".repeat(25)}`; + const isSplitEnabled = vi.fn().mockResolvedValue(false); + const isKnownMigrated = vi.fn().mockResolvedValue(true); + + const kind = await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { + resolveRunIdMintKind: vi.fn(), + isKnownMigrated, + isSplitEnabled, + }, + }); + + expect(kind).toBe("cuid"); + expect(isKnownMigrated).not.toHaveBeenCalled(); + }); + + // mint-on-FLIP invariant: a child follows its parent's store even after the org flag + // flips the other way. The flag resolver must NEVER be consulted for a child. 
+ it("FLIP cuid->ksuid: a cuid (LEGACY) parent still mints a cuid child though the flag now says ksuid", async () => { + const parentRunFriendlyId = `run_${"a".repeat(25)}`; + const resolveRunIdMintKind = vi.fn().mockResolvedValue("ksuid"); // flag flipped to ksuid + const kind = await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { + resolveRunIdMintKind, + isKnownMigrated: vi.fn().mockResolvedValue(false), + isSplitEnabled: vi.fn().mockResolvedValue(true), + }, + }); + expect(kind).toBe("cuid"); + expect(resolveRunIdMintKind).not.toHaveBeenCalled(); + }); + + it("FLIP ksuid->cuid: a ksuid (NEW) parent still mints a ksuid child though the flag now says cuid", async () => { + const parentRunFriendlyId = `run_${"a".repeat(27)}`; + const resolveRunIdMintKind = vi.fn().mockResolvedValue("cuid"); // flag flipped back to cuid + const kind = await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { + resolveRunIdMintKind, + isKnownMigrated: vi.fn().mockResolvedValue(false), + isSplitEnabled: vi.fn().mockResolvedValue(true), + }, + }); + expect(kind).toBe("ksuid"); + expect(resolveRunIdMintKind).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts new file mode 100644 index 00000000000..bdbb83b51dd --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts @@ -0,0 +1,54 @@ +import { BatchId, generateKsuidId } from "@trigger.dev/core/v3/isomorphic"; +import { + resolveRunIdMintKind as defaultResolveRunIdMintKind, + type RunIdMintKind, +} from "~/v3/engineVersion.server"; +import { isKnownMigrated as defaultIsKnownMigrated } from "~/v3/runOpsMigration/knownMigratedFilter.server"; +import { isSplitEnabled as defaultIsSplitEnabled } from "~/v3/runOpsMigration/splitMode.server"; +import { resolveInheritedMintKind } from "~/v3/runOpsMigration/resolveInheritedMintKind.server"; + +type 
ResolveDeps = { + resolveRunIdMintKind: typeof defaultResolveRunIdMintKind; + isKnownMigrated: (runId: string) => Promise; + isSplitEnabled: () => Promise; +}; + +const defaultDeps: ResolveDeps = { + resolveRunIdMintKind: defaultResolveRunIdMintKind, + isKnownMigrated: defaultIsKnownMigrated, + isSplitEnabled: defaultIsSplitEnabled, +}; + +export function batchIdForMintKind(kind: RunIdMintKind): { id: string; friendlyId: string } { + if (kind === "ksuid") { + const id = generateKsuidId(); + return { id, friendlyId: BatchId.toFriendlyId(id) }; + } + return BatchId.generate(); +} + +export async function resolveBatchMintKind(args: { + environment: { organizationId: string; id: string; orgFeatureFlags?: unknown }; + parentRunFriendlyId?: string; + deps?: Partial; +}): Promise { + const deps = { ...defaultDeps, ...args.deps }; + return args.parentRunFriendlyId + ? resolveInheritedMintKind(args.parentRunFriendlyId, { + isSplitEnabled: deps.isSplitEnabled, + isKnownMigrated: deps.isKnownMigrated, + }) + : deps.resolveRunIdMintKind({ + organizationId: args.environment.organizationId, + id: args.environment.id, + orgFeatureFlags: args.environment.orgFeatureFlags, + }); +} + +export async function mintBatchFriendlyId(args: { + environment: { organizationId: string; id: string; orgFeatureFlags?: unknown }; + parentRunFriendlyId?: string; + deps?: Partial; +}): Promise<{ id: string; friendlyId: string }> { + return batchIdForMintKind(await resolveBatchMintKind(args)); +} diff --git a/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts b/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts new file mode 100644 index 00000000000..4bd292b4dfd --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts @@ -0,0 +1,216 @@ +// Real legacy-replica + new-DB proof for the read-through layer. +// We NEVER mock the DB: the reads run as real `$queryRaw` against the two containers, +// crossing the actual legacy↔new boundary the migration relies on. 
The only injected +// fakes are the pure boundaries — `isKnownMigrated`, `isPastRetention`, +// `splitEnabled` — plus throwing spies used to assert a store was NEVER touched. +import { heteroPostgresTest } from "@internal/testcontainers"; +import { describe, expect, vi } from "vitest"; +import type { PrismaReplicaClient } from "~/db.server"; +import { readThroughRun, type ReadThroughResult } from "./readThrough.server"; + +vi.setConfig({ testTimeout: 60_000 }); + +// 25-char cuid body → LEGACY residency. 27-char body → NEW residency. +const LEGACY_RUN_ID = "run_" + "a".repeat(25); +const NEW_RUN_ID = "run_" + "b".repeat(27); + +// Lightweight real read: a trivial `$queryRaw` that genuinely hits the given container. +// `hit` controls whether the read "finds" the run, so we exercise routing without +// seeding a full TaskRun (many required FKs) — the routing DoD is store-order, not shape. +async function realRead( + client: PrismaReplicaClient, + hit: boolean +): Promise<{ marker: number } | null> { + const rows = await client.$queryRaw<{ marker: number }[]>`SELECT 1 AS marker`; + return hit ? (rows[0] ?? null) : null; +} + +// A presenter-shaped mapping: both "not-found" and "past-retention" collapse to the +// same 404-ish surface, so an old run after termination yields the normal response. +function toHttpish(result: ReadThroughResult): { status: number; value?: T } { + switch (result.source) { + case "new": + case "legacy-replica": + return { status: 200, value: result.value }; + case "not-found": + case "past-retention": + return { status: 404 }; + } +} + +describe("readThroughRun (legacy replica + new DB)", () => { + heteroPostgresTest( + "Step 1: old in-retention run is served from the legacy REPLICA, never a primary", + async ({ prisma14, prisma17 }) => { + // legacy hit, new miss. The layer has NO legacy-writer handle at all — the + // read resolving through `legacyReplica` (prisma14) IS the structural guarantee + // that the primary is never touched. 
+ const result = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, false), + readLegacy: (c) => realRead(c, true), + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + isKnownMigrated: async () => false, + }, + }); + + expect(result.source).toBe("legacy-replica"); + expect(toHttpish(result).status).toBe(200); + } + ); + + heteroPostgresTest( + "Step 2: a migrated run is filtered from old-probing", + async ({ prisma14, prisma17 }) => { + const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { + throw new Error("readLegacy must never be called for a known-migrated run"); + }); + + const result = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, false), // new misses → step (b) short-circuit + readLegacy: throwingLegacy, + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + isKnownMigrated: async () => true, + }, + }); + + expect(result.source).toBe("not-found"); + expect(throwingLegacy).not.toHaveBeenCalled(); + } + ); + + heteroPostgresTest( + "Step 2b: a migrated run that the new read hits returns source=new", + async ({ prisma14, prisma17 }) => { + const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { + throw new Error("readLegacy must never be called when new hits"); + }); + + const result = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, true), + readLegacy: throwingLegacy, + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + isKnownMigrated: async () => true, + }, + }); + + expect(result.source).toBe("new"); + expect(throwingLegacy).not.toHaveBeenCalled(); + } + ); 
+ + heteroPostgresTest( + "Step 3: post-termination past-retention returns the normal not-found surface", + async ({ prisma14, prisma17 }) => { + const pastRetentionResult = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, false), + readLegacy: (c) => realRead(c, false), // legacy gone / retention elapsed + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + isKnownMigrated: async () => false, + isPastRetention: () => true, + }, + }); + + expect(pastRetentionResult.source).toBe("past-retention"); + + // A run that is simply absent (not past retention) yields not-found. + const notFoundResult = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, false), + readLegacy: (c) => realRead(c, false), + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + isKnownMigrated: async () => false, + isPastRetention: () => false, + }, + }); + + expect(notFoundResult.source).toBe("not-found"); + // Both collapse to the same 404-ish surface. 
+ expect(toHttpish(pastRetentionResult).status).toBe(toHttpish(notFoundResult).status); + expect(toHttpish(pastRetentionResult).status).toBe(404); + } + ); + + heteroPostgresTest( + "Step 4: single-DB passthrough — only readNew runs, legacy + filter never touched", + async ({ prisma14, prisma17 }) => { + const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { + throw new Error("readLegacy must never run in single-DB mode"); + }); + const throwingFilter = vi.fn(async (): Promise => { + throw new Error("isKnownMigrated must never run in single-DB mode"); + }); + const newRead = vi.fn((c: PrismaReplicaClient) => realRead(c, true)); + + const result = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: newRead, + readLegacy: throwingLegacy, + deps: { + splitEnabled: false, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + isKnownMigrated: throwingFilter, + }, + }); + + expect(result.source).toBe("new"); + expect(newRead).toHaveBeenCalledTimes(1); + expect(throwingLegacy).not.toHaveBeenCalled(); + expect(throwingFilter).not.toHaveBeenCalled(); + } + ); + + heteroPostgresTest( + "Step 5: new-residency fast-path — legacy replica is never touched", + async ({ prisma14, prisma17 }) => { + const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { + throw new Error("readLegacy must never run for a NEW-residency id"); + }); + const throwingFilter = vi.fn(async (): Promise => { + throw new Error("isKnownMigrated must never run for a NEW-residency id"); + }); + + const result = await readThroughRun({ + runId: NEW_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, true), + readLegacy: throwingLegacy, + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + isKnownMigrated: throwingFilter, + }, + }); + + 
expect(result.source).toBe("new"); + expect(throwingLegacy).not.toHaveBeenCalled(); + expect(throwingFilter).not.toHaveBeenCalled(); + } + ); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts b/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts new file mode 100644 index 00000000000..a4be2fdd1e5 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts @@ -0,0 +1,114 @@ +/** + * Read-through reads the LEGACY RUN-OPS READ REPLICA ONLY — never the legacy primary + * (which carries the read load we are shedding). Disabled entirely when isSplitEnabled() + * is false (single-DB passthrough). + * + * During the retention window, old not-yet-migrated run-ops rows are served off + * the legacy read replica. A known-migrated filter short-circuits re-probing legacy for + * runs already living on the new DB. After termination, past-retention runs return the + * normal not-found response. Patterned on `mollifier/resolveRunForMutation.server.ts` + * (`?? default` DI), but with the legacy-primary/writer fallback deliberately removed: + * this layer has NO legacy-writer handle at all (structural guarantee). 
+ */ +import type { PrismaReplicaClient } from "~/db.server"; +import { + runOpsLegacyReplica as defaultLegacyReplica, + runOpsNewReplica as defaultNewClient, +} from "~/db.server"; +import { logger as defaultLogger } from "~/services/logger.server"; +import { ownerEngine, UnclassifiableRunId } from "@trigger.dev/core/v3/isomorphic"; +import { isKnownMigrated as defaultIsKnownMigrated } from "./knownMigratedFilter.server"; +import { isSplitEnabled } from "./splitMode.server"; + +export type ReadThroughSource = "new" | "legacy-replica"; + +export type ReadThroughResult = + | { source: ReadThroughSource; value: T } + | { source: "not-found" } + | { source: "past-retention" }; + +export type ReadThroughDeps = { + newClient?: PrismaReplicaClient; + legacyReplica?: PrismaReplicaClient; + /** Resolved boot constant; never `await`ed per-request when supplied. */ + splitEnabled?: boolean; + isKnownMigrated?: (runId: string) => Promise; + isPastRetention?: (runId: string) => boolean; + logger?: { warn: (m: string, meta?: unknown) => void }; + /** Saturation-signal emit hook: called on each legacy-replica hit. */ + onLegacyReplicaRead?: (runId: string) => void; +}; + +type ReadThroughRunInput = { + runId: string; + environmentId: string; + readNew: (client: PrismaReplicaClient) => Promise; + readLegacy: (replica: PrismaReplicaClient) => Promise; + deps?: ReadThroughDeps; +}; + +export async function readThroughRun( + input: ReadThroughRunInput +): Promise> { + const { runId, deps } = input; + const newClient = deps?.newClient ?? defaultNewClient; + const legacyReplica = deps?.legacyReplica ?? defaultLegacyReplica; + const logger = deps?.logger ?? defaultLogger; + + const splitEnabled = deps?.splitEnabled ?? (await isSplitEnabled()); + + // Passthrough: single plain read against the one collapsed store. No legacy read, + // no marker check, no isKnownMigrated, no second connection. + if (!splitEnabled) { + const v = await input.readNew(newClient); + return v != null ? 
{ source: "new", value: v } : { source: "not-found" }; + } + + // Split is on. Classify residency; an unclassifiable id is treated as LEGACY + // (conservative — probe rather than drop a real run). + let residency: "LEGACY" | "NEW"; + try { + residency = ownerEngine(runId); + } catch (e) { + if (e instanceof UnclassifiableRunId) { + logger.warn("readThroughRun: UnclassifiableRunId, treating as LEGACY", { + runId, + valueLength: e.valueLength, + }); + residency = "LEGACY"; + } else { + throw e; + } + } + + // A ksuid id can only live on the new DB — skip the legacy replica entirely. + if (residency === "NEW") { + const v = await input.readNew(newClient); + return v != null ? { source: "new", value: v } : { source: "not-found" }; + } + + // LEGACY (or unclassifiable→LEGACY) fan-out: new first. + const v = await input.readNew(newClient); + if (v != null) { + return { source: "new", value: v }; + } + + // Known-migrated short-circuit: the row is on new but the new read missed it + // (lag / select shape). Do NOT re-probe legacy. + const isMigrated = deps?.isKnownMigrated ?? defaultIsKnownMigrated; + if (await isMigrated(runId)) { + return { source: "not-found" }; + } + + // Legacy READ REPLICA only — never a legacy writer/primary (no such handle exists). 
+ const lv = await input.readLegacy(legacyReplica); + if (lv != null) { + deps?.onLegacyReplicaRead?.(runId); + return { source: "legacy-replica", value: lv }; + } + + if (deps?.isPastRetention?.(runId)) { + return { source: "past-retention" }; + } + return { source: "not-found" }; +} diff --git a/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts new file mode 100644 index 00000000000..ce4ae5a1d4a --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it, vi } from "vitest"; +import { resolveInheritedMintKind } from "./resolveInheritedMintKind.server"; + +const NEW_PARENT = `run_${"a".repeat(27)}`; // ksuid id-shape -> NEW +const LEGACY_PARENT = `run_${"b".repeat(25)}`; // cuid id-shape -> LEGACY + +describe("resolveInheritedMintKind (pure, shared across all mint paths)", () => { + it("inherits a ksuid (NEW) parent by id-shape, split off, marker never read", async () => { + const isKnownMigrated = vi.fn(); + const kind = await resolveInheritedMintKind(NEW_PARENT, { + isSplitEnabled: async () => false, + isKnownMigrated, + }); + expect(kind).toBe("ksuid"); + expect(isKnownMigrated).not.toHaveBeenCalled(); + }); + + it("inherits a cuid (LEGACY) parent by id-shape, split off, marker never read", async () => { + const isKnownMigrated = vi.fn(); + const kind = await resolveInheritedMintKind(LEGACY_PARENT, { + isSplitEnabled: async () => false, + isKnownMigrated, + }); + expect(kind).toBe("cuid"); + expect(isKnownMigrated).not.toHaveBeenCalled(); + }); + + // The gap this helper closes: split OFF = one physical DB, and a probeNew-backed + // isKnownMigrated returns true for any extant parent. The guard must skip the marker + // when split is off so a cuid parent keeps minting cuid children (byte-identical to today). 
+ it("does NOT consult the marker when split is OFF (hot-path zero-IO; byte-identical to today)", async () => { + const isKnownMigrated = vi.fn().mockResolvedValue(true); + const kind = await resolveInheritedMintKind(LEGACY_PARENT, { + isSplitEnabled: async () => false, + isKnownMigrated, + }); + expect(kind).toBe("cuid"); + expect(isKnownMigrated).not.toHaveBeenCalled(); + }); + + it("split ON + legacy-by-shape parent already migrated (marker true) -> ksuid (co-resident on NEW)", async () => { + const kind = await resolveInheritedMintKind(LEGACY_PARENT, { + isSplitEnabled: async () => true, + isKnownMigrated: async () => true, + }); + expect(kind).toBe("ksuid"); + }); + + it("split ON + legacy-by-shape parent NOT migrated (marker false) -> cuid (stays LEGACY)", async () => { + const kind = await resolveInheritedMintKind(LEGACY_PARENT, { + isSplitEnabled: async () => true, + isKnownMigrated: async () => false, + }); + expect(kind).toBe("cuid"); + }); + + it("split ON + ksuid parent -> ksuid regardless of marker (already NEW)", async () => { + const kind = await resolveInheritedMintKind(NEW_PARENT, { + isSplitEnabled: async () => true, + isKnownMigrated: async () => false, + }); + expect(kind).toBe("ksuid"); + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts new file mode 100644 index 00000000000..79e41b41dff --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts @@ -0,0 +1,21 @@ +import { ownerEngine } from "@trigger.dev/core/v3/isomorphic"; +import type { RunIdMintKind } from "./runOpsMintKind.server"; + +type InheritedMintKindDeps = { + isSplitEnabled: () => Promise; + isKnownMigrated: (runId: string) => Promise; +}; + +// Mint a child in the SAME physical store as its anchor (parent run / owning batch), +// regardless of the org's current mint flag — keeps a subgraph co-resident across a +// flip. 
Marker-aware inheritance only matters with split on; split off is a pure +// id-shape check (zero hot-path I/O, byte-identical to today). +export async function resolveInheritedMintKind( + parentRunFriendlyId: string, + deps: InheritedMintKindDeps +): Promise { + if ((await deps.isSplitEnabled()) && (await deps.isKnownMigrated(parentRunFriendlyId))) { + return "ksuid"; + } + return ownerEngine(parentRunFriendlyId) === "NEW" ? "ksuid" : "cuid"; +} diff --git a/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts new file mode 100644 index 00000000000..b958d6d24f7 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts @@ -0,0 +1,97 @@ +import type { + ControlPlaneResolver as EngineControlPlaneResolver, + ResolvedAuthenticatedEnv, + ResolvedEngineEnv, + ResolvedWorkerVersion, +} from "@internal/run-engine"; +import type { RuntimeEnvironmentType } from "@trigger.dev/database"; +import { $replica } from "~/db.server"; +import { + authIncludeWithParent, + toAuthenticated, +} from "~/models/runtimeEnvironment.server"; +import { + ControlPlaneResolver as AppControlPlaneResolver, + controlPlaneResolver, +} from "./controlPlaneResolver.server"; + +/** + * Adapter that presents the webapp's cross-DB cached ControlPlaneResolver as the + * run-engine `ControlPlaneResolver` seam. Injected in `runEngine.server.ts`, it replaces the + * default `PassthroughControlPlaneResolver` so the engine's dequeue/waitpoint/checkpoint/delayTTL + * reads resolve the control-plane half cache-first instead of via an in-DB join. + * + * `resolveEnv` maps the app `ResolvedEnv` (widened to carry the concurrency + nested ids the engine + * needs) onto `ResolvedEngineEnv`. `resolveWorkerVersion` forwards the env `type` so the app + * resolver runs the full run-engine dequeue dispatch (DEV most-recent / MANAGED promotion). 
+ */ +export class RunEngineControlPlaneResolver implements EngineControlPlaneResolver { + readonly #resolver: AppControlPlaneResolver; + + constructor(resolver: AppControlPlaneResolver) { + this.#resolver = resolver; + } + + async resolveEnv(environmentId: string): Promise { + const env = await this.#resolver.resolveEnv(environmentId); + + if (!env) { + return null; + } + + return { + id: env.id, + type: env.type, + archivedAt: env.archivedAt, + maximumConcurrencyLimit: env.maximumConcurrencyLimit, + concurrencyLimitBurstFactor: env.concurrencyLimitBurstFactor, + projectId: env.projectId, + organizationId: env.organizationId, + project: { id: env.projectId }, + organization: { id: env.organizationId }, + }; + } + + async resolveWorkerVersion(args: { + environmentId: string; + type: RuntimeEnvironmentType; + workerId?: string; + }): Promise { + return this.#resolver.resolveWorkerVersion({ + environmentId: args.environmentId, + backgroundWorkerId: args.workerId, + type: args.type, + }); + } + + async resolveAuthenticatedEnv( + environmentId: string + ): Promise { + // Mirror findEnvironmentById's data source ($replica) and auth shape, but the + // engine needs `git` too. A single findFirst with `include: authIncludeWithParent` + // returns all RuntimeEnvironment scalars (including `git`) on the row, so we map + // the auth shape via toAuthenticated() and add `git` from the same row. + const environment = await $replica.runtimeEnvironment.findFirst({ + where: { + id: environmentId, + }, + include: authIncludeWithParent, + }); + + if (!environment || environment.project.deletedAt !== null) { + return null; + } + + return { + ...toAuthenticated(environment), + git: environment.git, + }; + } + + async assertEnvExists(environmentId: string): Promise { + await this.#resolver.assertEnvExists(environmentId); + } +} + +// Module-level singleton over the app resolver singleton. 
+export const runEngineControlPlaneResolver = new RunEngineControlPlaneResolver(controlPlaneResolver); diff --git a/apps/webapp/app/v3/runOpsMigration/runOpsCascadeCleanup.server.ts b/apps/webapp/app/v3/runOpsMigration/runOpsCascadeCleanup.server.ts new file mode 100644 index 00000000000..2392d516180 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runOpsCascadeCleanup.server.ts @@ -0,0 +1,275 @@ +import { type PrismaClient } from "@trigger.dev/database"; +import { type RunOpsPrismaClient } from "@internal/run-ops-database"; +import { runOpsLegacyPrisma, runOpsNewPrismaClient } from "~/db.server"; + +/** + * Structural client covering exactly the run-subgraph delegates + WHERE filters the cascade uses on + * a run-ops writer. Both `@trigger.dev/database`'s `PrismaClient` (full schema, legacy writer) and + * `@internal/run-ops-database`'s `RunOpsPrismaClient` (dedicated SUBSET schema, new writer) are + * assignable to it — the two concrete clients are NOT mutually assignable (the subset adds FK-free + * join models the full schema lacks), so a shared structural type is the only common ground. + * + * Crucially it does NOT expose control-plane-resident models (e.g. `bulkActionItem`) nor scalarized + * relations that don't exist on the subset (e.g. `TaskRunWaitpoint.taskRun`), so the compiler now + * rejects the two bugs an `as unknown as PrismaClient` cast would otherwise mask. 
+ */ +type CountResult = { count: number }; +type RunSubgraphCleanupClient = { + taskRun: { + findMany(args: { + where: { runtimeEnvironmentId: string }; + select: { id: true }; + }): Promise>; + deleteMany(args: { + where: { runtimeEnvironmentId: string } | { projectId: string }; + }): Promise; + }; + taskRunAttempt: { + deleteMany(args: { + where: { runtimeEnvironmentId: string } | { taskRun: { projectId: string } }; + }): Promise; + }; + taskRunWaitpoint: { + deleteMany(args: { + where: { taskRunId: { in: string[] } } | { projectId: string }; + }): Promise; + }; + taskRunCheckpoint: { + deleteMany(args: { + where: { runtimeEnvironmentId: string } | { projectId: string }; + }): Promise; + }; + checkpoint: { + deleteMany(args: { + where: { runtimeEnvironmentId: string } | { projectId: string }; + }): Promise; + }; + checkpointRestoreEvent: { + deleteMany(args: { + where: { runtimeEnvironmentId: string } | { projectId: string }; + }): Promise; + }; + waitpoint: { + deleteMany(args: { + where: { environmentId: string } | { projectId: string }; + }): Promise; + }; + batchTaskRun: { + deleteMany(args: { + where: { runtimeEnvironmentId: string } | { runs: { some: { projectId: string } } }; + }): Promise; + }; +}; + +// Compile-time assertion that both concrete writers satisfy the structural shape. +const _newWriterAssignable: RunSubgraphCleanupClient = undefined as unknown as RunOpsPrismaClient; +const _legacyWriterAssignable: RunSubgraphCleanupClient = undefined as unknown as PrismaClient; +void _newWriterAssignable; +void _legacyWriterAssignable; + +/** + * RunOpsCascadeCleanupService — application-level env/project-delete cascade-cleanup that replaces + * the cloud-only dropped cross-seam `onDelete: Cascade` FKs crossing run-ops -> control-plane. + * + * Deletes route through the dedicated run-ops write clients (`runOpsNewPrismaClient` + + * `runOpsLegacyPrisma`), NOT the control-plane `prisma`. 
The ordered delete pass runs against BOTH + * writers: a migrating env/project's run-ops rows split across the new (KSUID) and + * legacy (cuid) DBs per the per-env cutover + roll-new-forward rollback, and the + * cloud DB that lost its physical FK has no cascade to clean the other writer's miss. In single-DB + * both handles are reference-equal to the one collapsed client, so de-dup-by-reference runs the + * pass once; the FK cascade also fires there, making these deletes idempotent no-ops. + * + * The NEW run-ops writer is a dedicated `RunOpsPrismaClient` over the run-subgraph SUBSET schema: + * it does NOT carry control-plane-resident models. `BulkActionItem` is one such control-plane model + * (it lives in `@trigger.dev/database` but NOT in the run-ops subset), so cleaning it on the NEW + * writer would dereference an `undefined` delegate at runtime. Its cleanup therefore runs ONLY + * against the control-plane writer; the run-subgraph deletes (which DO exist on both schemas) run + * per run-ops writer. Typing the run-ops writers as `RunOpsPrismaClient` makes the compiler reject + * any future control-plane-only model access on the NEW writer, so this class of bug can't recur. + * + * Deliberately NOT gated behind `isSplitEnabled()` (cloud relies on it; self-host treats it as + * idempotent insurance). Every delete is `deleteMany`, so a zero-row scope is a no-op and rows a + * concurrent FK cascade already removed return `count: 0`. Deletes are not wrapped in one + * `$transaction` (no cross-DB txn is possible, and a single huge txn risks long locks); a crash + * mid-cleanup is recovered by re-running. + */ + +/** Per-table deleted row counts, summed across the distinct run-ops writers actually run. */ +type CascadeCleanupResult = Record; + +type CleanupServiceDeps = { + /** + * Run-ops write clients to run the run-subgraph delete pass against. Defaults to the two + * run-ops writers — NOT the control-plane `prisma`. 
Typed as the structural + * `RunSubgraphCleanupClient` so the compiler rejects control-plane-only model access (e.g. + * `bulkActionItem`) and subset-absent relations. De-duped by reference so the single-DB + * reference-equal collapse runs the pass once. + */ + runOpsWriters?: RunSubgraphCleanupClient[]; + /** + * Control-plane writer for control-plane-resident models the run-subgraph cascade must also clean + * (currently only `BulkActionItem`, which has no env/project column and is NOT in the run-ops + * subset schema). Runs exactly once. Defaults to the legacy run-ops writer, which IS the + * control-plane client. + */ + controlPlaneWriter?: PrismaClient; +}; + +export class RunOpsCascadeCleanupService { + #writers: RunSubgraphCleanupClient[]; + #controlPlaneWriter: PrismaClient; + + constructor(deps: CleanupServiceDeps = {}) { + const writers = deps.runOpsWriters ?? [runOpsNewPrismaClient, runOpsLegacyPrisma]; + this.#writers = Array.from(new Set(writers)); + this.#controlPlaneWriter = deps.controlPlaneWriter ?? runOpsLegacyPrisma; + } + + /** Delete all run-ops rows scoped to one environment, across every distinct run-ops writer. */ + public async cleanupEnvironment(runtimeEnvironmentId: string): Promise { + const result: CascadeCleanupResult = {}; + await this.#cleanupBulkActionItemsForEnvironment(runtimeEnvironmentId, result); + for (const writer of this.#writers) { + await this.#cleanupEnvironmentOnWriter(writer, runtimeEnvironmentId, result); + } + return result; + } + + /** Delete all run-ops rows scoped to one project, across every distinct run-ops writer. 
*/ + public async cleanupProject(projectId: string): Promise { + const result: CascadeCleanupResult = {}; + await this.#cleanupBulkActionItemsForProject(projectId, result); + for (const writer of this.#writers) { + await this.#cleanupProjectOnWriter(writer, projectId, result); + } + return result; + } + + // BulkActionItem is control-plane-resident (it exists in @trigger.dev/database, NOT in the + // run-ops subset schema), so it is cleaned only on the control-plane writer. It has no env column; + // clean via both run relations (destination may differ). + async #cleanupBulkActionItemsForEnvironment( + runtimeEnvironmentId: string, + result: CascadeCleanupResult + ): Promise { + await this.#accumulate(result, "bulkActionItem", async () => { + const a = await this.#controlPlaneWriter.bulkActionItem.deleteMany({ + where: { sourceRun: { runtimeEnvironmentId } }, + }); + const b = await this.#controlPlaneWriter.bulkActionItem.deleteMany({ + where: { destinationRun: { runtimeEnvironmentId } }, + }); + return a.count + b.count; + }); + } + + // BulkActionItem has no projectId column; clean via both run relations. + async #cleanupBulkActionItemsForProject( + projectId: string, + result: CascadeCleanupResult + ): Promise { + await this.#accumulate(result, "bulkActionItem", async () => { + const a = await this.#controlPlaneWriter.bulkActionItem.deleteMany({ + where: { sourceRun: { projectId } }, + }); + const b = await this.#controlPlaneWriter.bulkActionItem.deleteMany({ + where: { destinationRun: { projectId } }, + }); + return a.count + b.count; + }); + } + + // Child-before-parent ordering: an FK-retained DB never errors on an out-of-order delete, and an + // FK-dropped DB leaves no orphans. TaskRun self-relations and TaskRun.batchId are SetNull, so a + // single deleteMany of all scoped TaskRuns is order-safe within the table; Waitpoint's run/batch + // links are SetNull (nullable) so its position is for tidiness only. 
+ async #cleanupEnvironmentOnWriter( + writer: RunSubgraphCleanupClient, + runtimeEnvironmentId: string, + result: CascadeCleanupResult + ): Promise { + await this.#accumulate(result, "checkpointRestoreEvent", () => + writer.checkpointRestoreEvent + .deleteMany({ where: { runtimeEnvironmentId } }) + .then((r) => r.count) + ); + await this.#accumulate(result, "checkpoint", () => + writer.checkpoint.deleteMany({ where: { runtimeEnvironmentId } }).then((r) => r.count) + ); + await this.#accumulate(result, "taskRunCheckpoint", () => + writer.taskRunCheckpoint.deleteMany({ where: { runtimeEnvironmentId } }).then((r) => r.count) + ); + // TaskRunWaitpoint has neither an env column nor (on the subset schema) a `taskRun` relation to + // filter through, so resolve the scoped run ids first and delete by the scalar `taskRunId`. + await this.#accumulate(result, "taskRunWaitpoint", async () => { + const runs = await writer.taskRun.findMany({ + where: { runtimeEnvironmentId }, + select: { id: true }, + }); + if (runs.length === 0) return 0; + const r = await writer.taskRunWaitpoint.deleteMany({ + where: { taskRunId: { in: runs.map((run) => run.id) } }, + }); + return r.count; + }); + // Waitpoint's env column is `environmentId`, NOT `runtimeEnvironmentId`. 
+ await this.#accumulate(result, "waitpoint", () => + writer.waitpoint + .deleteMany({ where: { environmentId: runtimeEnvironmentId } }) + .then((r) => r.count) + ); + await this.#accumulate(result, "taskRunAttempt", () => + writer.taskRunAttempt.deleteMany({ where: { runtimeEnvironmentId } }).then((r) => r.count) + ); + await this.#accumulate(result, "batchTaskRun", () => + writer.batchTaskRun.deleteMany({ where: { runtimeEnvironmentId } }).then((r) => r.count) + ); + await this.#accumulate(result, "taskRun", () => + writer.taskRun.deleteMany({ where: { runtimeEnvironmentId } }).then((r) => r.count) + ); + } + + async #cleanupProjectOnWriter( + writer: RunSubgraphCleanupClient, + projectId: string, + result: CascadeCleanupResult + ): Promise { + await this.#accumulate(result, "checkpointRestoreEvent", () => + writer.checkpointRestoreEvent.deleteMany({ where: { projectId } }).then((r) => r.count) + ); + await this.#accumulate(result, "checkpoint", () => + writer.checkpoint.deleteMany({ where: { projectId } }).then((r) => r.count) + ); + await this.#accumulate(result, "taskRunCheckpoint", () => + writer.taskRunCheckpoint.deleteMany({ where: { projectId } }).then((r) => r.count) + ); + await this.#accumulate(result, "taskRunWaitpoint", () => + writer.taskRunWaitpoint.deleteMany({ where: { projectId } }).then((r) => r.count) + ); + await this.#accumulate(result, "waitpoint", () => + writer.waitpoint.deleteMany({ where: { projectId } }).then((r) => r.count) + ); + // TaskRunAttempt has no projectId column; clean via its TaskRun relation. + await this.#accumulate(result, "taskRunAttempt", () => + writer.taskRunAttempt.deleteMany({ where: { taskRun: { projectId } } }).then((r) => r.count) + ); + // BatchTaskRun has no projectId column; clean via its TaskRun (`runs`) members. 
+ await this.#accumulate(result, "batchTaskRun", () => + writer.batchTaskRun + .deleteMany({ where: { runs: { some: { projectId } } } }) + .then((r) => r.count) + ); + await this.#accumulate(result, "taskRun", () => + writer.taskRun.deleteMany({ where: { projectId } }).then((r) => r.count) + ); + } + + async #accumulate( + result: CascadeCleanupResult, + table: string, + run: () => Promise + ): Promise { + const count = await run(); + result[table] = (result[table] ?? 0) + count; + } +} diff --git a/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.flipLatency.test.ts b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.flipLatency.test.ts new file mode 100644 index 00000000000..fc346dad897 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.flipLatency.test.ts @@ -0,0 +1,75 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; +import { computeRunIdMintKind, type RunIdMintKind } from "./runOpsMintKind.server"; + +// LOCK of the CURRENT (intentional) flip-latency behavior, NOT a change request. +// resolveRunIdMintKind caches the per-org mint kind in a process-singleton +// BoundedTtlCache (TTL RUN_OPS_MINT_FLAG_CACHE_TTL_MS, 30000ms default) with get/set +// and NO invalidation hook (runOpsMintKind.server.ts:38-45,56-81). So after a flag +// flip a process keeps minting the stale kind until its cached entry expires; in +// multi-instance prod each process expires independently. This suite reconstructs the +// same flag fn over a real cache and pins both edges of that window. + +// Mirror of resolveRunIdMintKind's flag fn (runOpsMintKind.server.ts:56-81). 
+function makeCachedFlag( + cache: BoundedTtlCache, + liveFlag: () => RunIdMintKind +): (orgId: string) => Promise { + return async (orgId: string) => { + const cached = cache.get(orgId); + if (cached !== undefined) return cached; + const kind = liveFlag(); + cache.set(orgId, kind); + return kind; + }; +} + +const TTL_MS = 30_000; +const env = { organizationId: "org_flip", id: "env_flip" }; + +describe("computeRunIdMintKind flip latency (mintCache TTL window — current behavior LOCK)", () => { + beforeEach(() => vi.useFakeTimers()); + afterEach(() => vi.useRealTimers()); + + it("returns the STALE cached kind within the TTL after the flag flips cuid->ksuid", async () => { + const cache = new BoundedTtlCache(TTL_MS, 100); + let live: RunIdMintKind = "cuid"; + const flag = makeCachedFlag(cache, () => live); + const deps = { masterEnabled: true, splitEnabled: async () => true, flag }; + + expect(await computeRunIdMintKind(env, deps)).toBe("cuid"); // populates the cache + + live = "ksuid"; // admin flips the org flag + vi.advanceTimersByTime(TTL_MS - 1); // still inside the window + expect(await computeRunIdMintKind(env, deps)).toBe("cuid"); // STALE, as designed + }); + + it("returns the FRESH kind once the TTL expires after a cuid->ksuid flip", async () => { + const cache = new BoundedTtlCache(TTL_MS, 100); + let live: RunIdMintKind = "cuid"; + const flag = makeCachedFlag(cache, () => live); + const deps = { masterEnabled: true, splitEnabled: async () => true, flag }; + + expect(await computeRunIdMintKind(env, deps)).toBe("cuid"); + + live = "ksuid"; + vi.advanceTimersByTime(TTL_MS + 1); // past expiry -> entry evicted on read + expect(await computeRunIdMintKind(env, deps)).toBe("ksuid"); // re-reads the live flag + }); + + it("symmetric flip-back ksuid->cuid is also stale within TTL, fresh after", async () => { + const cache = new BoundedTtlCache(TTL_MS, 100); + let live: RunIdMintKind = "ksuid"; + const flag = makeCachedFlag(cache, () => live); + const deps = { 
masterEnabled: true, splitEnabled: async () => true, flag }; + + expect(await computeRunIdMintKind(env, deps)).toBe("ksuid"); + + live = "cuid"; + vi.advanceTimersByTime(TTL_MS - 1); + expect(await computeRunIdMintKind(env, deps)).toBe("ksuid"); // STALE + + vi.advanceTimersByTime(2); // now past expiry + expect(await computeRunIdMintKind(env, deps)).toBe("cuid"); // FRESH + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.test.ts b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.test.ts new file mode 100644 index 00000000000..9d2e575fef8 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.test.ts @@ -0,0 +1,61 @@ +import { describe, expect, it, vi } from "vitest"; +import { computeRunIdMintKind } from "./runOpsMintKind.server"; + +describe("computeRunIdMintKind (pure)", () => { + it("mints cuid when the master switch is off (never reads the flag)", async () => { + const flag = vi.fn(); + const kind = await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1" }, + { masterEnabled: false, splitEnabled: async () => true, flag } + ); + expect(kind).toBe("cuid"); + expect(flag).not.toHaveBeenCalled(); + }); + + it("mints cuid when split is OFF, even if master + per-org flag say ksuid", async () => { + const flag = vi.fn().mockResolvedValue("ksuid"); + const kind = await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1" }, + { masterEnabled: true, splitEnabled: async () => false, flag } + ); + expect(kind).toBe("cuid"); // the split-enabled gate dominates + expect(flag).not.toHaveBeenCalled(); // split-off short-circuits before any flag read + }); + + it("mints ksuid only when master on AND split on AND per-org flag = ksuid", async () => { + const flag = vi.fn().mockResolvedValue("ksuid"); + const kind = await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1" }, + { masterEnabled: true, splitEnabled: async () => true, flag } + ); + expect(kind).toBe("ksuid"); + }); + + 
it("passes the already-loaded org feature flags through to the flag fn (no extra DB read)", async () => { + const flag = vi.fn().mockResolvedValue("ksuid"); + const orgFeatureFlags = { runOpsMintKsuid: "ksuid" }; + await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1", orgFeatureFlags }, + { masterEnabled: true, splitEnabled: async () => true, flag } + ); + expect(flag).toHaveBeenCalledWith("org_1", orgFeatureFlags); + }); + + it("mints cuid for a non-canary org (per-org flag defaults to cuid)", async () => { + const flag = vi.fn().mockResolvedValue("cuid"); + const kind = await computeRunIdMintKind( + { organizationId: "org_2", id: "env_2" }, + { masterEnabled: true, splitEnabled: async () => true, flag } + ); + expect(kind).toBe("cuid"); + }); + + it("fails safe to cuid when the flag read throws", async () => { + const flag = vi.fn().mockRejectedValue(new Error("db down")); + const kind = await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1" }, + { masterEnabled: true, splitEnabled: async () => true, flag } + ); + expect(kind).toBe("cuid"); // never arm a mint on a flag-read failure + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.ts b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.ts new file mode 100644 index 00000000000..c3751c993ce --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.ts @@ -0,0 +1,84 @@ +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; +import { singleton } from "~/utils/singleton"; +import { FEATURE_FLAG } from "~/v3/featureFlags"; +import { makeFlag } from "~/v3/featureFlags.server"; +import { isSplitEnabled } from "./splitMode.server"; + +export type RunIdMintKind = "cuid" | "ksuid"; + +type MintKindDeps = { + masterEnabled: boolean; + splitEnabled: () => Promise; + // Receives the orgId + the 
(optional) already-loaded org feature flags. When + // orgFeatureFlags is provided, the implementation must NOT read the DB for them. + flag: (orgId: string, orgFeatureFlags: unknown | undefined) => Promise; +}; + +// PURE CORE — no env import; tests drive this directly. Gate order is load-bearing: +// master switch → split gate → per-org flag, short-circuiting at the first OFF. +export async function computeRunIdMintKind( + environment: { organizationId: string; id: string; orgFeatureFlags?: unknown }, + deps: MintKindDeps +): Promise { + if (!deps.masterEnabled) return "cuid"; + if (!(await deps.splitEnabled())) return "cuid"; + try { + return await deps.flag(environment.organizationId, environment.orgFeatureFlags); + } catch (error) { + logger.error("[runOpsMintKind] flag read failed; minting cuid (fail-safe)", { error }); + return "cuid"; + } +} + +// ENV-BOUND wrapper — the only place env/$replica/isSplitEnabled are read. +const flagFn = singleton("runOpsMintFlag", () => makeFlag($replica)); +const mintCache = singleton( + "runOpsMintCache", + () => + new BoundedTtlCache( + env.RUN_OPS_MINT_FLAG_CACHE_TTL_MS, + env.RUN_OPS_MINT_FLAG_CACHE_MAX_ENTRIES + ) +); + +export async function resolveRunIdMintKind(environment: { + organizationId: string; + id: string; + // Pass environment.organization.featureFlags from the trigger call site. + orgFeatureFlags?: unknown; +}): Promise { + return computeRunIdMintKind(environment, { + masterEnabled: env.RUN_OPS_MINT_KSUID_ENABLED, + splitEnabled: isSplitEnabled, + flag: async (orgId, orgFeatureFlags) => { + // The cache stores only "cuid"|"ksuid" (never undefined), so the cache's + // "stored-undefined == miss" caveat never applies here. + const cached = mintCache.get(orgId); + if (cached !== undefined) return cached; + + // Hot-path pass-through: use the org flags the authenticated environment already + // carries; only fall back to a DB read when the caller did NOT pass them (non-trigger + // callers). 
The trigger path always passes them, so it never issues this findFirst. + const overrides = + orgFeatureFlags !== undefined + ? orgFeatureFlags + : ( + await $replica.organization.findFirst({ + where: { id: orgId }, + select: { featureFlags: true }, + }) + )?.featureFlags; + + const kind = await flagFn({ + key: FEATURE_FLAG.runOpsMintKsuid, + defaultValue: "cuid", + overrides: (overrides as Record) ?? {}, + }); + mintCache.set(orgId, kind); + return kind; + }, + }); +} diff --git a/apps/webapp/app/v3/runOpsMigration/runOpsSplitReadGate.ts b/apps/webapp/app/v3/runOpsMigration/runOpsSplitReadGate.ts new file mode 100644 index 00000000000..59eef29fbce --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runOpsSplitReadGate.ts @@ -0,0 +1,14 @@ +// Pure run-ops split READ gate. The LEGACY handle is intentionally the control-plane client, +// so only the NEW client's distinctness gates (see runOpsSplitReadGate.test.ts). +export function computeRunOpsSplitReadEnabled(args: { + newReplica: unknown; + controlPlaneWriter: unknown; + controlPlaneReplica: unknown; + hasNewUrl: boolean; + hasLegacyUrl: boolean; +}): boolean { + const newIsDistinctDedicatedClient = + args.newReplica !== args.controlPlaneWriter && args.newReplica !== args.controlPlaneReplica; + + return newIsDistinctDedicatedClient && args.hasNewUrl && args.hasLegacyUrl; +} diff --git a/apps/webapp/app/v3/runOpsMigration/splitMode.server.ts b/apps/webapp/app/v3/runOpsMigration/splitMode.server.ts new file mode 100644 index 00000000000..d3048039951 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/splitMode.server.ts @@ -0,0 +1,61 @@ +/** + * isSplitEnabled() is the Wave-0 gate. The entire migration/routing/FK-drop family + * MUST be unreachable when this returns false. Default is false (single-DB). Never + * infer split-vs-single from URL string-equality — distinctness is proven by the + * runtime sentinel. 
+ */ +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { probeDistinctDatabases as defaultProbe } from "./distinctDbSentinel.server"; + +export type SplitModeConfig = { + flagEnabled: boolean; + legacyUrl?: string; + newUrl?: string; +}; + +export type SplitModeDeps = { + probe?: typeof defaultProbe; + logger?: { warn: (msg: string, meta?: Record) => void }; +}; + +export async function computeSplitEnabled( + config: SplitModeConfig, + deps: SplitModeDeps = {} +): Promise { + // Hard gate #1: explicit positive opt-in. OFF by default -> never probe. + if (!config.flagEnabled) { + return false; + } + // Both URLs are required to even consider a split. + if (!config.legacyUrl || !config.newUrl) { + deps.logger?.warn( + "RUN_OPS_SPLIT_ENABLED is on but TASK_RUN_LEGACY_DATABASE_URL / TASK_RUN_DATABASE_URL are not both set; staying single-DB." + ); + return false; + } + // Hard gate #2: runtime sentinel must confirm physically-distinct DBs. + const probe = deps.probe ?? defaultProbe; + const result = await probe(config.legacyUrl, config.newUrl, { logger: deps.logger }); + return result.distinct === true; +} + +let cached: Promise | undefined; + +export function isSplitEnabled(): Promise { + if (!cached) { + cached = computeSplitEnabled( + { + flagEnabled: env.RUN_OPS_SPLIT_ENABLED, + legacyUrl: env.TASK_RUN_LEGACY_DATABASE_URL, + newUrl: env.TASK_RUN_DATABASE_URL, + }, + { logger } + ); + } + return cached; +} + +export function __resetSplitModeCacheForTests(): void { + cached = undefined; +} diff --git a/apps/webapp/app/v3/runOpsMigration/types.ts b/apps/webapp/app/v3/runOpsMigration/types.ts new file mode 100644 index 00000000000..69a4f7b1e85 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/types.ts @@ -0,0 +1,25 @@ +// Pure types for the cross-seam residency guard. No runtime, no env, no Prisma. 
+import type { Residency } from "@trigger.dev/core/v3/isomorphic"; + +// Aliased (not re-declared) so it cannot drift from the classifier's own union. +export type RunOpsResidency = Residency; + +export type StoreTarget = "new" | "legacy"; + +export type UnblockRouteKind = "MANUAL" | "DATETIME" | "RESUME_TOKEN" | "IDEMPOTENCY_REUSE" | "RUN"; + +export interface CrossSeamGuardInput { + waitpointId: string; + routeKind: UnblockRouteKind; + treeOwnerResidency?: RunOpsResidency; + isCrossTreeIdempotency?: boolean; + hasLegacyParent?: boolean; +} + +export interface CrossSeamGuardDecision { + store: StoreTarget; + /** Always the waitpoint's OWN classification, even when pinned to legacy. */ + residency: RunOpsResidency; + routeKind: UnblockRouteKind; + pinnedReason?: "non-tree-owned" | "cross-tree-idempotency" | "legacy-parent-descendant"; +} diff --git a/apps/webapp/app/v3/runOpsMigration/unblockRouteCatalog.ts b/apps/webapp/app/v3/runOpsMigration/unblockRouteCatalog.ts new file mode 100644 index 00000000000..3296569ce0b --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/unblockRouteCatalog.ts @@ -0,0 +1,96 @@ +// If you add a `completeWaitpoint(` call site in the run-engine, add a matching +// entry here or `apps/webapp/test/crossSeamGuard.proof.test.ts` fails. Entries are +// one-per-textual-call-site (so the per-file count matches the source), anchored by +// method name, not line number. The `kind` is the dominant route kind — store +// selection is driven by residency, not kind, so a disputed kind label is cosmetic. +// +// PURE module — no engine import, no env, no Prisma. +import type { UnblockRouteKind } from "./types"; + +export interface UnblockRoute { + id: string; + kind: UnblockRouteKind; + /** The relative source path, e.g. "internal-packages/run-engine/src/engine/index.ts". */ + site: string; + /** Enclosing method/symbol name — NEVER a line number. 
*/ + symbol: string; +} + +const INDEX = "internal-packages/run-engine/src/engine/index.ts"; +const WAITPOINT_SYSTEM = "internal-packages/run-engine/src/engine/systems/waitpointSystem.ts"; +const TTL_SYSTEM = "internal-packages/run-engine/src/engine/systems/ttlSystem.ts"; +const RUN_ATTEMPT_SYSTEM = "internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts"; +const BATCH_SYSTEM = "internal-packages/run-engine/src/engine/systems/batchSystem.ts"; + +export const UNBLOCK_ROUTES: readonly UnblockRoute[] = [ + { + id: "index.public", + kind: "RESUME_TOKEN", + site: INDEX, + symbol: "completeWaitpoint (public declaration)", + }, + { + id: "index.public.delegate", + kind: "RESUME_TOKEN", + site: INDEX, + symbol: "completeWaitpoint (delegation to waitpointSystem)", + }, + { + id: "index.finishWaitpoint", + kind: "DATETIME", + site: INDEX, + symbol: "finishWaitpoint redis job", + }, + { + id: "wp.sink", + kind: "RUN", + site: WAITPOINT_SYSTEM, + symbol: "completeWaitpoint (sink declaration)", + }, + { + id: "wp.blockAndComplete", + kind: "RUN", + site: WAITPOINT_SYSTEM, + symbol: "blockRunAndCompleteWaitpoint", + }, + { + id: "wp.getOrCreate", + kind: "IDEMPOTENCY_REUSE", + site: WAITPOINT_SYSTEM, + symbol: "getOrCreateRunWaitpoint", + }, + { + id: "batch.tryCompleteBatch", + kind: "RUN", + site: BATCH_SYSTEM, + symbol: "#tryCompleteBatch", + }, + { + id: "ttl.expireRun", + kind: "RUN", + site: TTL_SYSTEM, + symbol: "expireRun", + }, + { + id: "runAttempt.succeeded", + kind: "RUN", + site: RUN_ATTEMPT_SYSTEM, + symbol: "attemptSucceeded", + }, + { + id: "runAttempt.cancel", + kind: "RUN", + site: RUN_ATTEMPT_SYSTEM, + symbol: "cancelRun", + }, + { + id: "runAttempt.permanentlyFail", + kind: "RUN", + site: RUN_ATTEMPT_SYSTEM, + symbol: "#permanentlyFailRun", + }, +]; + +export function expectedCompleteWaitpointCallSites(): { site: string; symbol: string }[] { + return UNBLOCK_ROUTES.map((r) => ({ site: r.site, symbol: r.symbol })); +} diff --git 
a/apps/webapp/app/v3/runStore.server.test.ts b/apps/webapp/app/v3/runStore.server.test.ts new file mode 100644 index 00000000000..065d8dbbc2e --- /dev/null +++ b/apps/webapp/app/v3/runStore.server.test.ts @@ -0,0 +1,267 @@ +import { heteroPostgresTest, heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore, RoutingRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { buildRunStore } from "./runStore.server"; + +vi.setConfig({ testTimeout: 60_000 }); + +// 25-char internal id -> cuid -> LEGACY; 27-char internal id -> ksuid -> NEW. +const CUID_25 = "c".repeat(25); +const KSUID_27 = "k".repeat(27); + +async function seedEnvironment(prisma: PrismaClient, slugSuffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${slugSuffix}`, slug: `org-${slugSuffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${slugSuffix}`, + slug: `project-${slugSuffix}`, + externalRef: `proj_${slugSuffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${slugSuffix}`, + pkApiKey: `pk_dev_${slugSuffix}`, + shortcode: `short_${slugSuffix}`, + }, + }); + return { organization, project, environment }; +} + +function createRunInput(params: { + runId: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; +}) { + return { + data: { + id: params.runId, + engine: "V2" as const, + status: "PENDING" as const, + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + environmentType: "DEVELOPMENT" as const, + organizationId: params.organizationId, + projectId: params.projectId, + taskIdentifier: "my-task", + payload: 
'{"hello":"world"}', + payloadType: "application/json", + context: { foo: "bar" }, + traceContext: { trace: "ctx" }, + traceId: "trace_1", + spanId: "span_1", + runTags: ["alpha"], + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + snapshot: { + engine: "V2" as const, + executionStatus: "RUN_CREATED" as const, + description: "Run was created", + runStatus: "PENDING" as const, + environmentId: params.runtimeEnvironmentId, + environmentType: "DEVELOPMENT" as const, + projectId: params.projectId, + organizationId: params.organizationId, + }, + }; +} + +describe("T24 — findRun resolves ksuid run on dedicated DB", () => { + heteroRunOpsPostgresTest( + "split ON: findRun({friendlyId, runtimeEnvironmentId}, {select}) finds a ksuid run on the new store", + async ({ prisma14, prisma17 }) => { + const ENV_ID = "env_t24_ksuid_probe"; + const WORKER_ID = "worker_t24_lock"; + await prisma17.taskRun.create({ + data: { + id: KSUID_27, + engine: "V2", + status: "EXECUTING", + friendlyId: "run_t24_ksuid", + runtimeEnvironmentId: ENV_ID, + environmentType: "DEVELOPMENT", + organizationId: "org_t24", + projectId: "proj_t24", + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceId: "trace_t24", + spanId: "span_t24", + queue: "task/my-task", + lockedToVersionId: WORKER_ID, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + }); + + const store = buildRunStore({ + splitEnabled: true, + newWriter: prisma17, + newReplica: prisma17, + legacyWriter: prisma14, + legacyReplica: prisma14, + singleWriter: prisma14, + singleReplica: prisma14, + }); + + const run = await store.findRun( + { friendlyId: "run_t24_ksuid", runtimeEnvironmentId: ENV_ID }, + { select: { lockedToVersionId: true } } + ); + + expect(run).not.toBeNull(); + expect(run?.lockedToVersionId).toBe(WORKER_ID); + expect(await prisma14.taskRun.findUnique({ where: { id: KSUID_27 } })).toBeNull(); + } + 
); +}); + +describe("buildRunStore", () => { + heteroPostgresTest( + "split OFF returns a passthrough PostgresRunStore that writes only to the single DB", + async ({ prisma14, prisma17 }) => { + // Single-DB: every handle is prisma14. prisma17 must stay untouched. + const store = buildRunStore({ + splitEnabled: false, + newWriter: prisma14, + newReplica: prisma14, + legacyWriter: prisma14, + legacyReplica: prisma14, + singleWriter: prisma14, + singleReplica: prisma14, + }); + + expect(store).toBeInstanceOf(PostgresRunStore); + + const seed = await seedEnvironment(prisma14, "off"); + // A ksuid id (would route to NEW under split) must still land on the single DB. + const runId = KSUID_27; + await store.createRun( + createRunInput({ + runId, + friendlyId: "run_off", + organizationId: seed.organization.id, + projectId: seed.project.id, + runtimeEnvironmentId: seed.environment.id, + }) + ); + + expect(await prisma14.taskRun.findUnique({ where: { id: runId } })).not.toBeNull(); + expect(await prisma17.taskRun.findUnique({ where: { id: runId } })).toBeNull(); + } + ); + + heteroPostgresTest( + "split ON routes a NEW-classified create to the new store and a LEGACY-classified create to the legacy store", + async ({ prisma14, prisma17 }) => { + // legacy = PG14, new = PG17. 
+ const store = buildRunStore({ + splitEnabled: true, + newWriter: prisma17, + newReplica: prisma17, + legacyWriter: prisma14, + legacyReplica: prisma14, + singleWriter: prisma14, + singleReplica: prisma14, + }); + + expect(store).toBeInstanceOf(RoutingRunStore); + + const seedNew = await seedEnvironment(prisma17, "on_new"); + const seedLegacy = await seedEnvironment(prisma14, "on_legacy"); + + // ksuid -> NEW (PG17) + await store.createRun( + createRunInput({ + runId: KSUID_27, + friendlyId: "run_new", + organizationId: seedNew.organization.id, + projectId: seedNew.project.id, + runtimeEnvironmentId: seedNew.environment.id, + }) + ); + expect(await prisma17.taskRun.findUnique({ where: { id: KSUID_27 } })).not.toBeNull(); + expect(await prisma14.taskRun.findUnique({ where: { id: KSUID_27 } })).toBeNull(); + + // cuid -> LEGACY (PG14) + await store.createRun( + createRunInput({ + runId: CUID_25, + friendlyId: "run_legacy", + organizationId: seedLegacy.organization.id, + projectId: seedLegacy.project.id, + runtimeEnvironmentId: seedLegacy.environment.id, + }) + ); + expect(await prisma14.taskRun.findUnique({ where: { id: CUID_25 } })).not.toBeNull(); + expect(await prisma17.taskRun.findUnique({ where: { id: CUID_25 } })).toBeNull(); + } + ); + + heteroPostgresTest( + "split ON keeps a write on a LEGACY-classified id on the legacy store", + async ({ prisma14, prisma17 }) => { + // Routing is pure id-shape, so a cuid write stays LEGACY. + const store = buildRunStore({ + splitEnabled: true, + newWriter: prisma17, + newReplica: prisma17, + legacyWriter: prisma14, + legacyReplica: prisma14, + singleWriter: prisma14, + singleReplica: prisma14, + }); + + const seedLegacy = await seedEnvironment(prisma14, "no_marker_legacy"); + // The run lives on LEGACY (PG14); seed it directly. 
+ await prisma14.taskRun.create({ + data: { + id: CUID_25, + engine: "V2", + status: "EXECUTING", + friendlyId: "run_no_marker", + runtimeEnvironmentId: seedLegacy.environment.id, + environmentType: "DEVELOPMENT", + organizationId: seedLegacy.organization.id, + projectId: seedLegacy.project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceId: "t", + spanId: "s", + queue: "task/my-task", + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + }); + + const updated = await store.updateMetadata( + CUID_25, + { + metadata: '{"k":"v"}', + metadataType: "application/json", + metadataVersion: { increment: 1 }, + updatedAt: new Date("2024-01-02T00:00:00.000Z"), + }, + {} + ); + expect(updated.count).toBe(1); + + const onLegacy = await prisma14.taskRun.findUnique({ where: { id: CUID_25 } }); + expect(onLegacy?.metadata).toBe('{"k":"v"}'); + expect(await prisma17.taskRun.findUnique({ where: { id: CUID_25 } })).toBeNull(); + } + ); +}); diff --git a/apps/webapp/app/v3/runStore.server.ts b/apps/webapp/app/v3/runStore.server.ts index 2993597ea17..3cab13f9419 100644 --- a/apps/webapp/app/v3/runStore.server.ts +++ b/apps/webapp/app/v3/runStore.server.ts @@ -1,8 +1,122 @@ -import { PostgresRunStore } from "@internal/run-store"; -import { $replica, prisma } from "~/db.server"; +import { PostgresRunStore, RoutingRunStore, type RunStore } from "@internal/run-store"; +import { ownerEngine, type Residency } from "@trigger.dev/core/v3/isomorphic"; +import type { PrismaClient, PrismaReplicaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { + $replica, + prisma, + runOpsLegacyPrisma, + runOpsLegacyReplica, + runOpsNewPrismaClient, + runOpsNewReplicaClient, +} from "~/db.server"; +import { env } from "~/env.server"; import { singleton } from "~/utils/singleton"; -export const runStore = singleton( - "PostgresRunStore", - () => new PostgresRunStore({ prisma, readOnlyPrisma: 
$replica }) -); +type BuildRunStoreDeps = { + /** Boot constant: true when both run-ops DBs are configured and their handles resolve; independent of RUN_OPS_SPLIT_ENABLED. */ + splitEnabled: boolean; + /** Split-only handles. Required when splitEnabled is true; omitted entirely when OFF + * so single-DB callers never touch the run-ops clients (keeps mocks/passthrough clean). */ + newWriter?: RunOpsPrismaClient; + newReplica?: RunOpsPrismaClient; + legacyWriter?: PrismaClient; + legacyReplica?: PrismaReplicaClient; + /** Single-DB store handles (control-plane pair). Used verbatim when split is OFF. */ + singleWriter: PrismaClient; + singleReplica: PrismaReplicaClient; + /** Residency classifier; defaults to ownerEngine inside RoutingRunStore. */ + classify?: (id: string) => Residency; +}; + +/** + * Pure run-store builder (no env / no boot side effects — webapp testability rule). + * + * Split OFF (default / self-host): returns the exact passthrough PostgresRunStore we + * have always returned, built from the single control-plane handles. No second store + * is constructed and no marker predicate is consulted, so behavior is byte-identical + * to single-DB today. + * + * Split ON: returns a RoutingRunStore that selects between a NEW store (where new runs + * are born) and a LEGACY store (draining) by run-id residency (id shape). There is no cuid + * migration, so a LEGACY-classified id is always LEGACY-resident. + */ +export function buildRunStore(deps: BuildRunStoreDeps): RunStore { + if (!deps.splitEnabled) { + return new PostgresRunStore({ + prisma: deps.singleWriter, + readOnlyPrisma: deps.singleReplica, + }); + } + + if (!deps.newWriter || !deps.newReplica || !deps.legacyWriter || !deps.legacyReplica) { + throw new Error("buildRunStore: split is enabled but run-ops store handles are missing"); + } + // The NEW store is backed by the dedicated RunOpsPrismaClient (subset schema): relation-shaped + // ops branch onto FK-free scalars + explicit join models.
The LEGACY store keeps the default + // "legacy" variant (full @trigger.dev/database schema with implicit M2M + @relations). + const newStore = new PostgresRunStore({ + prisma: deps.newWriter, + readOnlyPrisma: deps.newReplica, + schemaVariant: "dedicated", + }); + const legacyStore = new PostgresRunStore({ + prisma: deps.legacyWriter, + readOnlyPrisma: deps.legacyReplica, + }); + + return new RoutingRunStore({ + new: newStore, + legacy: legacyStore, + classify: deps.classify ?? ownerEngine, + }); +} + +// Build the routing store whenever BOTH run-ops DBs are configured, independent of +// RUN_OPS_SPLIT_ENABLED. Reads must fan out across both DBs so a run that lives on the new +// DB stays visible even with the flag off (matches the db.server topology factory). The flag +// governs write/mint residency + migration via isSplitEnabled(), not read visibility. +const ROUTING_ENABLED = + !!env.TASK_RUN_DATABASE_URL && !!env.TASK_RUN_LEGACY_DATABASE_URL; + +// Resolve the run-ops handles, tolerating contexts where they are absent — tests that mock +// ~/db.server minimally omit them, and accessing a missing export under vi.mock throws. A +// miss means "no run-ops handles here" and we fall back to single-store. +function tryResolveRunOpsHandles() { + try { + if ( + !runOpsNewPrismaClient || + !runOpsNewReplicaClient || + !runOpsLegacyPrisma || + !runOpsLegacyReplica + ) { + return null; + } + return { + newWriter: runOpsNewPrismaClient, + newReplica: runOpsNewReplicaClient, + legacyWriter: runOpsLegacyPrisma, + legacyReplica: runOpsLegacyReplica, + }; + } catch { + return null; + } +} + +export const runStore: RunStore = singleton("RunStore", () => { + const handles = ROUTING_ENABLED ? tryResolveRunOpsHandles() : null; + // Single-store passthrough: self-host (one DB), or a context without run-ops handles. 
+ if (!handles) { + return buildRunStore({ + splitEnabled: false, + singleWriter: prisma, + singleReplica: $replica, + }); + } + return buildRunStore({ + splitEnabled: true, + ...handles, + singleWriter: prisma, + singleReplica: $replica, + }); +}); diff --git a/apps/webapp/app/v3/taskRunHeartbeatFailed.server.ts b/apps/webapp/app/v3/taskRunHeartbeatFailed.server.ts index f18d2e5aa67..c7b2c4aebe4 100644 --- a/apps/webapp/app/v3/taskRunHeartbeatFailed.server.ts +++ b/apps/webapp/app/v3/taskRunHeartbeatFailed.server.ts @@ -8,6 +8,7 @@ import type { PrismaClientOrTransaction } from "~/db.server"; import { workerQueue } from "~/services/worker.server"; import { socketIo } from "./handleSocketIo.server"; import { TaskRunErrorCodes } from "@trigger.dev/core/v3"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { isV3Disabled } from "./engineDeprecation.server"; export class TaskRunHeartbeatFailedService extends BaseService { @@ -23,16 +24,8 @@ export class TaskRunHeartbeatFailedService extends BaseService { friendlyId: true, status: true, lockedAt: true, - runtimeEnvironment: { - select: { - type: true, - }, - }, - lockedToVersion: { - select: { - supportsLazyAttempts: true, - }, - }, + runtimeEnvironmentId: true, + lockedToVersionId: true, _count: { select: { attempts: true, @@ -60,6 +53,16 @@ export class TaskRunHeartbeatFailedService extends BaseService { return; } + const env = await controlPlaneResolver.resolveEnv(taskRun.runtimeEnvironmentId); + const lockedWorker = await controlPlaneResolver.resolveRunLockedWorker({ + lockedToVersionId: taskRun.lockedToVersionId, + }); + + if (!env) { + logger.debug("TaskRunHeartbeatFailedService: environment not found", { runId }); + return; + } + const service = new FailedTaskRunService(); switch (taskRun.status) { @@ -143,7 +146,7 @@ export class TaskRunHeartbeatFailedService extends BaseService { ); try { - if (taskRun.runtimeEnvironment.type === "DEVELOPMENT") { + if (env.type 
=== "DEVELOPMENT") { return; } @@ -152,7 +155,7 @@ export class TaskRunHeartbeatFailedService extends BaseService { version: "v1", runId: taskRun.id, // Give the run a few seconds to exit to complete any flushing etc - delayInMs: taskRun.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined, + delayInMs: lockedWorker?.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined, }); } catch (error) { logger.error("[TaskRunHeartbeatFailedService] Error signaling run cancellation", { diff --git a/apps/webapp/package.json b/apps/webapp/package.json index 638fbda5896..643093624b4 100644 --- a/apps/webapp/package.json +++ b/apps/webapp/package.json @@ -59,6 +59,7 @@ "@internal/llm-model-catalog": "workspace:*", "@internal/redis": "workspace:*", "@internal/run-engine": "workspace:*", + "@internal/run-ops-database": "workspace:*", "@internal/run-store": "workspace:*", "@internal/schedule-engine": "workspace:*", "@internal/tracing": "workspace:*", @@ -249,6 +250,7 @@ "@swc/helpers": "^0.4.11", "@tailwindcss/forms": "^0.5.3", "@tailwindcss/typography": "^0.5.9", + "@testcontainers/postgresql": "^11.14.0", "@total-typescript/ts-reset": "^0.4.2", "@types/bcryptjs": "^2.4.2", "@types/compression": "^1.7.2", diff --git a/apps/webapp/test/findEnvironmentFromRun.readthrough.test.ts b/apps/webapp/test/findEnvironmentFromRun.readthrough.test.ts new file mode 100644 index 00000000000..7231205ea8d --- /dev/null +++ b/apps/webapp/test/findEnvironmentFromRun.readthrough.test.ts @@ -0,0 +1,135 @@ +// Real PG14 (control-plane) + PG17 (run-ops) proof for findEnvironmentFromRun. +// The env (slug/project/org) lives on PG14; the run-ops scalar row on PG17 with cross-seam +// FKs dropped. A PostgresRunStore over PG17 reads run scalars; the ControlPlaneResolver over +// PG14 resolves the env. The DB is never mocked. The .count() proof shows neither DB joins +// the other. 
+import { heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +vi.setConfig({ testTimeout: 120_000, hookTimeout: 120_000 }); + +const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", +] as const; + +async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const constraint of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe( + `ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${constraint}"` + ); + } +} + +let seedCounter = 0; + +async function seedControlPlane(prisma: PrismaClient) { + const n = seedCounter++; + const organization = await prisma.organization.create({ + data: { title: `Org ${n}`, slug: `org-${n}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${n}`, + slug: `project-${n}`, + externalRef: `proj_${n}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${n}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_prod_${n}`, + pkApiKey: `pk_prod_${n}`, + shortcode: `short_${n}`, + }, + }); + return { organization, project, environment }; +} + +async function seedRun( + prisma: PrismaClient, + ids: { runtimeEnvironmentId: string; projectId: string; organizationId: string }, + opts?: { runTags?: string[] } +) { + const n = seedCounter++; + return prisma.taskRun.create({ + data: { + id: `run_${n}_pg17`, + engine: "V2", + status: "PENDING", + friendlyId: `run_friendly_${n}`, + runtimeEnvironmentId: ids.runtimeEnvironmentId, + organizationId: ids.organizationId, 
+ projectId: ids.projectId, + taskIdentifier: "fefr-task", + payload: "{}", + payloadType: "application/json", + queue: "task/fefr-task", + traceId: `trace_${n}`, + spanId: `span_${n}`, + workerQueue: "main", + runTags: opts?.runTags ?? ["a", "b"], + }, + }); +} + +function buildResolver(controlPlane: PrismaClient) { + return new ControlPlaneResolver({ + controlPlanePrimary: controlPlane, + controlPlaneReplica: controlPlane, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => false, + }); +} + +describe("findEnvironmentFromRun cross-DB read-through", () => { + heteroPostgresTest( + "resolves env from PG14 while run scalars resolve from PG17 (no cross-DB join)", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedControlPlane(prisma14 as unknown as PrismaClient); + const run = await seedRun( + prisma17 as unknown as PrismaClient, + { + runtimeEnvironmentId: cp.environment.id, + projectId: cp.project.id, + organizationId: cp.organization.id, + }, + { runTags: ["x", "y"] } + ); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = buildResolver(prisma14 as unknown as PrismaClient); + + // The decomposed findEnvironmentFromRun: run scalars from the store + env from the resolver. 
+ const taskRun = await runStore.findRun( + { id: run.id }, + { select: { runtimeEnvironmentId: true, runTags: true, batchId: true } }, + prisma17 as unknown as PrismaClient + ); + expect(taskRun).not.toBeNull(); + const environment = await resolver.resolveAuthenticatedEnv(taskRun!.runtimeEnvironmentId); + expect(environment).not.toBeNull(); + expect(environment!.id).toBe(cp.environment.id); + expect(environment!.slug).toBe(cp.environment.slug); + expect(environment!.project.id).toBe(cp.project.id); + expect(taskRun!.runTags).toEqual(["x", "y"]); + + // Inversion proof: PG17 (run-ops) has no env rows; PG14 (control-plane) has no run rows. + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); diff --git a/apps/webapp/test/routeLoaders.controlPlane.readthrough.test.ts b/apps/webapp/test/routeLoaders.controlPlane.readthrough.test.ts new file mode 100644 index 00000000000..a6437a3e016 --- /dev/null +++ b/apps/webapp/test/routeLoaders.controlPlane.readthrough.test.ts @@ -0,0 +1,164 @@ +// Real PG14 (control-plane) + PG17 (run-ops) proof for the run route loaders that were +// decomposed onto the ControlPlaneResolver. The env (slug/project/org) and the +// locked worker/deployment live on PG14; the run-ops scalar row on PG17 with cross-seam FKs +// dropped (including the lockedById / lockedToVersionId FKs). A PostgresRunStore over PG17 +// reads run scalars; the ControlPlaneResolver over PG14 resolves env + lockedBy.worker.deployment. +// The DB is never mocked. The .count() proof shows neither DB joins the other. 
+import { heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +vi.setConfig({ testTimeout: 60_000, hookTimeout: 60_000 }); + +const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", + "TaskRun_lockedById_fkey", + "TaskRun_lockedToVersionId_fkey", +] as const; + +async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const c of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe(`ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${c}"`); + } +} + +let n = 0; +async function seedAll(prisma: PrismaClient) { + const s = n++; + const organization = await prisma.organization.create({ + data: { title: `Org ${s}`, slug: `org-${s}` }, + }); + const project = await prisma.project.create({ + data: { + name: `P ${s}`, + slug: `p-${s}`, + externalRef: `proj_${s}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${s}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_${s}`, + pkApiKey: `pk_${s}`, + shortcode: `sc_${s}`, + }, + }); + const worker = await prisma.backgroundWorker.create({ + data: { + friendlyId: `worker_${s}`, + contentHash: `hash_${s}`, + projectId: project.id, + runtimeEnvironmentId: environment.id, + version: `2024.1.${s}`, + metadata: {}, + engine: "V2", + }, + }); + const deployment = await prisma.workerDeployment.create({ + data: { + friendlyId: `dep_${s}`, + contentHash: `hash_${s}`, + version: worker.version, + shortCode: `dc_${s}`, + type: "MANAGED", + status: "DEPLOYED", + projectId: 
project.id, + environmentId: environment.id, + workerId: worker.id, + git: { commitSha: `sha_${s}` }, + }, + }); + const task = await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: `task_${s}`, + slug: `t-${s}`, + filePath: "src/index.ts", + exportName: "myTask", + workerId: worker.id, + runtimeEnvironmentId: environment.id, + projectId: project.id, + }, + }); + return { organization, project, environment, worker, deployment, task }; +} + +describe("run route loader cross-DB read-through", () => { + heteroPostgresTest( + "resources.runs.$runParam env + lockedBy.worker.deployment.git resolve from PG14", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedAll(prisma14 as unknown as PrismaClient); + + const run = await (prisma17 as unknown as PrismaClient).taskRun.create({ + data: { + id: `run_${n++}_pg17`, + engine: "V2", + status: "COMPLETED_SUCCESSFULLY", + friendlyId: `run_rl_${n}`, + runtimeEnvironmentId: cp.environment.id, + projectId: cp.project.id, + organizationId: cp.organization.id, + lockedById: cp.task.id, + lockedToVersionId: cp.worker.id, + taskIdentifier: "rl-task", + payload: "{}", + payloadType: "application/json", + queue: "task/rl-task", + traceId: "tr_rl", + spanId: "sp_rl", + workerQueue: "main", + }, + }); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: prisma14 as unknown as PrismaClient, + controlPlaneReplica: prisma14 as unknown as PrismaClient, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + const found = await runStore.findRun( + { friendlyId: run.friendlyId }, + { + select: { + id: true, + runtimeEnvironmentId: true, + lockedById: true, + lockedToVersionId: true, + }, + }, + prisma17 as unknown as PrismaClient + ); + const env = await 
resolver.resolveAuthenticatedEnv(found!.runtimeEnvironmentId); + expect(env!.slug).toBe(cp.environment.slug); + expect(env!.organization.title).toBe(cp.organization.title); + expect(env!.project.externalRef).toBe(cp.project.externalRef); + + const locked = await resolver.resolveRunLockedWorker({ + lockedById: found!.lockedById, + lockedToVersionId: found!.lockedToVersionId, + }); + expect(locked!.lockedToVersion!.version).toBe(cp.worker.version); + expect(locked!.lockedBy!.worker.deployment!.git).toEqual({ + commitSha: cp.deployment.git ? (cp.deployment.git as any).commitSha : undefined, + }); + expect(locked!.lockedBy!.worker.deployment!.friendlyId).toBe(cp.deployment.friendlyId); + + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); diff --git a/apps/webapp/test/runEngineHandlers.test.ts b/apps/webapp/test/runEngineHandlers.test.ts new file mode 100644 index 00000000000..5fff3cba5b1 --- /dev/null +++ b/apps/webapp/test/runEngineHandlers.test.ts @@ -0,0 +1,674 @@ +import { containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { CompleteBatchResult } from "@internal/run-engine"; +import { describe, expect, vi } from "vitest"; +import { + handleBatchCompletion, + readRunForEvent, + readRunForEventOrThrow, + resolveBatchRunOpsWriter, + type BatchCompletionDeps, + type EventReadDeps, +} from "~/v3/runEngineHandlersShared.server"; + +vi.setConfig({ testTimeout: 60_000 }); + +// Proves two routing properties against REAL Postgres (never mocked): +// 1. the 7 TaskRun event reads resolve run-ops new-or-old via read-through; +// 2. the batch update + error-createMany transaction commits entirely on the +// run-ops writer that owns the BatchTaskRun row (no boundary-spanning txn). 
+ +const EVENT_SELECT = { + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + runTags: true, + batchId: true, +} as const; + +async function seedEnvironment(prisma: PrismaClient, slugSuffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${slugSuffix}`, slug: `org-${slugSuffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${slugSuffix}`, + slug: `project-${slugSuffix}`, + externalRef: `proj_${slugSuffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${slugSuffix}`, + pkApiKey: `pk_dev_${slugSuffix}`, + shortcode: `short_${slugSuffix}`, + }, + }); + return { organization, project, environment }; +} + +async function seedTaskRun( + prisma: PrismaClient, + params: { + id: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; + runTags?: string[]; + } +) { + return prisma.taskRun.create({ + data: { + id: params.id, + engine: "V2", + status: "COMPLETED_SUCCESSFULLY", + friendlyId: params.friendlyId, + taskIdentifier: "my-task", + payload: '{"hello":"world"}', + payloadType: "application/json", + traceId: "trace_1", + spanId: "span_1", + queue: "task/my-task", + runtimeEnvironmentId: params.runtimeEnvironmentId, + projectId: params.projectId, + organizationId: params.organizationId, + environmentType: "DEVELOPMENT", + isTest: false, + taskEventStore: "taskEvent", + runTags: params.runTags ?? 
["alpha", "beta"], + createdAt: new Date("2024-01-01T00:00:00.000Z"), + completedAt: new Date("2024-01-01T00:01:00.000Z"), + }, + }); +} + +async function seedBatch( + prisma: PrismaClient, + params: { id: string; friendlyId: string; runtimeEnvironmentId: string } +) { + return prisma.batchTaskRun.create({ + data: { + id: params.id, + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + status: "PENDING", + }, + }); +} + +function makeBatchDeps( + overrides: { + splitEnabled?: boolean; + newReplica?: PrismaClient; + newWriter?: PrismaClient; + legacyWriter?: PrismaClient; + legacyReplica?: PrismaClient; + } & { single?: PrismaClient } +): BatchCompletionDeps & { tryCompleteBatchCalls: string[] } { + const single = overrides.single; + const tryCompleteBatchCalls: string[] = []; + return { + splitEnabled: overrides.splitEnabled ?? false, + newReplica: (overrides.newReplica ?? single)!, + newWriter: (overrides.newWriter ?? single)!, + legacyWriter: (overrides.legacyWriter ?? single)!, + tryCompleteBatch: async (batchId: string) => { + tryCompleteBatchCalls.push(batchId); + }, + tryCompleteBatchCalls, + }; +} + +function failure(index: number, errorCode: string, extra?: Record<string, unknown>) { + return { + index, + taskIdentifier: "my-task", + payload: '{"item":' + index + "}", + options: { foo: "bar" }, + error: `error ${index}`, + errorCode, + timestamp: Date.now(), + ...extra, + }; +} + +describe("runEngineHandlers read-through", () => { + // Test A: a NEW run resolves via read-through against the new store.
+ containerTest("event read resolves a NEW run via read-through", async ({ prisma }) => { + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const { organization, project, environment } = await seedEnvironment(prisma, "a"); + await seedTaskRun(prisma, { + id: "run_new_a", + friendlyId: "run_friendly_a", + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + runTags: ["x", "y"], + }); + + const deps: EventReadDeps = { + store, + newReplica: prisma, + legacyReplica: prisma, + splitEnabled: false, + }; + + const run = await readRunForEvent("run_new_a", environment.id, EVENT_SELECT, deps); + + expect(run).not.toBeNull(); + expect(run!.id).toBe("run_new_a"); + expect(run!.friendlyId).toBe("run_friendly_a"); + expect(run!.runTags).toEqual(["x", "y"]); + expect(run!.organizationId).toBe(organization.id); + expect(run!.taskEventStore).toBe("taskEvent"); + }); + + // Test C: single-DB short-circuit — readLegacy must never be invoked. + containerTest("single-DB short-circuit never touches a legacy handle", async ({ prisma }) => { + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const { organization, project, environment } = await seedEnvironment(prisma, "c"); + await seedTaskRun(prisma, { + id: "run_single_c", + friendlyId: "run_friendly_c", + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + + // A legacy replica that THROWS if read — proves the short-circuit. 
+ const exploding = new Proxy( + {}, + { + get() { + throw new Error("legacy replica must not be touched in single-DB mode"); + }, + } + ) as unknown as PrismaClient; + + const deps: EventReadDeps = { + store, + newReplica: prisma, + legacyReplica: exploding, + splitEnabled: false, + }; + + const run = await readRunForEvent("run_single_c", environment.id, EVENT_SELECT, deps); + expect(run!.id).toBe("run_single_c"); + }); + + // readRunForEventOrThrow reproduces the not-found-as-error semantics. + containerTest("readRunForEventOrThrow throws on a missing run", async ({ prisma }) => { + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + await seedEnvironment(prisma, "nf"); + + const deps: EventReadDeps = { + store, + newReplica: prisma, + legacyReplica: prisma, + splitEnabled: false, + }; + + await expect( + readRunForEventOrThrow("run_missing", "env_x", EVENT_SELECT, deps) + ).rejects.toThrow(); + + // Nullable helper returns null instead of throwing for the same input. + const run = await readRunForEvent("run_missing", "env_x", EVENT_SELECT, deps); + expect(run).toBeNull(); + }); +}); + +describe("runEngineHandlers read-through cross-version", () => { + // Test B (heterogeneous recast): an OLD in-retention run is served off the LEGACY + // REPLICA only, and the legacy primary/writer is structurally absent. + heteroPostgresTest( + "event read resolves an OLD in-retention run via the legacy replica", + async ({ prisma14, prisma17 }) => { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + + const legacySeed = await seedEnvironment(prisma14, "b14"); + // A 25-char cuid id classifies as LEGACY so read-through probes new, misses, + // then falls back to the legacy replica. 
+ const legacyRunId = "c".repeat(25); + const seededRow = await seedTaskRun(prisma14, { + id: legacyRunId, + friendlyId: "run_friendly_b", + organizationId: legacySeed.organization.id, + projectId: legacySeed.project.id, + runtimeEnvironmentId: legacySeed.environment.id, + runTags: ["legacy", "tag"], + }); + + // The read uses the NEW store for the new-DB probe and the LEGACY store for + // the replica fallback, so a hit can only come from the legacy replica. + let legacyReplicaUsed = false; + // A store facade that routes the read to the legacy store when handed the + // legacy client and the new store otherwise — both real DBs, no mocks. + const routedStore = { + ...newStore, + findRun: ((where: any, args: any, client: any) => { + if (client === prisma14) { + legacyReplicaUsed = true; + return legacyStore.findRun(where, args, client); + } + return newStore.findRun(where, args, client); + }) as typeof newStore.findRun, + } as PostgresRunStore; + const routedDeps: EventReadDeps = { + store: routedStore, + newReplica: prisma17, + legacyReplica: prisma14, + splitEnabled: true, + // Pure boundary: this legacy run was never migrated, so don't short-circuit. + isKnownMigrated: async () => false, + }; + + const run = await readRunForEvent( + legacyRunId, + legacySeed.environment.id, + EVENT_SELECT, + routedDeps + ); + + expect(legacyReplicaUsed).toBe(true); + expect(run).not.toBeNull(); + expect(run!.id).toBe(legacyRunId); + // Byte-identity of the enrichment select across the legacy<->new boundary: + // re-read the same row on the legacy replica directly and deep-equal it. + const direct = await legacyStore.findRun( + { id: legacyRunId }, + { select: EVENT_SELECT }, + prisma14 + ); + expect(run).toEqual(direct); + expect(run!.runTags).toEqual(["legacy", "tag"]); + expect(seededRow.id).toBe(legacyRunId); + + // The new DB has no such run. 
+ const onNew = await newStore.findRun({ id: legacyRunId }, { select: EVENT_SELECT }, prisma17); + expect(onNew).toBeNull(); + } + ); +}); + +describe("runEngineHandlers batch completion", () => { + // Tests D + F: the txn commits whole on a single run-ops writer; rolls back atomically. + containerTest("batch txn commits whole on the run-ops writer", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "d"); + const batchId = "c".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_d", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + const result: CompleteBatchResult = { + batchId, + runIds: ["run_friendly_1", "run_friendly_2"], + successfulRunCount: 2, + failedRunCount: 1, + failures: [failure(0, "TRIGGER_ERROR", { options: { nested: { a: 1, b: [2, 3] } } })], + }; + + await handleBatchCompletion(result, deps); + + const batch = await prisma.batchTaskRun.findFirstOrThrow({ where: { id: batchId } }); + expect(batch.status).toBe("PARTIAL_FAILED"); + expect(batch.runIds).toEqual(["run_friendly_1", "run_friendly_2"]); + expect(batch.successfulRunCount).toBe(2); + expect(batch.failedRunCount).toBe(1); + expect(batch.processingCompletedAt).not.toBeNull(); + + const errors = await prisma.batchTaskRunError.findMany({ where: { batchTaskRunId: batchId } }); + expect(errors).toHaveLength(1); + expect(errors[0]!.errorCode).toBe("TRIGGER_ERROR"); + // JSON round-trip of options. + expect(errors[0]!.options).toEqual({ nested: { a: 1, b: [2, 3] } }); + + // PARTIAL_FAILED (not ABORTED) -> tryCompleteBatch is invoked. + expect(deps.tryCompleteBatchCalls).toEqual([batchId]); + }); + + // Atomicity: if the createMany fails, the update rolls back too. 
+ containerTest("batch txn rolls back the update when createMany fails", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "rb"); + const batchId = "d".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_rb", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + // A failure with a null taskIdentifier violates the NOT NULL constraint inside + // the createMany, forcing the whole transaction to roll back. + const result = { + batchId, + runIds: ["run_friendly_1"], + successfulRunCount: 0, + failedRunCount: 1, + failures: [ + { index: 0, taskIdentifier: null as any, payload: "{}", error: "boom", timestamp: 1 }, + ], + } as unknown as CompleteBatchResult; + + await expect(handleBatchCompletion(result, deps)).rejects.toThrow(); + + // The update must NOT have committed — status stays PENDING from the seed. + const batch = await prisma.batchTaskRun.findFirstOrThrow({ where: { id: batchId } }); + expect(batch.status).toBe("PENDING"); + expect(batch.processingCompletedAt).toBeNull(); + }); + + // Test E: callback retry is idempotent via skipDuplicates. 
+ containerTest("batch txn is idempotent on callback retry", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "e"); + const batchId = "e".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_e", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + const result: CompleteBatchResult = { + batchId, + runIds: [], + successfulRunCount: 0, + failedRunCount: 2, + failures: [failure(0, "TRIGGER_ERROR"), failure(1, "TRIGGER_ERROR")], + }; + + await handleBatchCompletion(result, deps); + await handleBatchCompletion(result, deps); + + const errors = await prisma.batchTaskRunError.findMany({ where: { batchTaskRunId: batchId } }); + expect(errors).toHaveLength(2); + }); + + // Test I: aggregate fast-path collapses same-errorCode failures to one row. + containerTest("aggregate fast-path collapses queue-size-limit failures", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "i"); + const batchId = "f".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_i", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + const result: CompleteBatchResult = { + batchId, + runIds: [], + successfulRunCount: 0, + failedRunCount: 3, + failures: [ + failure(5, "QUEUE_SIZE_LIMIT_EXCEEDED"), + failure(6, "QUEUE_SIZE_LIMIT_EXCEEDED"), + failure(7, "QUEUE_SIZE_LIMIT_EXCEEDED"), + ], + }; + + await handleBatchCompletion(result, deps); + + const errors = await prisma.batchTaskRunError.findMany({ where: { batchTaskRunId: batchId } }); + expect(errors).toHaveLength(1); + expect(errors[0]!.index).toBe(5); + expect(errors[0]!.error).toContain("(3 items in this batch failed with the same error)"); + }); + + // ABORTED status does not call tryCompleteBatch. 
+  containerTest("ABORTED batch does not call tryCompleteBatch", async ({ prisma }) => {
+    const seeded = await seedEnvironment(prisma, "ab");
+    const batchId = "g".repeat(25);
+    await seedBatch(prisma, {
+      id: batchId,
+      friendlyId: "batch_friendly_ab",
+      runtimeEnvironmentId: seeded.environment.id,
+    });
+
+    const completionDeps = makeBatchDeps({ single: prisma, splitEnabled: false });
+    // Zero successes and one failure: the whole batch failed.
+    const completion: CompleteBatchResult = {
+      batchId,
+      runIds: [],
+      successfulRunCount: 0,
+      failedRunCount: 1,
+      failures: [failure(0, "TRIGGER_ERROR")],
+    };
+
+    await handleBatchCompletion(completion, completionDeps);
+
+    const persisted = await prisma.batchTaskRun.findFirstOrThrow({ where: { id: batchId } });
+    expect(persisted.status).toBe("ABORTED");
+    expect(persisted.completedAt).not.toBeNull();
+    expect(completionDeps.tryCompleteBatchCalls).toEqual([]);
+  });
+
+  // A successful (no-failure) batch is PENDING and calls tryCompleteBatch.
+  containerTest("successful batch is PENDING and calls tryCompleteBatch", async ({ prisma }) => {
+    const seeded = await seedEnvironment(prisma, "ok");
+    const batchId = "h".repeat(25);
+    await seedBatch(prisma, {
+      id: batchId,
+      friendlyId: "batch_friendly_ok",
+      runtimeEnvironmentId: seeded.environment.id,
+    });
+
+    const completionDeps = makeBatchDeps({ single: prisma, splitEnabled: false });
+    const completion: CompleteBatchResult = {
+      batchId,
+      runIds: ["run_friendly_1"],
+      successfulRunCount: 1,
+      failedRunCount: 0,
+      failures: [],
+    };
+
+    await handleBatchCompletion(completion, completionDeps);
+
+    const persisted = await prisma.batchTaskRun.findFirstOrThrow({ where: { id: batchId } });
+    expect(persisted.status).toBe("PENDING");
+    expect(completionDeps.tryCompleteBatchCalls).toEqual([batchId]);
+  });
+});
+
+describe("runEngineHandlers batch residency routing", () => {
+  // True single-DB invariant: the topology's cpFallback makes newReplica and
+  // legacyWriter the SAME control-plane client, so the probe always resolves to
+  // that one client regardless of where length-classification would guess.
+  containerTest("true single-DB resolves to the single client", async ({ prisma }) => {
+    const seeded = await seedEnvironment(prisma, "single");
+    const batchId = "s".repeat(25);
+    await seedBatch(prisma, {
+      id: batchId,
+      friendlyId: "batch_friendly_single",
+      runtimeEnvironmentId: seeded.environment.id,
+    });
+
+    // Every handle is the same client, so that client is the only possible answer.
+    const resolved = await resolveBatchRunOpsWriter(batchId, {
+      newReplica: prisma,
+      newWriter: prisma,
+      legacyWriter: prisma,
+    });
+    expect(resolved).toBe(prisma);
+  });
+
+  // Test G (heterogeneous recast): a legacy-resident batch (row only on the legacy DB) commits on
+  // the LEGACY writer; the NEW DB is left with zero rows for the batch.
+  heteroPostgresTest(
+    "legacy-resident batch routes to the LEGACY writer, new DB untouched",
+    async ({ prisma14, prisma17 }) => {
+      const seededLegacy = await seedEnvironment(prisma14, "g14");
+      const batchId = "c".repeat(25);
+      await seedBatch(prisma14, {
+        id: batchId,
+        friendlyId: "batch_friendly_g",
+        runtimeEnvironmentId: seededLegacy.environment.id,
+      });
+
+      // The batch row is absent from the new DB, so the probe falls through to legacy.
+      const resolved = await resolveBatchRunOpsWriter(batchId, {
+        newReplica: prisma17,
+        newWriter: prisma17,
+        legacyWriter: prisma14,
+      });
+      expect(resolved).toBe(prisma14);
+
+      const completionDeps: BatchCompletionDeps = {
+        splitEnabled: true,
+        newReplica: prisma17,
+        newWriter: prisma17,
+        legacyWriter: prisma14,
+        tryCompleteBatch: async () => {},
+      };
+
+      const completion: CompleteBatchResult = {
+        batchId,
+        runIds: ["run_friendly_1"],
+        successfulRunCount: 1,
+        failedRunCount: 1,
+        failures: [failure(0, "TRIGGER_ERROR")],
+      };
+
+      await handleBatchCompletion(completion, completionDeps);
+
+      // Status and error row both landed on the legacy DB.
+      const legacyBatch = await prisma14.batchTaskRun.findFirstOrThrow({ where: { id: batchId } });
+      expect(legacyBatch.status).toBe("PARTIAL_FAILED");
+      const legacyErrorRows = await prisma14.batchTaskRunError.findMany({
+        where: { batchTaskRunId: batchId },
+      });
+      expect(legacyErrorRows).toHaveLength(1);
+
+      // Nothing for this batch exists on the new DB — no misroute.
+      const batchesOnNew = await prisma17.batchTaskRun.findMany({ where: { id: batchId } });
+      expect(batchesOnNew).toHaveLength(0);
+      const errorRowsOnNew = await prisma17.batchTaskRunError.findMany({
+        where: { batchTaskRunId: batchId },
+      });
+      expect(errorRowsOnNew).toHaveLength(0);
+    }
+  );
+
+  // Regression: the real "run-ops DB connected, split flag off" state. splitEnabled
+  // is false, yet newWriter is a DISTINCT (empty) DB while the batch lives on legacy.
+  // Old code wrote to newWriter -> "No record was found for an update" -> batch hangs.
+  heteroPostgresTest(
+    "split-off connected-but-off: legacy-resident batch routes to LEGACY, not newWriter",
+    async ({ prisma14, prisma17 }) => {
+      const seededLegacy = await seedEnvironment(prisma14, "off14");
+      const batchId = "c".repeat(25);
+      await seedBatch(prisma14, {
+        id: batchId,
+        friendlyId: "batch_friendly_off",
+        runtimeEnvironmentId: seededLegacy.environment.id,
+      });
+
+      const completionDeps: BatchCompletionDeps = {
+        splitEnabled: false,
+        newReplica: prisma17,
+        newWriter: prisma17,
+        legacyWriter: prisma14,
+        tryCompleteBatch: async () => {},
+      };
+
+      const completion: CompleteBatchResult = {
+        batchId,
+        runIds: ["run_friendly_1"],
+        successfulRunCount: 1,
+        failedRunCount: 1,
+        failures: [failure(0, "TRIGGER_ERROR")],
+      };
+
+      await handleBatchCompletion(completion, completionDeps);
+
+      // Legacy holds the committed result; the distinct newWriter DB holds nothing.
+      const legacyBatch = await prisma14.batchTaskRun.findFirstOrThrow({ where: { id: batchId } });
+      expect(legacyBatch.status).toBe("PARTIAL_FAILED");
+      expect(legacyBatch.processingCompletedAt).not.toBeNull();
+      const legacyErrorRows = await prisma14.batchTaskRunError.findMany({
+        where: { batchTaskRunId: batchId },
+      });
+      expect(legacyErrorRows).toHaveLength(1);
+
+      const batchesOnNew = await prisma17.batchTaskRun.findMany({ where: { id: batchId } });
+      expect(batchesOnNew).toHaveLength(0);
+      const errorRowsOnNew = await prisma17.batchTaskRunError.findMany({
+        where: { batchTaskRunId: batchId },
+      });
+      expect(errorRowsOnNew).toHaveLength(0);
+    }
+  );
+
+  // Test H (heterogeneous recast): a new batch (row only on the new DB) commits on the NEW
+  // writer; the LEGACY DB is untouched.
+  heteroPostgresTest(
+    "new batch routes to the NEW writer, legacy DB untouched",
+    async ({ prisma14, prisma17 }) => {
+      const seededNew = await seedEnvironment(prisma17, "h17");
+      const batchId = "d".repeat(25);
+      await seedBatch(prisma17, {
+        id: batchId,
+        friendlyId: "batch_friendly_h",
+        runtimeEnvironmentId: seededNew.environment.id,
+      });
+
+      const resolved = await resolveBatchRunOpsWriter(batchId, {
+        newReplica: prisma17,
+        newWriter: prisma17,
+        legacyWriter: prisma14,
+      });
+      expect(resolved).toBe(prisma17);
+
+      const completionDeps: BatchCompletionDeps = {
+        splitEnabled: true,
+        newReplica: prisma17,
+        newWriter: prisma17,
+        legacyWriter: prisma14,
+        tryCompleteBatch: async () => {},
+      };
+
+      const completion: CompleteBatchResult = {
+        batchId,
+        runIds: ["run_friendly_1"],
+        successfulRunCount: 1,
+        failedRunCount: 1,
+        failures: [failure(0, "TRIGGER_ERROR", { options: { json: { deep: [1, 2, 3] } } })],
+      };
+
+      await handleBatchCompletion(completion, completionDeps);
+
+      const newBatch = await prisma17.batchTaskRun.findFirstOrThrow({ where: { id: batchId } });
+      expect(newBatch.status).toBe("PARTIAL_FAILED");
+      const errorRowsOnNew = await prisma17.batchTaskRunError.findMany({
+        where: { batchTaskRunId: batchId },
+      });
+      expect(errorRowsOnNew).toHaveLength(1);
+      // The nested JSON options survive the round-trip through the new DB.
+      expect(errorRowsOnNew[0]!.options).toEqual({ json: { deep: [1, 2, 3] } });
+
+      // No batch row was written to the legacy DB.
+      const batchesOnLegacy = await prisma14.batchTaskRun.findMany({ where: { id: batchId } });
+      expect(batchesOnLegacy).toHaveLength(0);
+    }
+  );
+});
diff --git a/apps/webapp/test/runOpsCrossSeamGuard.test.ts b/apps/webapp/test/runOpsCrossSeamGuard.test.ts
new file mode 100644
index 00000000000..5f232c08693
--- /dev/null
+++ b/apps/webapp/test/runOpsCrossSeamGuard.test.ts
@@ -0,0 +1,134 @@
+import { describe, it, expect } from "vitest";
+import {
+  computeStoreForCompletion,
+  selectStoreForWaitpoint,
+} from "~/v3/runOpsMigration/crossSeamGuard.server";
+import { UnclassifiableRunId } from "@trigger.dev/core/v3/isomorphic";
+
+// Real sample ids exercising the genuine run-id residency classifier (no stub).
+const NEW = "waitpoint_" + "a".repeat(27); // 27-char ksuid body -> NEW
+const LEGACY = "waitpoint_" + "a".repeat(25); // 25-char cuid body -> LEGACY
+const AMBIGUOUS = "waitpoint_" + "a".repeat(10); // neither length -> throws
+
+describe("selectStoreForWaitpoint — happy-path residency routing", () => {
+  it("MANUAL completion of a NEW waitpoint selects the new store", () => {
+    const decision = selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "MANUAL" });
+    expect(decision.store).toBe("new");
+    expect(decision.residency).toBe("NEW");
+  });
+
+  it("RESUME_TOKEN completion of a LEGACY waitpoint selects the legacy store", () => {
+    const decision = selectStoreForWaitpoint({ waitpointId: LEGACY, routeKind: "RESUME_TOKEN" });
+    expect(decision.store).toBe("legacy");
+    expect(decision.residency).toBe("LEGACY");
+  });
+
+  it("DATETIME completion of a NEW waitpoint selects the new store", () => {
+    const decision = selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "DATETIME" });
+    expect(decision.store).toBe("new");
+  });
+
+  it("RUN completion of a NEW waitpoint selects the new store", () => {
+    const decision = selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "RUN" });
+    expect(decision.store).toBe("new");
+  });
+
+  it("IDEMPOTENCY_REUSE of a NEW waitpoint with no pins selects the new store", () => {
+    const decision = selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "IDEMPOTENCY_REUSE" });
+    expect(decision.store).toBe("new");
+    expect(decision.pinnedReason).toBeUndefined();
+  });
+});
+
+describe("selectStoreForWaitpoint — legacy pins", () => {
+  it("pins a NEW-residency waitpoint to legacy when non-tree-owned", () => {
+    const decision = selectStoreForWaitpoint({
+      waitpointId: NEW,
+      routeKind: "MANUAL",
+      treeOwnerResidency: "LEGACY",
+    });
+    expect(decision.store).toBe("legacy");
+    expect(decision.pinnedReason).toBe("non-tree-owned");
+  });
+
+  it("pins cross-tree idempotency reuse to legacy", () => {
+    const decision = selectStoreForWaitpoint({
+      waitpointId: NEW,
+      routeKind: "IDEMPOTENCY_REUSE",
+      isCrossTreeIdempotency: true,
+    });
+    expect(decision.store).toBe("legacy");
+    expect(decision.pinnedReason).toBe("cross-tree-idempotency");
+  });
+
+  it("pins a descendant of a legacy parent to legacy", () => {
+    const decision = selectStoreForWaitpoint({
+      waitpointId: NEW,
+      routeKind: "RUN",
+      hasLegacyParent: true,
+    });
+    expect(decision.store).toBe("legacy");
+    expect(decision.pinnedReason).toBe("legacy-parent-descendant");
+  });
+
+  it("applies deterministic pin precedence: non-tree-owned wins", () => {
+    // All three pins are set at once; only the reported reason distinguishes them.
+    const decision = selectStoreForWaitpoint({
+      waitpointId: NEW,
+      routeKind: "RUN",
+      treeOwnerResidency: "LEGACY",
+      isCrossTreeIdempotency: true,
+      hasLegacyParent: true,
+    });
+    expect(decision.store).toBe("legacy");
+    expect(decision.pinnedReason).toBe("non-tree-owned");
+  });
+
+  it("reports the waitpoint's own residency even when pinned to legacy", () => {
+    const decision = selectStoreForWaitpoint({
+      waitpointId: NEW,
+      routeKind: "MANUAL",
+      treeOwnerResidency: "LEGACY",
+    });
+    expect(decision.store).toBe("legacy");
+    expect(decision.residency).toBe("NEW");
+  });
+});
+
+describe("selectStoreForWaitpoint — ambiguity and unknown routes are loud", () => {
+  it("rethrows UnclassifiableRunId for an ambiguous-length id (never silently 
routes)", () => {
+    expect(() => selectStoreForWaitpoint({ waitpointId: AMBIGUOUS, routeKind: "MANUAL" })).toThrow(
+      UnclassifiableRunId
+    );
+  });
+
+  it("throws when an unknown routeKind is supplied", () => {
+    expect(() =>
+      // @ts-expect-error deliberately invalid kind (no `as any`: the cast would leave this directive unused)
+      selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "WAT" })
+    ).toThrow();
+  });
+});
+
+describe("computeStoreForCompletion — single-DB no-op + flag wrapper", () => {
+  it("returns the single store without classifying when split is OFF", () => {
+    const calls: string[] = [];
+    const d = computeStoreForCompletion(
+      { waitpointId: AMBIGUOUS, routeKind: "MANUAL" },
+      {
+        splitEnabled: false,
+        classify: (id) => {
+          calls.push(id);
+          return "NEW";
+        },
+      }
+    );
+    expect(d.store).toBe("legacy"); // the single store
+    expect(calls).toEqual([]); // classifier never consulted
+  });
+
+  it("delegates to selectStoreForWaitpoint when split is ON", () => {
+    const d = computeStoreForCompletion(
+      { waitpointId: NEW, routeKind: "MANUAL" },
+      { splitEnabled: true }
+    );
+    expect(d.store).toBe("new");
+  });
+});
diff --git a/apps/webapp/test/runOpsDbTopology.test.ts b/apps/webapp/test/runOpsDbTopology.test.ts
new file mode 100644
index 00000000000..b33c0db7fd3
--- /dev/null
+++ b/apps/webapp/test/runOpsDbTopology.test.ts
@@ -0,0 +1,121 @@
+import { PostgreSqlContainer } from "@testcontainers/postgresql";
+import { describe, expect, it, vi } from "vitest";
+import { buildReplicaClient, buildWriterClient, selectRunOpsTopology } from "~/db.server";
+
+const cp = { writer: {} as any, replica: {} as any };
+
+describe("selectRunOpsTopology (pure)", () => {
+  it("split OFF: all run-ops handles collapse to control-plane and NO client is built", () => {
+    const buildNewWriter = vi.fn();
+    const buildNewReplica = vi.fn();
+    const topo = selectRunOpsTopology(
+      { splitEnabled: false, legacyUrl: "postgres://a", newUrl: "postgres://b" },
+      { controlPlane: cp, buildNewWriter, buildNewReplica }
+    );
+    // new run-ops collapses to the control-plane client refs (no second connection).
+    expect(topo.newRunOps.writer).toBe(cp.writer);
+    expect(topo.newRunOps.replica).toBe(cp.replica);
+    expect(topo.legacyRunOps).toBe(cp);
+    expect(topo.controlPlane).toBe(cp);
+    expect(buildNewWriter).not.toHaveBeenCalled(); // no second connection opened
+    expect(buildNewReplica).not.toHaveBeenCalled();
+  });
+
+  it("split ON: new-run-ops builds its own writer + replica; cp/legacy reuse cp", () => {
+    const newWriter = { tag: "nw" } as any;
+    const newReplica = { tag: "nr" } as any;
+    const buildNewWriter = vi.fn().mockReturnValue(newWriter);
+    const buildNewReplica = vi.fn().mockReturnValue(newReplica);
+    const topo = selectRunOpsTopology(
+      {
+        splitEnabled: true,
+        legacyUrl: "postgres://legacy",
+        newUrl: "postgres://new",
+        newReplicaUrl: "postgres://new-r",
+      },
+      { controlPlane: cp, buildNewWriter, buildNewReplica }
+    );
+    expect(topo.newRunOps.writer).toBe(newWriter);
+    expect(topo.newRunOps.replica).toBe(newReplica);
+    expect(topo.controlPlane).toBe(cp);
+    expect(topo.legacyRunOps).toBe(cp); // legacy run-ops shares the control-plane server initially
+    expect(buildNewWriter).toHaveBeenCalledTimes(1);
+  });
+
+  it("split ON without a new replica URL: replica falls back to the new writer", () => {
+    const newWriter = { tag: "nw" } as any;
+    const buildNewWriter = vi.fn().mockReturnValue(newWriter);
+    const buildNewReplica = vi.fn();
+    const topo = selectRunOpsTopology(
+      { splitEnabled: true, legacyUrl: "postgres://legacy", newUrl: "postgres://new" },
+      { controlPlane: cp, buildNewWriter, buildNewReplica }
+    );
+    expect(topo.newRunOps.replica).toBe(newWriter);
+    expect(buildNewReplica).not.toHaveBeenCalled();
+  });
+});
+
+describe("selectRunOpsTopology (integration, real containers)", () => {
+  it("split OFF: opens exactly one DB; all run-ops handles share the control-plane client", async () => {
+    const pg = await new PostgreSqlContainer("docker.io/postgres:14").start();
+    try {
+      const cpWriter = buildWriterClient({ url: pg.getConnectionUri(), clientType: "cp" });
+      const cp = { writer: cpWriter, replica: cpWriter };
+      const builtUrls: string[] = [];
+      const topo = selectRunOpsTopology(
+        { splitEnabled: false, legacyUrl: pg.getConnectionUri(), newUrl: pg.getConnectionUri() },
+        {
+          controlPlane: cp,
+          buildNewWriter: (url) => {
+            builtUrls.push(url);
+            return buildWriterClient({ url, clientType: "x" }) as any;
+          },
+          buildNewReplica: (url) => {
+            builtUrls.push(url);
+            return buildReplicaClient({ url, clientType: "x" }) as any;
+          },
+        }
+      );
+      expect(builtUrls).toHaveLength(0); // no second connection opened
+      expect(topo.newRunOps.writer).toBe(cp.writer);
+      expect(topo.newRunOps.replica).toBe(cp.replica);
+      expect(topo.legacyRunOps).toBe(cp);
+      await topo.newRunOps.writer.$queryRawUnsafe("SELECT 1");
+      await cpWriter.$disconnect();
+    } finally {
+      await pg.stop();
+    }
+  }, 60_000);
+
+  it("split ON: constructs CP + legacy-run-ops + new-run-ops + replicas (legacy + new)", async () => {
+    const rds = await new PostgreSqlContainer("docker.io/postgres:14").start();
+    const ps = await new PostgreSqlContainer("docker.io/postgres:17").start();
+    try {
+      const cpWriter = buildWriterClient({ url: rds.getConnectionUri(), clientType: "cp" });
+      const cp = { writer: cpWriter, replica: cpWriter };
+      const topo = selectRunOpsTopology(
+        { splitEnabled: true, legacyUrl: rds.getConnectionUri(), newUrl: ps.getConnectionUri() },
+        {
+          controlPlane: cp,
+          buildNewWriter: (url, ct) => buildWriterClient({ url, clientType: ct }) as any,
+          buildNewReplica: (url, ct) => buildReplicaClient({ url, clientType: ct }) as any,
+        }
+      );
+      // CP + legacy resolve to the legacy/control-plane pair; new run-ops is the dedicated run-ops box.
+      expect(topo.controlPlane).toBe(cp);
+      expect(topo.legacyRunOps).toBe(cp);
+      expect(topo.newRunOps.writer).not.toBe(cpWriter);
+      await topo.controlPlane.writer.$queryRawUnsafe("SELECT 1");
+      await topo.newRunOps.writer.$queryRawUnsafe("SELECT 1");
+      const ver = await topo.newRunOps.writer.$queryRawUnsafe<Array<{ v: string }>>(
+        "SELECT current_setting('server_version') AS v"
+      );
+      expect(ver[0].v.startsWith("17")).toBe(true); // new run-ops really is the dedicated box
+      await cpWriter.$disconnect();
+      await topo.newRunOps.writer.$disconnect();
+    } finally {
+      await rds.stop();
+      await ps.stop();
+    }
+  }, 120_000);
+});
diff --git a/apps/webapp/test/runOpsMintCutover.test.ts b/apps/webapp/test/runOpsMintCutover.test.ts
new file mode 100644
index 00000000000..d838a382e92
--- /dev/null
+++ b/apps/webapp/test/runOpsMintCutover.test.ts
@@ -0,0 +1,193 @@
+// Per-env KSUID mint cutover integration proof.
+//
+// NEVER mocks the DB: the mint decision runs through the pure core `computeRunIdMintKind`
+// wired to a REAL `makeFlag(prisma)` that reads the REAL `Organization.featureFlags` /
+// `FeatureFlag` rows in a testcontainers Postgres. Only the two boundary knobs
+// are injected — `masterEnabled` and the `splitEnabled` boot-boolean — never a
+// mocked DB. The KSUID/cuid format + residency are then proven through the SAME isomorphic
+// helpers the real trigger path uses (`generateKsuidId` / `RunId.toFriendlyId` /
+// `RunId.fromFriendlyId` / `ownerEngine`).
+import type { PrismaClient } from "@trigger.dev/database";
+import { generateKsuidId, ownerEngine, RunId } from "@trigger.dev/core/v3/isomorphic";
+import { postgresTest } from "@internal/testcontainers";
+import { describe, expect, vi } from "vitest";
+import {
+  computeRunIdMintKind,
+  type RunIdMintKind,
+} from "~/v3/runOpsMigration/runOpsMintKind.server";
+import { FEATURE_FLAG } from "~/v3/featureFlags";
+import { makeFlag } from "~/v3/featureFlags.server";
+import {
+  createTestOrgProjectWithMember,
+  createRuntimeEnvironment,
+  uniqueId,
+} from "./fixtures/environmentVariablesFixtures";
+
+vi.setConfig({ testTimeout: 60_000 });
+
+// Mints a run friendlyId from a freshly generated KSUID. Per the original note this is a
+// verbatim copy of the helper in triggerTask.server.ts, so the id format matches a cut-over env.
+function mintRunKsuidFriendlyId(): string {
+  const ksuid = generateKsuidId();
+  return RunId.toFriendlyId(ksuid);
+}
+
+// Turns a resolved mint kind into a friendlyId: "ksuid" takes the KSUID helper above,
+// anything else takes the default RunId.generate() (cuid) path.
+function mintRunFriendlyId(kind: RunIdMintKind): string {
+  if (kind === "ksuid") {
+    return mintRunKsuidFriendlyId();
+  }
+  return RunId.generate().friendlyId;
+}
+
+// Seeds an org + project + PRODUCTION environment. When `mintFlag` is given, it is written
+// to the org's featureFlags under the runOpsMintKsuid key.
+async function seedOrgEnv(prisma: PrismaClient, mintFlag?: RunIdMintKind) {
+  const seeded = await createTestOrgProjectWithMember(prisma);
+  const { organization } = seeded;
+  const environment = await createRuntimeEnvironment(prisma, {
+    projectId: seeded.project.id,
+    organizationId: organization.id,
+    type: "PRODUCTION",
+    slug: uniqueId("prod"),
+  });
+  if (mintFlag) {
+    const featureFlags = { [FEATURE_FLAG.runOpsMintKsuid]: mintFlag };
+    await prisma.organization.update({
+      where: { id: organization.id },
+      data: { featureFlags },
+    });
+  }
+  return { organization, environment };
+}
+
+// Build the env-bound `flag` dependency around a REAL makeFlag(prisma) reading the real
+// Organization.featureFlags override store. Pure-core gets the real DB-backed flag; only
+// masterEnabled + splitEnabled are injected boundary config.
+function realFlag(prisma: PrismaClient) {
+  const flagFn = makeFlag(prisma);
+  return async (orgId: string, orgFeatureFlags: unknown | undefined): Promise<RunIdMintKind> => {
+    const overrides =
+      orgFeatureFlags !== undefined
+        ? orgFeatureFlags // caller already holds the org's flags: no DB read
+        : (
+            await prisma.organization.findFirst({
+              where: { id: orgId },
+              select: { featureFlags: true },
+            })
+          )?.featureFlags;
+    return flagFn({
+      key: FEATURE_FLAG.runOpsMintKsuid,
+      defaultValue: "cuid",
+      overrides: (overrides as Record<string, unknown>) ?? {}, // missing org / null flags -> no overrides
+    });
+  };
+}
+
+describe("per-env KSUID mint cutover", () => {
+  postgresTest(
+    "canary org mints KSUID/NEW; non-canary org mints cuid/LEGACY",
+    async ({ prisma }) => {
+      const a = await seedOrgEnv(prisma, "ksuid"); // canary
+      const b = await seedOrgEnv(prisma); // not cut over
+
+      const flag = realFlag(prisma);
+      const deps = { masterEnabled: true, splitEnabled: async () => true, flag };
+
+      const kindA = await computeRunIdMintKind(
+        { organizationId: a.organization.id, id: a.environment.id },
+        deps
+      );
+      const kindB = await computeRunIdMintKind(
+        { organizationId: b.organization.id, id: b.environment.id },
+        deps
+      );
+
+      expect(kindA).toBe("ksuid");
+      expect(kindB).toBe("cuid");
+
+      const friendlyA = mintRunFriendlyId(kindA);
+      const friendlyB = mintRunFriendlyId(kindB);
+
+      expect(RunId.fromFriendlyId(friendlyA).length).toBe(27);
+      expect(ownerEngine(RunId.fromFriendlyId(friendlyA))).toBe("NEW");
+
+      expect(RunId.fromFriendlyId(friendlyB).length).toBe(25);
+      expect(ownerEngine(RunId.fromFriendlyId(friendlyB))).toBe("LEGACY");
+    }
+  );
+
+  postgresTest(
+    "split OFF mints cuid even for a flagged-ksuid org (split gate dominates)",
+    async ({ prisma }) => {
+      const a = await seedOrgEnv(prisma, "ksuid");
+      const flag = vi.fn(realFlag(prisma));
+
+      const kind = await computeRunIdMintKind(
+        { organizationId: a.organization.id, id: a.environment.id },
+        { masterEnabled: true, splitEnabled: async () => false, flag }
+      );
+
+      expect(kind).toBe("cuid");
+      expect(flag).not.toHaveBeenCalled(); // gated off before any DB read
+    }
+  );
+
+  postgresTest(
+    "drain-new-forward (D8): flipping back to cuid stops new KSUID mints without reverting existing",
+    async ({ prisma }) => {
+      const a = await seedOrgEnv(prisma, "ksuid");
+      const flag = realFlag(prisma);
+      const deps = { masterEnabled: true, splitEnabled: async () => true, flag };
+
+      // First run is born KSUID/NEW while cut over.
+      const firstKind = await computeRunIdMintKind(
+        { organizationId: a.organization.id, id: a.environment.id },
+        deps
+      );
+      const firstFriendly = mintRunFriendlyId(firstKind);
+      expect(firstKind).toBe("ksuid");
+      expect(ownerEngine(RunId.fromFriendlyId(firstFriendly))).toBe("NEW");
+
+      // Roll the org back to cuid (drain-new-forward — set the flag to "cuid").
+      await prisma.organization.update({
+        where: { id: a.organization.id },
+        data: { featureFlags: { [FEATURE_FLAG.runOpsMintKsuid]: "cuid" } },
+      });
+
+      // The NEXT run mints cuid again (the env-bound resolver's TTL cache is not used here,
+      // so the flip is observed immediately — production waits one cache TTL).
+      const nextKind = await computeRunIdMintKind(
+        { organizationId: a.organization.id, id: a.environment.id },
+        deps
+      );
+      const nextFriendly = mintRunFriendlyId(nextKind);
+      expect(nextKind).toBe("cuid");
+      expect(ownerEngine(RunId.fromFriendlyId(nextFriendly))).toBe("LEGACY");
+
+      // The already-minted KSUID run is untouched — drain-new-forward never reverts it.
+      expect(RunId.fromFriendlyId(firstFriendly).length).toBe(27);
+      expect(ownerEngine(RunId.fromFriendlyId(firstFriendly))).toBe("NEW");
+    }
+  );
+
+  postgresTest(
+    "parent and child re-resolve independently from their own org flag",
+    async ({ prisma }) => {
+      // Parent lives in a cut-over org; child is triggered into a NON-cut-over org.
+      const parentOrg = await seedOrgEnv(prisma, "ksuid");
+      const childOrg = await seedOrgEnv(prisma); // not cut over
+      const flag = realFlag(prisma);
+      const deps = { masterEnabled: true, splitEnabled: async () => true, flag };
+
+      const parentKind = await computeRunIdMintKind(
+        { organizationId: parentOrg.organization.id, id: parentOrg.environment.id },
+        deps
+      );
+      const childKind = await computeRunIdMintKind(
+        { organizationId: childOrg.organization.id, id: childOrg.environment.id },
+        deps
+      );
+
+      // Observed behavior: the mint decision is resolved per the run's OWN org/env flag —
+      // it does NOT inherit the parent's residency. A child in a non-cut-over org mints cuid
+      // even when its parent was born KSUID. If children must inherit, that inheritance
+      // belongs to the child-trigger path, not this resolver.
+      expect(parentKind).toBe("ksuid");
+      expect(childKind).toBe("cuid");
+    }
+  );
+});
diff --git a/apps/webapp/test/runOpsSplitMode.test.ts b/apps/webapp/test/runOpsSplitMode.test.ts
new file mode 100644
index 00000000000..826dd37c09e
--- /dev/null
+++ b/apps/webapp/test/runOpsSplitMode.test.ts
@@ -0,0 +1,95 @@
+import { describe, expect, it, vi } from "vitest";
+// @testcontainers/postgresql resolves because it is declared in apps/webapp/package.json.
+import { PostgreSqlContainer } from "@testcontainers/postgresql";
+import { computeSplitEnabled } from "~/v3/runOpsMigration/splitMode.server";
+import { probeDistinctDatabases } from "~/v3/runOpsMigration/distinctDbSentinel.server";
+
+describe("computeSplitEnabled (pure)", () => {
+  // The placeholder URL pair shared by every case that supplies both URLs.
+  const bothUrls = { legacyUrl: "postgres://a", newUrl: "postgres://b" };
+
+  it("is OFF by default and never probes when the flag is off", async () => {
+    const probe = vi.fn();
+    const enabled = await computeSplitEnabled({ flagEnabled: false, ...bothUrls }, { probe });
+    expect(enabled).toBe(false);
+    expect(probe).not.toHaveBeenCalled(); // self-host opens no second connection
+  });
+
+  it("stays single-DB when flag is on but URLs are missing", async () => {
+    const probe = vi.fn();
+    const enabled = await computeSplitEnabled({ flagEnabled: true }, { probe });
+    expect(enabled).toBe(false);
+    expect(probe).not.toHaveBeenCalled();
+  });
+
+  it("enables split only when flag is on AND sentinel confirms distinct", async () => {
+    const probe = vi.fn().mockResolvedValue({ distinct: true });
+    const enabled = await computeSplitEnabled({ flagEnabled: true, ...bothUrls }, { probe });
+    expect(enabled).toBe(true);
+  });
+
+  it("stays single-DB when sentinel reports NOT distinct", async () => {
+    const probe = vi.fn().mockResolvedValue({ distinct: false, reason: "same DB" });
+    const enabled = await computeSplitEnabled({ flagEnabled: true, ...bothUrls }, { probe });
+    expect(enabled).toBe(false);
+  });
+
+  // Migration-family unreachability proof: with the flag off the gate returns false and
+  // no probe runs. Downstream migration-family code is required to early-return on
+  // !isSplitEnabled(); this unit proves the gate's value, each downstream unit's own test
+  // proves it honors the gate. Split OFF collapsing to a single prisma/$replica pair with
+  // no second connection opened depends on this no-probe behavior.
+  it("is provably unreachable (no probe) when the flag is off", async () => {
+    const probe = vi.fn();
+    const enabled = await computeSplitEnabled({ flagEnabled: false, ...bothUrls }, { probe });
+    expect(enabled).toBe(false);
+    expect(probe).not.toHaveBeenCalled();
+  });
+});
+
+describe("distinct-DB sentinel (real Postgres)", () => {
+  it("reports NOT distinct when both URLs hit the same physical cluster", async () => {
+    const pg = await new PostgreSqlContainer("docker.io/postgres:14").start();
+    try {
+      const sameUrl = pg.getConnectionUri();
+      const verdict = await probeDistinctDatabases(sameUrl, sameUrl);
+      expect(verdict.distinct).toBe(false); // identical URL -> false-split prevented
+    } finally {
+      await pg.stop();
+    }
+  }, 60_000);
+
+  it("reports distinct when URLs hit two separate clusters (legacy + new)", async () => {
+    const legacy = await new PostgreSqlContainer("docker.io/postgres:14").start();
+    const next = await new PostgreSqlContainer("docker.io/postgres:17").start();
+    try {
+      const legacyUrl = legacy.getConnectionUri();
+      const nextUrl = next.getConnectionUri();
+      const verdict = await probeDistinctDatabases(legacyUrl, nextUrl);
+      expect(verdict.distinct).toBe(true);
+    } finally {
+      await legacy.stop();
+      await next.stop();
+    }
+  }, 120_000);
+
+  it("fails closed (single-DB) when a DB is unreachable", async () => {
+    // Neither URL points at a container started by this test.
+    const verdict = await probeDistinctDatabases(
+      "postgresql://nouser:nopass@127.0.0.1:1/none",
+      "postgresql://nouser:nopass@127.0.0.1:2/none"
+    );
+    expect(verdict.distinct).toBe(false);
+  }, 30_000);
+});
diff --git a/apps/webapp/test/runOpsSplitReadGate.test.ts b/apps/webapp/test/runOpsSplitReadGate.test.ts
new file mode 100644
index 00000000000..c9238bcff34
--- /dev/null
+++ b/apps/webapp/test/runOpsSplitReadGate.test.ts
@@ -0,0 +1,75 @@
+import { describe, expect, it } from "vitest";
+import { computeRunOpsSplitReadEnabled } from "~/v3/runOpsMigration/runOpsSplitReadGate";
+
+// Distinct sentinel objects standing in for the prisma client singletons.
+const cpWriter = { __tag: "cp-writer" };
+const cpReplica = { __tag: "cp-replica" };
+const dedicatedNew = { __tag: "dedicated-new" };
+
+describe("computeRunOpsSplitReadEnabled", () => {
+  // Baseline input: a dedicated NEW client, distinct from both control-plane handles, with
+  // both URLs configured. Each case below spreads this and overrides only what it varies.
+  const dedicatedSplit = {
+    newReplica: dedicatedNew,
+    controlPlaneWriter: cpWriter,
+    controlPlaneReplica: cpReplica,
+    hasNewUrl: true,
+    hasLegacyUrl: true,
+  };
+
+  it("enables split when a distinct dedicated NEW client is open and both URLs are set", () => {
+    expect(computeRunOpsSplitReadEnabled({ ...dedicatedSplit })).toBe(true);
+  });
+
+  // Regression: the LEGACY run-ops handle IS the control-plane replica by design. The gate must
+  // depend only on the NEW client's distinctness — never on the legacy handle differing from CP.
+  it("stays enabled even though the legacy handle equals the control-plane replica", () => {
+    // The CP replica slot holds cpReplica, which per the note above is also the legacy run-ops
+    // replica; NEW is still distinct, so split must remain ON. (A gate that required
+    // legacy !== CP would be false here.)
+    expect(computeRunOpsSplitReadEnabled({ ...dedicatedSplit })).toBe(true);
+  });
+
+  it("disables split when NEW falls back to the control-plane client (no dedicated DB)", () => {
+    // cpFallback: NEW === control-plane replica
+    const input = { ...dedicatedSplit, newReplica: cpReplica };
+    expect(computeRunOpsSplitReadEnabled(input)).toBe(false);
+  });
+
+  it("disables split when NEW equals the control-plane writer", () => {
+    const input = { ...dedicatedSplit, newReplica: cpWriter };
+    expect(computeRunOpsSplitReadEnabled(input)).toBe(false);
+  });
+
+  it("disables split when either URL is missing, even with a distinct client", () => {
+    const withoutNewUrl = { ...dedicatedSplit, hasNewUrl: false };
+    const withoutLegacyUrl = { ...dedicatedSplit, hasLegacyUrl: false };
+    expect(computeRunOpsSplitReadEnabled(withoutNewUrl)).toBe(false);
+    expect(computeRunOpsSplitReadEnabled(withoutLegacyUrl)).toBe(false);
+  });
+});
diff --git a/apps/webapp/test/services.controlPlane.readthrough.test.ts b/apps/webapp/test/services.controlPlane.readthrough.test.ts
new file mode 100644
index 00000000000..11ddb8dd0bf
--- /dev/null
+++ b/apps/webapp/test/services.controlPlane.readthrough.test.ts
@@ -0,0 +1,115 @@
+// Real PG14 (control-plane) + PG17 (run-ops) proof for the run-rooted services that were
+// decomposed onto the ControlPlaneResolver. The env (slug/project/org) lives on PG14;
+// the run-ops scalar row on PG17 with cross-seam FKs dropped. A PostgresRunStore over PG17 reads
+// run scalars; the ControlPlaneResolver over PG14 resolves the env. The DB is never mocked. The
+// .count() proof shows neither DB joins the other.
+import { heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +vi.setConfig({ testTimeout: 60_000, hookTimeout: 60_000 }); + +const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", +] as const; + +async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const c of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe(`ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${c}"`); + } +} + +let n = 0; +async function seedControlPlane(prisma: PrismaClient) { + const s = n++; + const organization = await prisma.organization.create({ + data: { title: `Org ${s}`, slug: `org-${s}` }, + }); + const project = await prisma.project.create({ + data: { + name: `P ${s}`, + slug: `p-${s}`, + externalRef: `proj_${s}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${s}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_${s}`, + pkApiKey: `pk_${s}`, + shortcode: `sc_${s}`, + }, + }); + return { organization, project, environment }; +} + +async function seedRun( + prisma: PrismaClient, + cp: { environment: { id: string }; project: { id: string }; organization: { id: string } } +) { + const s = n++; + return prisma.taskRun.create({ + data: { + id: `run_${s}_pg17`, + engine: "V2", + status: "PENDING", + friendlyId: `run_${s}`, + runtimeEnvironmentId: cp.environment.id, + projectId: cp.project.id, + organizationId: cp.organization.id, + taskIdentifier: "svc-task", + payload: "{}", + payloadType: "application/json", + queue: 
"task/svc-task", + traceId: `tr_${s}`, + spanId: `sp_${s}`, + workerQueue: "main", + }, + }); +} + +function buildResolver(cp: PrismaClient) { + return new ControlPlaneResolver({ + controlPlanePrimary: cp, + controlPlaneReplica: cp, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); +} + +describe("service control-plane read-through", () => { + heteroPostgresTest( + "expireEnqueuedRun: org id resolves from PG14 via resolveEnv while run scalars resolve from PG17", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedControlPlane(prisma14 as unknown as PrismaClient); + const run = await seedRun(prisma17 as unknown as PrismaClient, cp); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = buildResolver(prisma14 as unknown as PrismaClient); + + const found = await runStore.findRun( + { id: run.id }, + { select: { id: true, runtimeEnvironmentId: true } }, + prisma17 as unknown as PrismaClient + ); + const env = await resolver.resolveEnv(found!.runtimeEnvironmentId); + expect(env!.organizationId).toBe(cp.organization.id); + + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); diff --git a/apps/webapp/test/shape1RunDetailLoaders.controlPlane.readthrough.test.ts b/apps/webapp/test/shape1RunDetailLoaders.controlPlane.readthrough.test.ts new file mode 100644 index 00000000000..29fe5972ac9 --- /dev/null +++ b/apps/webapp/test/shape1RunDetailLoaders.controlPlane.readthrough.test.ts @@ -0,0 +1,176 @@ +// Dedicated run-ops proof: Shape-1 run-detail loaders read the run by friendlyId on the dedicated +// run-ops client (PG17, subset schema with no control-plane tables), then authorize membership + +// resolve env on PG14. 
Neither DB joins the other. +import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +vi.setConfig({ testTimeout: 60_000, hookTimeout: 60_000 }); + +let n = 0; +async function seedAll(prisma: PrismaClient) { + const s = n++; + const organization = await prisma.organization.create({ + data: { title: `Org ${s}`, slug: `org-${s}` }, + }); + const project = await prisma.project.create({ + data: { + name: `P ${s}`, + slug: `p-${s}`, + externalRef: `proj_${s}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${s}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_${s}`, + pkApiKey: `pk_${s}`, + shortcode: `sc_${s}`, + }, + }); + const member = await prisma.user.create({ + data: { email: `u-${s}@example.com`, name: `U ${s}`, authenticationMethod: "MAGIC_LINK" }, + }); + await prisma.orgMember.create({ + data: { organizationId: organization.id, userId: member.id, role: "ADMIN" }, + }); + const stranger = await prisma.user.create({ + data: { email: `x-${s}@example.com`, name: `X ${s}`, authenticationMethod: "MAGIC_LINK" }, + }); + return { organization, project, environment, member, stranger }; +} + +// [TEST-NEWSEED] The run lives on the dedicated run-ops client; its control-plane FKs are synthetic +// scalar ids pointing at rows that exist only on PG14 (the dedicated DB has no such tables). 
+async function seedKsuidRun(prisma17: RunOpsPrismaClient, cp: Awaited<ReturnType<typeof seedAll>>) { + const k = n++; + return prisma17.taskRun.create({ + data: { + id: `run_2abcDEF${k}ghijkLMNOPqrstuv`, + engine: "V2", + status: "COMPLETED_SUCCESSFULLY", + friendlyId: `run_2abcDEF${k}ghijkLMNOPqrstuv`, + runtimeEnvironmentId: cp.environment.id, + projectId: cp.project.id, + organizationId: cp.organization.id, + taskIdentifier: "shape1-task", + payload: "{}", + payloadType: "application/json", + queue: "task/shape1-task", + idempotencyKey: "idem-1", + spanId: `sp_${k}`, + traceId: `tr_${k}`, + number: 1, + workerQueue: "main", + }, + }); +} + +function wire(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + schemaVariant: "dedicated", + }); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: prisma14, + controlPlaneReplica: prisma14, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + return { runStore, resolver }; +} + +describe("Shape-1 run-detail loaders cross-DB read-through (dedicated run-ops client)", () => { + heteroRunOpsPostgresTest( + "ksuid run resolves: friendlyId read on the dedicated run-ops DB + membership/env auth on PG14 (resources.runs.$runParam shape)", + async ({ prisma14, prisma17 }) => { + const cp14 = prisma14 as unknown as PrismaClient; + const cp = await seedAll(cp14); + const run = await seedKsuidRun(prisma17, cp); + const { runStore, resolver } = wire(cp14, prisma17); + + const found = await runStore.findRun( + { friendlyId: run.friendlyId }, + { + select: { + id: true, + traceId: true, + projectId: true, + runtimeEnvironmentId: true, + status: true, + queue: true, + spanId: true, + idempotencyKey: true, + taskIdentifier: true, + }, + } + ); + expect(found).not.toBeNull(); + expect(found!.id).toBe(run.id); + + const authorized = await cp14.project.findFirst({ + where: { id: 
found!.projectId, organization: { members: { some: { userId: cp.member.id } } } }, + select: { id: true }, + }); + expect(authorized).not.toBeNull(); + + const env = await resolver.resolveAuthenticatedEnv(found!.runtimeEnvironmentId); + expect(env!.slug).toBe(cp.environment.slug); + expect(env!.project.slug).toBe(cp.project.slug); + expect(env!.organization.slug).toBe(cp.organization.slug); + + // Inversion proof: no run on PG14 (control-plane). + expect(await cp14.taskRun.count()).toBe(0); + } + ); + + heteroRunOpsPostgresTest( + "non-member is denied: membership findFirst returns null (404/redirect path)", + async ({ prisma14, prisma17 }) => { + const cp14 = prisma14 as unknown as PrismaClient; + const cp = await seedAll(cp14); + const run = await seedKsuidRun(prisma17, cp); + const { runStore } = wire(cp14, prisma17); + + const found = await runStore.findRun( + { friendlyId: run.friendlyId }, + { select: { id: true, projectId: true, runtimeEnvironmentId: true } } + ); + expect(found).not.toBeNull(); + + const authorized = await cp14.project.findFirst({ + where: { id: found!.projectId, organization: { members: { some: { userId: cp.stranger.id } } } }, + select: { id: true }, + }); + expect(authorized).toBeNull(); + } + ); + + heteroRunOpsPostgresTest( + "env-slug-scoped routes: idempotencyKey.reset re-imposes env slug on the resolved env", + async ({ prisma14, prisma17 }) => { + const cp14 = prisma14 as unknown as PrismaClient; + const cp = await seedAll(cp14); + const run = await seedKsuidRun(prisma17, cp); + const { runStore, resolver } = wire(cp14, prisma17); + + const found = await runStore.findRun( + { friendlyId: run.friendlyId }, + { select: { id: true, idempotencyKey: true, taskIdentifier: true, projectId: true, runtimeEnvironmentId: true } } + ); + const env = await resolver.resolveAuthenticatedEnv(found!.runtimeEnvironmentId); + expect(env!.slug).toBe(cp.environment.slug); + expect(env!.slug === "does-not-match").toBe(false); + 
expect(found!.idempotencyKey).toBe("idem-1"); + } + ); +}); diff --git a/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts b/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts new file mode 100644 index 00000000000..322366125fb --- /dev/null +++ b/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts @@ -0,0 +1,213 @@ +import { heteroPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import { expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +// Control-plane datasource repoint (legacy DB -> new DB). +// +// Post-repoint the control plane lives on the new DB, so we model the new topology by seeding the +// control-plane rows on the new side (`prisma17`) and injecting it as both the resolver's primary +// and replica. `prisma14` stands in for the pre-repoint legacy source for the cross-version +// transition test. NEVER mock — we seed and read the real testcontainer clients, and we observe +// the DB boundary via a $extends query counter. + +// Cross-DB testcontainer spin-up + queries can exceed the 5s default on the first test. +vi.setConfig({ testTimeout: 60_000 }); + +let seedCounter = 0; + +/** + * Wraps a real testcontainer PrismaClient with a `$extends` query hook that increments a counter + * on every actual operation. NOT a mock: the returned client still issues the real query and + * returns real rows — we only observe the DB boundary (the countQueries pattern). 
+ */ +function countQueries(client: PrismaClient): { client: PrismaClient; reads: () => number } { + let count = 0; + const extended = client.$extends({ + query: { + async $allOperations({ args, query }) { + count++; + return query(args); + }, + }, + }) as unknown as PrismaClient; + return { client: extended, reads: () => count }; +} + +/** Seeds org -> project -> env + a pinned BackgroundWorker (+task) + TaskQueue + TaskSchedule. */ +async function seedControlPlane(prisma: PrismaClient) { + const n = seedCounter++; + const org = await prisma.organization.create({ + data: { title: `Org ${n}`, slug: `org-${n}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${n}`, + slug: `project-${n}`, + externalRef: `proj_${n}`, + organizationId: org.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${n}`, + projectId: project.id, + organizationId: org.id, + apiKey: `tr_prod_${n}`, + pkApiKey: `pk_prod_${n}`, + shortcode: `short_${n}`, + }, + }); + const worker = await prisma.backgroundWorker.create({ + data: { + friendlyId: `worker_${n}`, + contentHash: `hash_${n}`, + projectId: project.id, + runtimeEnvironmentId: environment.id, + version: `2024.1.${n}`, + metadata: {}, + engine: "V2", + }, + }); + await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: `task_${n}`, + slug: `my-task-${n}`, + filePath: "index.ts", + exportName: "myTask", + workerId: worker.id, + runtimeEnvironmentId: environment.id, + projectId: project.id, + }, + }); + const queue = await prisma.taskQueue.create({ + data: { + friendlyId: `queue_${n}`, + name: `task/my-task-${n}`, + runtimeEnvironmentId: environment.id, + projectId: project.id, + workers: { connect: { id: worker.id } }, + }, + }); + const schedule = await prisma.taskSchedule.create({ + data: { + friendlyId: `schedule_${n}`, + taskIdentifier: `my-task-${n}`, + generatorExpression: "0 * * * *", + projectId: project.id, + }, + }); 
+ return { org, project, environment, worker, queue, schedule }; +} + +// --- Repoint resolution (split ON, CP on the new DB) --------- + +heteroPostgresTest( + "control-plane references resolve against the repointed (new-DB) CP client", + async ({ prisma17 }) => { + const { environment, worker } = await seedControlPlane(prisma17); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: prisma17, + controlPlaneReplica: prisma17, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + expect(await resolver.resolveEnv(environment.id)).toMatchObject({ id: environment.id }); + expect( + await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }) + ).not.toBeNull(); + } +); + +// --- Relaxed-cache (no latency regression) ------------------------- + +heteroPostgresTest( + "relaxed (longer TTL) cache still hits on the new DB", + async ({ prisma17 }) => { + const { environment } = await seedControlPlane(prisma17); + const { client: counting, reads } = countQueries(prisma17); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: counting, + controlPlaneReplica: counting, + // Relaxed: a much longer TTL than the default — same-provider resolution is cheap. + cache: new ControlPlaneCache({ ttlMs: 300_000, maxEntries: 10_000 }), + splitEnabled: () => true, + }); + + expect(await resolver.resolveEnv(environment.id)).toMatchObject({ id: environment.id }); + expect(reads()).toBe(1); + // Second read served from the relaxed cache — no extra DB round-trip. + await resolver.resolveEnv(environment.id); + expect(reads()).toBe(1); + } +); + +// --- Cross-version transition (legacy DB -> new DB) ----------------------- + +heteroPostgresTest( + "resolution is byte-identical across the legacy-DB -> new-DB host transition", + async ({ prisma14, prisma17, pinnedCollation }) => { + // Seed identical control-plane shapes on the pre-repoint (legacy) and post-repoint + // (new) sides. 
+ const before = await seedControlPlane(prisma14); + const after = await seedControlPlane(prisma17); + + const resolver14 = new ControlPlaneResolver({ + controlPlanePrimary: prisma14, + controlPlaneReplica: prisma14, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + const resolver17 = new ControlPlaneResolver({ + controlPlanePrimary: prisma17, + controlPlaneReplica: prisma17, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const env14 = await resolver14.resolveEnv(before.environment.id); + const env17 = await resolver17.resolveEnv(after.environment.id); + // Same resolution shape across the version boundary (ids differ per-seed; structure identical). + expect(Object.keys(env14 ?? {}).sort()).toEqual(Object.keys(env17 ?? {}).sort()); + expect(env14?.type).toBe(env17?.type); + expect(env14?.archivedAt).toBe(env17?.archivedAt); + + // ORDER BY on a representative text-heavy column must agree across the version boundary, using + // the pinned ICU collation the hetero fixture exposes so the comparison is apples-to-apples. 
+ const slugs = ["banana", "Apple", "cherry", "Äpfel", "apple"]; + const orderBy = async (prisma: PrismaClient) => { + const rows = await prisma.$queryRawUnsafe<{ s: string }[]>( + `SELECT s FROM (VALUES ('${slugs.join("'),('")}')) AS t(s) ORDER BY s COLLATE "${pinnedCollation}"` + ); + return rows.map((r) => r.s); + }; + expect(await orderBy(prisma14)).toEqual(await orderBy(prisma17)); + } +); + +// --- Single-DB no-op (passthrough preserved) ----------------------- + +heteroPostgresTest( + "single-DB passthrough (split OFF) runs plain in-DB joins with no cache", + async ({ prisma17 }) => { + const { environment } = await seedControlPlane(prisma17); + const { client: counting, reads } = countQueries(prisma17); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: counting, + controlPlaneReplica: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveEnv(environment.id); + await resolver.resolveEnv(environment.id); + // No cache when split is OFF — every call hits the DB, identical to today's passthrough. + expect(reads()).toBe(2); + } +); diff --git a/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts b/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts new file mode 100644 index 00000000000..207fb5201c2 --- /dev/null +++ b/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts @@ -0,0 +1,609 @@ +import { heteroPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, it, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { + ControlPlaneReferenceError, + ControlPlaneResolver, +} from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +// Cross-DB testcontainer spin-up + queries can exceed the 5s default on the first test. 
+vi.setConfig({ testTimeout: 60_000 }); + +// --- test helpers ---------------------------------------------------------- + +let seedCounter = 0; + +/** + * Wraps a real testcontainer PrismaClient with a `$extends` query hook that increments a + * counter on every actual operation. NOT a mock: the returned client still issues the real + * query and returns real rows — we only observe the DB boundary. + */ +function countQueries(client: PrismaClient): { client: PrismaClient; reads: () => number } { + let count = 0; + const extended = client.$extends({ + query: { + async $allOperations({ args, query }) { + count++; + return query(args); + }, + }, + }) as unknown as PrismaClient; + return { client: extended, reads: () => count }; +} + +async function seedControlPlane(prisma: PrismaClient) { + const n = seedCounter++; + const org = await prisma.organization.create({ + data: { title: `Org ${n}`, slug: `org-${n}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${n}`, + slug: `project-${n}`, + externalRef: `proj_${n}`, + organizationId: org.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${n}`, + projectId: project.id, + organizationId: org.id, + apiKey: `tr_prod_${n}`, + pkApiKey: `pk_prod_${n}`, + shortcode: `short_${n}`, + }, + }); + return { org, project, environment }; +} + +async function seedWorker( + prisma: PrismaClient, + ctx: { projectId: string; environmentId: string }, + opts?: { promote?: boolean } +) { + const n = seedCounter++; + const worker = await prisma.backgroundWorker.create({ + data: { + friendlyId: `worker_${n}`, + contentHash: `hash_${n}`, + projectId: ctx.projectId, + runtimeEnvironmentId: ctx.environmentId, + version: `2024.1.${n}`, + metadata: {}, + engine: "V2", + }, + }); + const task = await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: `task_${n}`, + slug: `my-task-${n}`, + filePath: "index.ts", + exportName: 
"myTask", + workerId: worker.id, + runtimeEnvironmentId: ctx.environmentId, + projectId: ctx.projectId, + }, + }); + const queue = await prisma.taskQueue.create({ + data: { + friendlyId: `queue_${n}`, + name: `task/my-task-${n}`, + runtimeEnvironmentId: ctx.environmentId, + projectId: ctx.projectId, + workers: { connect: { id: worker.id } }, + }, + }); + let deployment = null; + if (opts?.promote) { + deployment = await prisma.workerDeployment.create({ + data: { + friendlyId: `deployment_${n}`, + contentHash: `hash_${n}`, + version: worker.version, + shortCode: `dep_${n}`, + type: "MANAGED", + status: "DEPLOYED", + projectId: ctx.projectId, + environmentId: ctx.environmentId, + workerId: worker.id, + }, + }); + await prisma.workerDeploymentPromotion.create({ + data: { + label: "current", + deploymentId: deployment.id, + environmentId: ctx.environmentId, + }, + }); + } + return { worker, task, queue, deployment }; +} + +// --- cache unit tests (no DB) ---------------------------------------------- + +describe("ControlPlaneCache", () => { + it("caches null as a confirmed absence (distinct from a miss)", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + expect(cache.getEnv("env_x")).toBeUndefined(); + cache.setEnv("env_x", null); + expect(cache.getEnv("env_x")).toBeNull(); + }); + + it("invalidateEnv drops the entry (next read is a miss)", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + cache.setEnv("env_y", { id: "env_y" } as any); + cache.invalidateEnv("env_y"); + expect(cache.getEnv("env_y")).toBeUndefined(); + }); + + it("invalidating one key does not affect another", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + cache.setEnv("env_a", { id: "env_a" } as any); + cache.setEnv("env_b", { id: "env_b" } as any); + cache.invalidateEnv("env_a"); + expect(cache.getEnv("env_a")).toBeUndefined(); + expect(cache.getEnv("env_b")).toMatchObject({ id: "env_b" }); + }); 
+}); + +// --- resolveEnv ------------------------------------------------------------- + +heteroPostgresTest( + "resolveEnv returns the cross-DB env row and caches it", + async ({ prisma14 }) => { + const { environment, org } = await seedControlPlane(prisma14); + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache, + splitEnabled: () => true, + }); + + const first = await resolver.resolveEnv(environment.id); + expect(first).toMatchObject({ + id: environment.id, + projectId: environment.projectId, + organizationId: org.id, + type: "PRODUCTION", + archivedAt: null, + }); + expect(reads()).toBe(1); + + const second = await resolver.resolveEnv(environment.id); + expect(second).toEqual(first); + expect(reads()).toBe(1); + } +); + +heteroPostgresTest("resolveEnv caches a null absence", async ({ prisma14 }) => { + const cache = new ControlPlaneCache(); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache, + splitEnabled: () => true, + }); + + expect(await resolver.resolveEnv("env_does_not_exist")).toBeNull(); + expect(reads()).toBe(1); + expect(await resolver.resolveEnv("env_does_not_exist")).toBeNull(); + expect(reads()).toBe(1); +}); + +heteroPostgresTest( + "resolveEnv passthrough (split OFF) hits the DB every time, no cache", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveEnv(environment.id); + await resolver.resolveEnv(environment.id); 
+ expect(reads()).toBe(2); + } +); + +// --- resolveWorkerVersion --------------------------------------------------- + +heteroPostgresTest( + "resolveWorkerVersion (pinned) returns worker/tasks/queues and caches it", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker, task, queue } = await seedWorker(prisma14, { + projectId: project.id, + environmentId: environment.id, + }); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const first = await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }); + expect(first?.worker.id).toBe(worker.id); + expect(first?.tasks.map((t) => t.id)).toContain(task.id); + expect(first?.queues.map((q) => q.id)).toContain(queue.id); + expect(first?.deployment).toBeNull(); + const readsAfterFirst = reads(); + expect(readsAfterFirst).toBeGreaterThanOrEqual(1); + + const second = await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }); + expect(second?.worker.id).toBe(worker.id); + expect(reads()).toBe(readsAfterFirst); + } +); + +heteroPostgresTest( + "resolveWorkerVersion (current deployment) resolves the promoted worker", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker, deployment } = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id }, + { promote: true } + ); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const first = await resolver.resolveWorkerVersion({ environmentId: 
environment.id }); + expect(first?.worker.id).toBe(worker.id); + expect(first?.deployment?.id).toBe(deployment?.id); + const readsAfterFirst = reads(); + + const second = await resolver.resolveWorkerVersion({ environmentId: environment.id }); + expect(second?.worker.id).toBe(worker.id); + expect(reads()).toBe(readsAfterFirst); + } +); + +heteroPostgresTest( + "resolveWorkerVersion passthrough (split OFF) re-reads every call", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker } = await seedWorker(prisma14, { + projectId: project.id, + environmentId: environment.id, + }); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }); + const readsAfterFirst = reads(); + await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }); + expect(reads()).toBe(readsAfterFirst * 2); + } +); + +// --- assertEnvExists -------------------------------------------------------- + +heteroPostgresTest( + "assertEnvExists resolves for a seeded env, caches, and throws for a missing one", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + await expect(resolver.assertEnvExists(environment.id)).resolves.toBeUndefined(); + expect(reads()).toBe(1); + await expect(resolver.assertEnvExists(environment.id)).resolves.toBeUndefined(); + expect(reads()).toBe(1); + + await 
expect(resolver.assertEnvExists("env_missing")).rejects.toBeInstanceOf( + ControlPlaneReferenceError + ); + } +); + +heteroPostgresTest( + "assertEnvExists passthrough (split OFF) still validates a real env", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const { client: counting } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await expect(resolver.assertEnvExists(environment.id)).resolves.toBeUndefined(); + await expect(resolver.assertEnvExists("env_missing")).rejects.toBeInstanceOf( + ControlPlaneReferenceError + ); + } +); + +// --- resolveAuthenticatedEnv ------------------------------------------------ + +heteroPostgresTest( + "resolveAuthenticatedEnv returns the toAuthenticated shape and caches it", + async ({ prisma14 }) => { + const { environment, project, org } = await seedControlPlane(prisma14); + const { client: counting, reads } = countQueries(prisma14); + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache, + splitEnabled: () => true, + }); + + const first = await resolver.resolveAuthenticatedEnv(environment.id); + expect(first).not.toBeNull(); + expect(first!.id).toBe(environment.id); + expect(first!.slug).toBe(environment.slug); + expect(first!.type).toBe("PRODUCTION"); + expect(first!.organizationId).toBe(org.id); + expect(first!.projectId).toBe(project.id); + expect(first!.project.id).toBe(project.id); + expect(first!.project.externalRef).toBe(project.externalRef); + expect(first!.organization.id).toBe(org.id); + expect(first!.organization.title).toBe(org.title); + // concurrencyLimitBurstFactor is coerced to a plain number by toAuthenticated(). 
+ expect(typeof first!.concurrencyLimitBurstFactor).toBe("number"); + expect(reads()).toBe(1); + + const second = await resolver.resolveAuthenticatedEnv(environment.id); + expect(second).toEqual(first); + expect(reads()).toBe(1); + + expect(await resolver.resolveAuthenticatedEnv("env_missing")).toBeNull(); + } +); + +heteroPostgresTest( + "resolveAuthenticatedEnv populates parentEnvironment { id, apiKey } for a branch env", + async ({ prisma14 }) => { + const m = seedCounter++; + const org = await prisma14.organization.create({ + data: { title: `Org wp ${m}`, slug: `org-wp-${m}` }, + }); + const project = await prisma14.project.create({ + data: { + name: `P wp ${m}`, + slug: `p-wp-${m}`, + externalRef: `proj_wp_${m}`, + organizationId: org.id, + }, + }); + const parent = await prisma14.runtimeEnvironment.create({ + data: { + type: "PREVIEW", + slug: `preview-parent-${m}`, + projectId: project.id, + organizationId: org.id, + apiKey: `tr_parent_key_${m}`, + pkApiKey: `pk_parent_${m}`, + shortcode: `sc_parent_${m}`, + }, + }); + const branch = await prisma14.runtimeEnvironment.create({ + data: { + type: "PREVIEW", + slug: `preview-branch-${m}`, + branchName: "feat/x", + projectId: project.id, + organizationId: org.id, + apiKey: `tr_branch_key_${m}`, + pkApiKey: `pk_branch_${m}`, + shortcode: `sc_branch_${m}`, + parentEnvironmentId: parent.id, + }, + }); + + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: prisma14, + controlPlanePrimary: prisma14, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const env = await resolver.resolveAuthenticatedEnv(branch.id); + expect(env).not.toBeNull(); + expect(env!.apiKey).toBe(`tr_branch_key_${m}`); + expect(env!.parentEnvironment).not.toBeNull(); + expect(env!.parentEnvironment!.id).toBe(parent.id); + expect(env!.parentEnvironment!.apiKey).toBe(`tr_parent_key_${m}`); + + const noParent = await resolver.resolveAuthenticatedEnv(parent.id); + expect(noParent!.parentEnvironment).toBeNull(); + } 
+); + +heteroPostgresTest( + "resolveAuthenticatedEnv passthrough (split OFF) hits the DB every time, no cache", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveAuthenticatedEnv(environment.id); + await resolver.resolveAuthenticatedEnv(environment.id); + expect(reads()).toBe(2); + } +); + +// --- resolveRunLockedWorker ------------------------------------------------- + +heteroPostgresTest( + "resolveRunLockedWorker returns lockedBy (task+worker+deployment) and lockedToVersion, caches it", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker, task, deployment } = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id }, + { promote: true } + ); + const { client: counting, reads } = countQueries(prisma14); + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache, + splitEnabled: () => true, + }); + + const first = await resolver.resolveRunLockedWorker({ + lockedById: task.id, + lockedToVersionId: worker.id, + }); + expect(first).not.toBeNull(); + expect(first!.lockedBy!.id).toBe(task.id); + expect(first!.lockedBy!.filePath).toBe(task.filePath); + expect(first!.lockedBy!.slug).toBe(task.slug); + expect(first!.lockedBy!.exportName).toBe(task.exportName); + expect(first!.lockedBy!.machineConfig).toEqual(task.machineConfig); + expect(first!.lockedBy!.worker.id).toBe(worker.id); + expect(first!.lockedBy!.worker.version).toBe(worker.version); + expect(first!.lockedBy!.worker.deployment!.friendlyId).toBe(deployment!.friendlyId); + 
expect(first!.lockedToVersion!.version).toBe(worker.version); + expect(first!.lockedToVersion!.supportsLazyAttempts).toBe(worker.supportsLazyAttempts); + const readsAfterFirst = reads(); + expect(readsAfterFirst).toBeGreaterThanOrEqual(1); + + const second = await resolver.resolveRunLockedWorker({ + lockedById: task.id, + lockedToVersionId: worker.id, + }); + expect(second).toEqual(first); + expect(reads()).toBe(readsAfterFirst); + } +); + +heteroPostgresTest( + "resolveRunLockedWorker returns null lockedBy/lockedToVersion when ids are absent", + async ({ prisma14 }) => { + const { client: counting } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const resolved = await resolver.resolveRunLockedWorker({ + lockedById: null, + lockedToVersionId: null, + }); + expect(resolved).not.toBeNull(); + expect(resolved!.lockedBy).toBeNull(); + expect(resolved!.lockedToVersion).toBeNull(); + } +); + +heteroPostgresTest( + "resolveRunLockedWorker resolves lockedBy only when lockedToVersionId is absent", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { task } = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id }, + { promote: true } + ); + const { client: counting } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + const result = await resolver.resolveRunLockedWorker({ lockedById: task.id }); + expect(result).not.toBeNull(); + expect(result!.lockedBy!.id).toBe(task.id); + expect(result!.lockedBy!.slug).toBe(task.slug); + expect(result!.lockedToVersion).toBeNull(); + } +); + +heteroPostgresTest( + "resolveRunLockedWorker resolves 
lockedToVersion only when lockedById is absent", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker } = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id }, + { promote: true } + ); + const { client: counting } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + const result = await resolver.resolveRunLockedWorker({ lockedToVersionId: worker.id }); + expect(result).not.toBeNull(); + expect(result!.lockedToVersion!.version).toBe(worker.version); + expect(result!.lockedBy).toBeNull(); + } +); + +heteroPostgresTest( + "resolveRunLockedWorker passthrough (split OFF) re-reads every call", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker, task } = await seedWorker(prisma14, { + projectId: project.id, + environmentId: environment.id, + }); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveRunLockedWorker({ lockedById: task.id, lockedToVersionId: worker.id }); + const readsAfterFirst = reads(); + await resolver.resolveRunLockedWorker({ lockedById: task.id, lockedToVersionId: worker.id }); + expect(reads()).toBe(readsAfterFirst * 2); + } +); diff --git a/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts new file mode 100644 index 00000000000..c3084000ab7 --- /dev/null +++ b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts @@ -0,0 +1,194 @@ +// The webapp adapter 
presents the cross-DB app ControlPlaneResolver as the run-engine seam. +// Proven over real testcontainers (never mocked): resolveEnv maps onto the MinimalAuthenticatedEnv +// superset; resolveWorkerVersion forwards the env type so the engine dequeue dispatch (DEV +// most-recent / MANAGED promotion) runs; assertEnvExists delegates and rejects on a missing env. +import { heteroPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/isomorphic"; +import { describe, expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; +import { RunEngineControlPlaneResolver } from "~/v3/runOpsMigration/runEngineControlPlaneResolver.server"; + +vi.setConfig({ testTimeout: 60_000 }); + +let n = 0; + +function buildAppResolver(controlPlane: PrismaClient) { + return new ControlPlaneResolver({ + controlPlanePrimary: controlPlane, + controlPlaneReplica: controlPlane, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => false, + }); +} + +async function seedEnv(prisma: PrismaClient, type: "PRODUCTION" | "DEVELOPMENT") { + const suffix = `re-${n++}`; + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type, + slug: suffix, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_${suffix}`, + pkApiKey: `pk_${suffix}`, + shortcode: `short_${suffix}`, + maximumConcurrencyLimit: 9, + }, + }); + return { organization, project, environment, suffix 
}; +} + +async function seedWorker( + prisma: PrismaClient, + ctx: { projectId: string; environmentId: string; suffix: string }, + opts: { promote?: boolean; deploy?: boolean } +) { + const worker = await prisma.backgroundWorker.create({ + data: { + friendlyId: `worker_${ctx.suffix}`, + contentHash: `hash_${ctx.suffix}`, + projectId: ctx.projectId, + runtimeEnvironmentId: ctx.environmentId, + version: `2024.1.${ctx.suffix}`, + metadata: {}, + engine: "V2", + }, + }); + const task = await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: `task_${ctx.suffix}`, + slug: "my-task", + filePath: "index.ts", + exportName: "myTask", + workerId: worker.id, + runtimeEnvironmentId: ctx.environmentId, + projectId: ctx.projectId, + }, + }); + const queue = await prisma.taskQueue.create({ + data: { + friendlyId: `queue_${ctx.suffix}`, + name: "task/my-task", + runtimeEnvironmentId: ctx.environmentId, + projectId: ctx.projectId, + type: "VIRTUAL", + workers: { connect: { id: worker.id } }, + tasks: { connect: { id: task.id } }, + }, + }); + if (opts.deploy) { + const deployment = await prisma.workerDeployment.create({ + data: { + friendlyId: `deployment_${ctx.suffix}`, + contentHash: worker.contentHash, + version: worker.version, + shortCode: `short_${ctx.suffix}`, + imageReference: `image:${ctx.suffix}`, + status: "DEPLOYED", + projectId: ctx.projectId, + environmentId: ctx.environmentId, + workerId: worker.id, + type: "MANAGED", + }, + }); + if (opts.promote) { + await prisma.workerDeploymentPromotion.create({ + data: { + label: CURRENT_DEPLOYMENT_LABEL, + deploymentId: deployment.id, + environmentId: ctx.environmentId, + }, + }); + } + return { worker, task, queue, deployment }; + } + return { worker, task, queue }; +} + +describe("RunEngineControlPlaneResolver adapter", () => { + heteroPostgresTest("resolveEnv maps app ResolvedEnv onto ResolvedEngineEnv", async ({ prisma14 }) => { + const { organization, project, environment } = await seedEnv(prisma14, 
"PRODUCTION"); + const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14)); + + const env = await adapter.resolveEnv(environment.id); + expect(env).not.toBeNull(); + expect(env!.id).toBe(environment.id); + expect(env!.type).toBe("PRODUCTION"); + expect(env!.projectId).toBe(project.id); + expect(env!.organizationId).toBe(organization.id); + // Nested + concurrency fields the run-engine MinimalAuthenticatedEnvironment requires. + expect(env!.project.id).toBe(project.id); + expect(env!.organization.id).toBe(organization.id); + expect(env!.maximumConcurrencyLimit).toBe(9); + expect(env!.concurrencyLimitBurstFactor.toNumber()).toBe(2); + expect(env!.archivedAt).toBeNull(); + + expect(await adapter.resolveEnv("env_missing")).toBeNull(); + }); + + heteroPostgresTest( + "resolveWorkerVersion (deployed, no workerId) resolves the promoted MANAGED deployment", + async ({ prisma14 }) => { + const { project, environment, suffix } = await seedEnv(prisma14, "PRODUCTION"); + const seeded = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id, suffix }, + { deploy: true, promote: true } + ); + const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14)); + + const version = await adapter.resolveWorkerVersion({ + environmentId: environment.id, + type: "PRODUCTION", + }); + expect(version).not.toBeNull(); + expect(version!.worker.id).toBe(seeded.worker.id); + expect(version!.deployment?.id).toBe("deployment" in seeded ? 
seeded.deployment.id : undefined); + expect(version!.tasks.map((t) => t.slug)).toContain("my-task"); + } + ); + + heteroPostgresTest( + "resolveWorkerVersion (DEVELOPMENT, no workerId) resolves the most-recent worker (no deployment)", + async ({ prisma14 }) => { + const { project, environment, suffix } = await seedEnv(prisma14, "DEVELOPMENT"); + const seeded = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id, suffix }, + { deploy: false } + ); + const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14)); + + const version = await adapter.resolveWorkerVersion({ + environmentId: environment.id, + type: "DEVELOPMENT", + }); + expect(version).not.toBeNull(); + expect(version!.worker.id).toBe(seeded.worker.id); + expect(version!.deployment).toBeNull(); + } + ); + + heteroPostgresTest("assertEnvExists resolves for a present env, rejects for a missing one", async ({ + prisma14, + }) => { + const { environment } = await seedEnv(prisma14, "PRODUCTION"); + const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14)); + + await expect(adapter.assertEnvExists(environment.id)).resolves.toBeUndefined(); + await expect(adapter.assertEnvExists("env_missing")).rejects.toThrow(); + }); +}); diff --git a/apps/webapp/vitest.config.ts b/apps/webapp/vitest.config.ts index 69eb980732f..8e05aec1ebc 100644 --- a/apps/webapp/vitest.config.ts +++ b/apps/webapp/vitest.config.ts @@ -5,7 +5,18 @@ import tsconfigPaths from "vite-tsconfig-paths"; export default defineConfig({ test: { sequence: { sequencer: DurationShardingSequencer }, - include: ["test/**/*.test.ts"], + // Webapp tests live under test/**; the run-ops migration family + // colocates its *.server.test.ts next to source under app/v3/runOpsMigration/. + // The run-store seam test colocates next to its source at app/v3/runStore.server.test.ts. + // Pure unit tests for runEngine concerns colocate next to their source file. 
+ include: [ + "test/**/*.test.ts", + "app/v3/runOpsMigration/**/*.test.ts", + "app/v3/runStore.server.test.ts", + "app/v3/services/bulk/**/*.test.ts", + "app/runEngine/concerns/**/*.test.ts", + "app/runEngine/services/**/*.test.ts", + ], // *.e2e.test.ts: smoke matrix, run via vitest.e2e.config.ts. // *.e2e.full.test.ts: full auth suite, runs via vitest.e2e.full.config.ts // (needs a globalSetup-spawned webapp + Postgres container). From ec5f4b63fd4611a059f9fe261a3c7d5e8e084fe9 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 2 Jul 2026 11:48:04 +0100 Subject: [PATCH 02/14] refactor(run-ops): drop known-migrated read layer; residency is id-shape only Migration/drain is deferred, so residency is decided purely by id-shape (ownerEngine): 25-char cuid -> LEGACY, 27-char ksuid -> NEW, unclassifiable -> LEGACY. This is behavior-preserving in production, which never injected a custom isKnownMigrated and, with no migration, always saw the default false. - delete knownMigratedFilter.server.ts + its test - readThrough: drop the isKnownMigrated dep + migrated short-circuit; KEEP the unclassifiable->LEGACY new-then-legacy fallback - resolveInheritedMintKind: collapse to pure ownerEngine id-shape (no deps) - mintBatchFriendlyId: drop isKnownMigrated/isSplitEnabled from ResolveDeps - runEngineHandlersShared: drop isKnownMigrated from EventReadDeps/readRunForEvent (batch-write residency probe via newReplica.batchTaskRun.findFirst is untouched) - tests: delete injected-marker cases, keep pure id-shape assertions Co-Authored-By: Claude Opus 4.8 (1M context) --- .../app/v3/runEngineHandlersShared.server.ts | 6 +- .../knownMigratedFilter.server.test.ts | 133 ----------------- .../knownMigratedFilter.server.ts | 138 ------------------ .../mintBatchFriendlyId.server.test.ts | 81 ++-------- .../mintBatchFriendlyId.server.ts | 11 +- .../readThrough.server.test.ts | 71 +-------- .../v3/runOpsMigration/readThrough.server.ts | 24 +-- .../resolveInheritedMintKind.server.test.ts | 61 
+------- .../resolveInheritedMintKind.server.ts | 17 +-- apps/webapp/test/runEngineHandlers.test.ts | 2 - 10 files changed, 34 insertions(+), 510 deletions(-) delete mode 100644 apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.test.ts delete mode 100644 apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.ts diff --git a/apps/webapp/app/v3/runEngineHandlersShared.server.ts b/apps/webapp/app/v3/runEngineHandlersShared.server.ts index 4f648fefc19..f4c07f730be 100644 --- a/apps/webapp/app/v3/runEngineHandlersShared.server.ts +++ b/apps/webapp/app/v3/runEngineHandlersShared.server.ts @@ -16,9 +16,8 @@ export type EventReadDeps = { newReplica: PrismaReplicaClient; legacyReplica: PrismaReplicaClient; splitEnabled: boolean; - // Pure boundaries forwarded to read-through; production leaves them undefined - // so the read-through layer uses its own wired defaults. Tests inject fakes. - isKnownMigrated?: (runId: string) => Promise<boolean>; + // Pure boundary forwarded to read-through; production leaves it undefined + // so the read-through layer uses its own wired default. Tests inject a fake. isPastRetention?: (runId: string) => boolean; }; @@ -43,7 +42,6 @@ export async function readRunForEvent( newClient: deps.newReplica, legacyReplica: deps.legacyReplica, splitEnabled: deps.splitEnabled, - isKnownMigrated: deps.isKnownMigrated, isPastRetention: deps.isPastRetention, }, }); diff --git a/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.test.ts b/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.test.ts deleted file mode 100644 index 06d725a9867..00000000000 --- a/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.test.ts +++ /dev/null @@ -1,133 +0,0 @@ -// Pure-core tests for the known-migrated filter. The injected `readMarker`/`probeNew` -// are PURE BOUNDARIES (the marker source and the new-DB existence predicate), not DB -// mocks — the DB-crossing proof for `probeNew` lives in readThrough.server.test.ts. 
-import { beforeEach, describe, expect, it, vi } from "vitest"; -import { containerTest } from "@internal/testcontainers"; -import { - ensureRedirectMarkerTable, - writeRedirectMarker, - isFenced, -} from "@internal/run-engine"; -import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; -import { - computeKnownMigrated, - isKnownMigrated, - __resetKnownMigratedCacheForTests, -} from "./knownMigratedFilter.server"; - -describe("computeKnownMigrated", () => { - beforeEach(() => { - __resetKnownMigratedCacheForTests(); - }); - - it("(a) marker present → migrated, without probing new", async () => { - const readMarker = vi.fn(async () => true); - const probeNew = vi.fn(async () => false); - - const result = await computeKnownMigrated("run_a", { readMarker, probeNew }); - - expect(result).toBe(true); - expect(readMarker).toHaveBeenCalledTimes(1); - expect(probeNew).not.toHaveBeenCalled(); - }); - - it("(b) marker absent + new-probe hit → migrated", async () => { - const readMarker = vi.fn(async () => false); - const probeNew = vi.fn(async () => true); - - const result = await computeKnownMigrated("run_b", { readMarker, probeNew }); - - expect(result).toBe(true); - expect(readMarker).toHaveBeenCalledTimes(1); - expect(probeNew).toHaveBeenCalledTimes(1); - }); - - it("(c) marker absent + new-probe miss → NOT migrated", async () => { - const readMarker = vi.fn(async () => false); - const probeNew = vi.fn(async () => false); - - const result = await computeKnownMigrated("run_c", { readMarker, probeNew }); - - expect(result).toBe(false); - expect(readMarker).toHaveBeenCalledTimes(1); - expect(probeNew).toHaveBeenCalledTimes(1); - }); - - it("(d) a positive is memoized: second call re-invokes neither readMarker nor probeNew", async () => { - const cache = new BoundedTtlCache(60_000, 100); - const readMarker = vi.fn(async () => false); - const probeNew = vi.fn(async () => true); - - const first = await computeKnownMigrated("run_d", { - readMarker, - probeNew, - 
cache, - ttlMs: 60_000, - }); - expect(first).toBe(true); - - const second = await computeKnownMigrated("run_d", { - readMarker, - probeNew, - cache, - ttlMs: 60_000, - }); - expect(second).toBe(true); - - // The boundaries ran exactly once, on the first call only. - expect(readMarker).toHaveBeenCalledTimes(1); - expect(probeNew).toHaveBeenCalledTimes(1); - }); -}); - -describe("isKnownMigrated marker authority", () => { - beforeEach(() => { - __resetKnownMigratedCacheForTests(); - }); - - // The OLD-side redirect marker is the authority: once written, the run is "known - // migrated" WITHOUT a NEW-DB probe. `containerTest` gives a real PG to host the - // marker table; `probeNew` is forced false to prove the marker path alone decides. - containerTest( - "a written redirect marker makes a run known-migrated via isFenced (no new-probe)", - async ({ prisma }) => { - await ensureRedirectMarkerTable(prisma); - const runId = "run_marker_authority"; - - const probeNew = vi.fn(async () => false); - const readMarker = (id: string) => isFenced(prisma, id); - - // Before the marker: not fenced → not migrated → probeNew consulted (and false). - expect(await computeKnownMigrated(runId, { readMarker, probeNew })).toBe(false); - expect(probeNew).toHaveBeenCalledTimes(1); - - // Write the OLD-side marker, reset the cache, re-evaluate: now migrated by marker - // alone, and probeNew is NOT consulted again. 
- await writeRedirectMarker(prisma, { runId, reason: "live-migration" }); - __resetKnownMigratedCacheForTests(); - probeNew.mockClear(); - - expect(await computeKnownMigrated(runId, { readMarker, probeNew })).toBe(true); - expect(probeNew).not.toHaveBeenCalled(); - } - ); - - containerTest( - "the DEFAULT readMarker consults isFenced on the legacy replica", - async ({ prisma }) => { - await ensureRedirectMarkerTable(prisma); - const runId = "run_default_marker"; - - // Inject the legacy-replica client the default adapter reads from; force probeNew - // false so only the marker can flip the result. - const probeNew = vi.fn(async () => false); - - // No `readMarker` passed → the wired default must read the marker via isFenced. - await writeRedirectMarker(prisma, { runId, reason: "live-migration" }); - expect( - await isKnownMigrated(runId, { legacyMarkerClient: prisma, probeNew }) - ).toBe(true); - expect(probeNew).not.toHaveBeenCalled(); - } - ); -}); diff --git a/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.ts b/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.ts deleted file mode 100644 index 4307f4197ab..00000000000 --- a/apps/webapp/app/v3/runOpsMigration/knownMigratedFilter.server.ts +++ /dev/null @@ -1,138 +0,0 @@ -/** - * Known-migrated filter. - * - * "Known migrated" is true when a run's row has been copied to the NEW run-ops DB - * and the OLD side has been fenced. The read-through layer consults this predicate - * to AVOID re-probing the legacy read replica for runs that already live on new — - * that re-probe is exactly the read load we are shedding off the legacy DB's replica. - * - * Authority order: - * 1. Cache hit → return it. - * 2. Redirect-marker on the OLD side (`readMarker(runId)` true) → migrated. - * The marker is the authoritative "this row now lives on the new DB" signal - * written by the live-migration fencing primitive. - * 3. 
Fall back to a NEW-DB existence probe (`probeNew(runId)`) — covers - * backfilled/straggler-swept rows whose marker is gone (GC'd) or whose mere - * presence on new is the only remaining evidence. - * - * Caching policy: positives are cached aggressively (a migrated row never - * un-migrates within the retention window); negatives are NOT cached (a - * not-yet-migrated row may migrate at any moment, and re-reading legacy for it is - * still correct — the row is there until termination — so the only cost of a stale - * negative would be a brief extra probe, which we avoid by simply not caching it). - */ -import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; -import { isFenced, type RedirectMarkerClient } from "@internal/run-engine"; - -type KnownMigratedDeps = { - /** Authoritative migrated-marker source: true iff the OLD side is fenced for this run. */ - readMarker?: (runId: string) => Promise<boolean>; - /** Fallback NEW-DB existence probe: true iff the run already exists on the new store. */ - probeNew?: (runId: string) => Promise<boolean>; - /** Bounded TTL memo for positive results. */ - cache?: BoundedTtlCache; - /** TTL (ms) used by the default module-level cache. */ - ttlMs?: number; - /** OLD/LEGACY run-ops client the default `readMarker` reads the fence from. */ - legacyMarkerClient?: RedirectMarkerClient; -}; - -/** Default positive-cache TTL: long, because a migrated row never un-migrates in the window. */ -const DEFAULT_TTL_MS = 5 * 60_000; -const DEFAULT_MAX_ENTRIES = 50_000; - -/** - * PURE testable core (no `env`/`db.server`/`process.env` import — webapp testability rule). - * Tests inject `readMarker`/`probeNew` as pure boundaries (NOT DB mocks). - */ -export async function computeKnownMigrated( - runId: string, - deps: KnownMigratedDeps -): Promise<boolean> { - const cache = deps.cache; - - // We only ever store positives, so a hit is always `true`. 
- const cached = cache?.get(runId); - if (cached !== undefined) { - return cached; - } - - // Marker present → migrated, never probe new. - if (deps.readMarker && (await deps.readMarker(runId))) { - cache?.set(runId, true); - return true; - } - - if (deps.probeNew && (await deps.probeNew(runId))) { - cache?.set(runId, true); - return true; - } - - // Not migrated. Negatives are not cached (see policy note above). - return false; -} - -let defaultCache: BoundedTtlCache | undefined; - -function getDefaultCache(ttlMs: number): BoundedTtlCache { - if (!defaultCache) { - defaultCache = new BoundedTtlCache(ttlMs, DEFAULT_MAX_ENTRIES); - } - return defaultCache; -} - -/** - * Default `readMarker` adapter. Delegates to the OLD-side fence (`isFenced`) so the - * redirect marker is the migrated authority. The legacy run-ops replica - * client is injected by the wired wrapper (`isKnownMigrated`) — the pure core never - * imports `db.server`. - */ -function makeDefaultReadMarker( - client: RedirectMarkerClient -): (runId: string) => Promise<boolean> { - return (runId: string) => isFenced(client, runId); -} - -/** - * Wired wrapper. Defaults `readMarker` to the marker adapter above, `probeNew` to a - * NEW run-ops existence check, and `cache` to a module-level singleton. - * - * The `probeNew` default uses `findFirst` (NEVER `findUnique` — webapp Prisma rule) - * against the new run-ops writer handle. - */ -export async function isKnownMigrated(runId: string, deps?: KnownMigratedDeps): Promise<boolean> { - const ttlMs = deps?.ttlMs ?? DEFAULT_TTL_MS; - - // Lazy default for probeNew so the db.server import stays out of the pure core and - // only resolves when the wired wrapper actually needs it. - const probeNew = - deps?.probeNew ?? 
- (async (id: string) => { - const { runOpsNewPrisma } = await import("~/db.server"); - const row = await runOpsNewPrisma.taskRun.findFirst({ - where: { friendlyId: id }, - select: { friendlyId: true }, - }); - return row !== null; - }); - - // Resolve the OLD/LEGACY marker client (injected for tests; the legacy run-ops - // replica in production). Only needed when no explicit readMarker is provided. - let readMarker = deps?.readMarker; - if (!readMarker) { - const legacyMarkerClient = - deps?.legacyMarkerClient ?? (await import("~/db.server")).runOpsLegacyReplica; - readMarker = makeDefaultReadMarker(legacyMarkerClient); - } - - return computeKnownMigrated(runId, { - readMarker, - probeNew, - cache: deps?.cache ?? getDefaultCache(ttlMs), - ttlMs, - }); -} - -export function __resetKnownMigratedCacheForTests(): void { - defaultCache = undefined; -} diff --git a/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts index 3f393c66075..b552b4de736 100644 --- a/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts +++ b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts @@ -32,11 +32,7 @@ describe("resolveBatchMintKind", () => { const resolveRunIdMintKind = vi.fn().mockResolvedValue("ksuid"); const kind = await resolveBatchMintKind({ environment, - deps: { - resolveRunIdMintKind, - isKnownMigrated: vi.fn(), - isSplitEnabled: vi.fn(), - }, + deps: { resolveRunIdMintKind }, }); expect(kind).toBe("ksuid"); expect(resolveRunIdMintKind).toHaveBeenCalledWith({ @@ -46,90 +42,41 @@ describe("resolveBatchMintKind", () => { }); }); - it("ROOT batch on a non-cut-over org -> cuid, isKnownMigrated NOT called", async () => { + it("ROOT batch on a non-cut-over org -> cuid", async () => { const resolveRunIdMintKind = vi.fn().mockResolvedValue("cuid"); - const isKnownMigrated = vi.fn(); const kind = await resolveBatchMintKind({ environment, - deps: { - 
resolveRunIdMintKind, - isKnownMigrated, - isSplitEnabled: vi.fn(), - }, + deps: { resolveRunIdMintKind }, }); expect(kind).toBe("cuid"); - expect(isKnownMigrated).not.toHaveBeenCalled(); }); - it("CHILD batch inherits a ksuid (NEW) parent by id-shape, split off, no marker read", async () => { + it("CHILD batch inherits a ksuid (NEW) parent by id-shape", async () => { const parentRunFriendlyId = `run_${"a".repeat(27)}`; const resolveRunIdMintKind = vi.fn(); - const isKnownMigrated = vi.fn(); - const isSplitEnabled = vi.fn().mockResolvedValue(false); const kind = await resolveBatchMintKind({ environment, parentRunFriendlyId, - deps: { resolveRunIdMintKind, isKnownMigrated, isSplitEnabled }, + deps: { resolveRunIdMintKind }, }); expect(kind).toBe("ksuid"); - expect(isKnownMigrated).not.toHaveBeenCalled(); expect(resolveRunIdMintKind).not.toHaveBeenCalled(); }); it("CHILD batch inherits a cuid (LEGACY) parent by id-shape", async () => { const parentRunFriendlyId = `run_${"a".repeat(25)}`; - const isSplitEnabled = vi.fn().mockResolvedValue(false); - - const kind = await resolveBatchMintKind({ - environment, - parentRunFriendlyId, - deps: { - resolveRunIdMintKind: vi.fn(), - isKnownMigrated: vi.fn(), - isSplitEnabled, - }, - }); - - expect(kind).toBe("cuid"); - }); - - it("CHILD batch with a legacy-by-shape parent already migrated (split on + marker) -> ksuid", async () => { - const parentRunFriendlyId = `run_${"a".repeat(25)}`; - const isSplitEnabled = vi.fn().mockResolvedValue(true); - const isKnownMigrated = vi.fn().mockResolvedValue(true); - - const kind = await resolveBatchMintKind({ - environment, - parentRunFriendlyId, - deps: { - resolveRunIdMintKind: vi.fn(), - isKnownMigrated, - isSplitEnabled, - }, - }); - - expect(kind).toBe("ksuid"); - }); - - it("CHILD inheritance does NOT consult the marker when split is OFF (hot-path zero-IO)", async () => { - const parentRunFriendlyId = `run_${"a".repeat(25)}`; - const isSplitEnabled = vi.fn().mockResolvedValue(false); 
- const isKnownMigrated = vi.fn().mockResolvedValue(true); + const resolveRunIdMintKind = vi.fn(); const kind = await resolveBatchMintKind({ environment, parentRunFriendlyId, - deps: { - resolveRunIdMintKind: vi.fn(), - isKnownMigrated, - isSplitEnabled, - }, + deps: { resolveRunIdMintKind }, }); expect(kind).toBe("cuid"); - expect(isKnownMigrated).not.toHaveBeenCalled(); + expect(resolveRunIdMintKind).not.toHaveBeenCalled(); }); // mint-on-FLIP invariant: a child follows its parent's store even after the org flag @@ -140,11 +87,7 @@ describe("resolveBatchMintKind", () => { const kind = await resolveBatchMintKind({ environment, parentRunFriendlyId, - deps: { - resolveRunIdMintKind, - isKnownMigrated: vi.fn().mockResolvedValue(false), - isSplitEnabled: vi.fn().mockResolvedValue(true), - }, + deps: { resolveRunIdMintKind }, }); expect(kind).toBe("cuid"); expect(resolveRunIdMintKind).not.toHaveBeenCalled(); @@ -156,11 +99,7 @@ describe("resolveBatchMintKind", () => { const kind = await resolveBatchMintKind({ environment, parentRunFriendlyId, - deps: { - resolveRunIdMintKind, - isKnownMigrated: vi.fn().mockResolvedValue(false), - isSplitEnabled: vi.fn().mockResolvedValue(true), - }, + deps: { resolveRunIdMintKind }, }); expect(kind).toBe("ksuid"); expect(resolveRunIdMintKind).not.toHaveBeenCalled(); diff --git a/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts index bdbb83b51dd..0503fc5b2c8 100644 --- a/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts @@ -3,20 +3,14 @@ import { resolveRunIdMintKind as defaultResolveRunIdMintKind, type RunIdMintKind, } from "~/v3/engineVersion.server"; -import { isKnownMigrated as defaultIsKnownMigrated } from "~/v3/runOpsMigration/knownMigratedFilter.server"; -import { isSplitEnabled as defaultIsSplitEnabled } from "~/v3/runOpsMigration/splitMode.server"; import { 
resolveInheritedMintKind } from "~/v3/runOpsMigration/resolveInheritedMintKind.server"; type ResolveDeps = { resolveRunIdMintKind: typeof defaultResolveRunIdMintKind; - isKnownMigrated: (runId: string) => Promise<boolean>; - isSplitEnabled: () => Promise<boolean>; }; const defaultDeps: ResolveDeps = { resolveRunIdMintKind: defaultResolveRunIdMintKind, - isKnownMigrated: defaultIsKnownMigrated, - isSplitEnabled: defaultIsSplitEnabled, }; export function batchIdForMintKind(kind: RunIdMintKind): { id: string; friendlyId: string } { @@ -34,10 +28,7 @@ export async function resolveBatchMintKind(args: { }): Promise<RunIdMintKind> { const deps = { ...defaultDeps, ...args.deps }; return args.parentRunFriendlyId - ? resolveInheritedMintKind(args.parentRunFriendlyId, { - isSplitEnabled: deps.isSplitEnabled, - isKnownMigrated: deps.isKnownMigrated, - }) + ? resolveInheritedMintKind(args.parentRunFriendlyId) : deps.resolveRunIdMintKind({ organizationId: args.environment.organizationId, id: args.environment.id, diff --git a/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts b/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts index 4bd292b4dfd..3d8dc914761 100644 --- a/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts +++ b/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts @@ -1,8 +1,8 @@ // Real legacy-replica + new-DB proof for the read-through layer. // We NEVER mock the DB: the reads run as real `$queryRaw` against the two containers, -// crossing the actual legacy↔new boundary the migration relies on. The only injected -// fakes are the pure boundaries — `isKnownMigrated`, `isPastRetention`, -// `splitEnabled` — plus throwing spies used to assert a store was NEVER touched. 
import { heteroPostgresTest } from "@internal/testcontainers"; import { describe, expect, vi } from "vitest"; import type { PrismaReplicaClient } from "~/db.server"; @@ -54,7 +54,6 @@ describe("readThroughRun (legacy replica + new DB)", () => { splitEnabled: true, newClient: prisma17 as unknown as PrismaReplicaClient, legacyReplica: prisma14 as unknown as PrismaReplicaClient, - isKnownMigrated: async () => false, }, }); @@ -63,56 +62,6 @@ describe("readThroughRun (legacy replica + new DB)", () => { } ); - heteroPostgresTest( - "Step 2: a migrated run is filtered from old-probing", - async ({ prisma14, prisma17 }) => { - const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { - throw new Error("readLegacy must never be called for a known-migrated run"); - }); - - const result = await readThroughRun({ - runId: LEGACY_RUN_ID, - environmentId: "env_1", - readNew: (c) => realRead(c, false), // new misses → step (b) short-circuit - readLegacy: throwingLegacy, - deps: { - splitEnabled: true, - newClient: prisma17 as unknown as PrismaReplicaClient, - legacyReplica: prisma14 as unknown as PrismaReplicaClient, - isKnownMigrated: async () => true, - }, - }); - - expect(result.source).toBe("not-found"); - expect(throwingLegacy).not.toHaveBeenCalled(); - } - ); - - heteroPostgresTest( - "Step 2b: a migrated run that the new read hits returns source=new", - async ({ prisma14, prisma17 }) => { - const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { - throw new Error("readLegacy must never be called when new hits"); - }); - - const result = await readThroughRun({ - runId: LEGACY_RUN_ID, - environmentId: "env_1", - readNew: (c) => realRead(c, true), - readLegacy: throwingLegacy, - deps: { - splitEnabled: true, - newClient: prisma17 as unknown as PrismaReplicaClient, - legacyReplica: prisma14 as unknown as PrismaReplicaClient, - isKnownMigrated: async () => true, - }, - }); - - expect(result.source).toBe("new"); - 
expect(throwingLegacy).not.toHaveBeenCalled(); - } - ); - heteroPostgresTest( "Step 3: post-termination past-retention returns the normal not-found surface", async ({ prisma14, prisma17 }) => { @@ -125,7 +74,6 @@ describe("readThroughRun (legacy replica + new DB)", () => { splitEnabled: true, newClient: prisma17 as unknown as PrismaReplicaClient, legacyReplica: prisma14 as unknown as PrismaReplicaClient, - isKnownMigrated: async () => false, isPastRetention: () => true, }, }); @@ -142,7 +90,6 @@ describe("readThroughRun (legacy replica + new DB)", () => { splitEnabled: true, newClient: prisma17 as unknown as PrismaReplicaClient, legacyReplica: prisma14 as unknown as PrismaReplicaClient, - isKnownMigrated: async () => false, isPastRetention: () => false, }, }); @@ -155,14 +102,11 @@ describe("readThroughRun (legacy replica + new DB)", () => { ); heteroPostgresTest( - "Step 4: single-DB passthrough — only readNew runs, legacy + filter never touched", + "Step 4: single-DB passthrough — only readNew runs, legacy never touched", async ({ prisma14, prisma17 }) => { const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { throw new Error("readLegacy must never run in single-DB mode"); }); - const throwingFilter = vi.fn(async (): Promise => { - throw new Error("isKnownMigrated must never run in single-DB mode"); - }); const newRead = vi.fn((c: PrismaReplicaClient) => realRead(c, true)); const result = await readThroughRun({ @@ -174,14 +118,12 @@ describe("readThroughRun (legacy replica + new DB)", () => { splitEnabled: false, newClient: prisma17 as unknown as PrismaReplicaClient, legacyReplica: prisma14 as unknown as PrismaReplicaClient, - isKnownMigrated: throwingFilter, }, }); expect(result.source).toBe("new"); expect(newRead).toHaveBeenCalledTimes(1); expect(throwingLegacy).not.toHaveBeenCalled(); - expect(throwingFilter).not.toHaveBeenCalled(); } ); @@ -191,9 +133,6 @@ describe("readThroughRun (legacy replica + new DB)", () => { const 
throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { throw new Error("readLegacy must never run for a NEW-residency id"); }); - const throwingFilter = vi.fn(async (): Promise => { - throw new Error("isKnownMigrated must never run for a NEW-residency id"); - }); const result = await readThroughRun({ runId: NEW_RUN_ID, @@ -204,13 +143,11 @@ describe("readThroughRun (legacy replica + new DB)", () => { splitEnabled: true, newClient: prisma17 as unknown as PrismaReplicaClient, legacyReplica: prisma14 as unknown as PrismaReplicaClient, - isKnownMigrated: throwingFilter, }, }); expect(result.source).toBe("new"); expect(throwingLegacy).not.toHaveBeenCalled(); - expect(throwingFilter).not.toHaveBeenCalled(); } ); }); diff --git a/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts b/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts index a4be2fdd1e5..8b83b70d78f 100644 --- a/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts @@ -3,12 +3,13 @@ * (which carries the read load we are shedding). Disabled entirely when isSplitEnabled() * is false (single-DB passthrough). * - * During the retention window, old not-yet-migrated run-ops rows are served off - * the legacy read replica. A known-migrated filter short-circuits re-probing legacy for - * runs already living on the new DB. After termination, past-retention runs return the - * normal not-found response. Patterned on `mollifier/resolveRunForMutation.server.ts` - * (`?? default` DI), but with the legacy-primary/writer fallback deliberately removed: - * this layer has NO legacy-writer handle at all (structural guarantee). + * During the retention window, old run-ops rows are served off the legacy read replica. + * Residency is decided purely by id-shape: a ksuid (NEW) id reads new only, a cuid + * (LEGACY) id reads legacy only. An unclassifiable id falls back to a new-then-legacy + * probe. 
After termination, past-retention runs return the normal not-found response. + * Patterned on `mollifier/resolveRunForMutation.server.ts` (`?? default` DI), but with + * the legacy-primary/writer fallback deliberately removed: this layer has NO legacy-writer + * handle at all (structural guarantee). */ import type { PrismaReplicaClient } from "~/db.server"; import { @@ -17,7 +18,6 @@ import { } from "~/db.server"; import { logger as defaultLogger } from "~/services/logger.server"; import { ownerEngine, UnclassifiableRunId } from "@trigger.dev/core/v3/isomorphic"; -import { isKnownMigrated as defaultIsKnownMigrated } from "./knownMigratedFilter.server"; import { isSplitEnabled } from "./splitMode.server"; export type ReadThroughSource = "new" | "legacy-replica"; @@ -32,7 +32,6 @@ export type ReadThroughDeps = { legacyReplica?: PrismaReplicaClient; /** Resolved boot constant; never `await`ed per-request when supplied. */ splitEnabled?: boolean; - isKnownMigrated?: (runId: string) => Promise; isPastRetention?: (runId: string) => boolean; logger?: { warn: (m: string, meta?: unknown) => void }; /** Saturation-signal emit hook: called on each legacy-replica hit. */ @@ -58,7 +57,7 @@ export async function readThroughRun( const splitEnabled = deps?.splitEnabled ?? (await isSplitEnabled()); // Passthrough: single plain read against the one collapsed store. No legacy read, - // no marker check, no isKnownMigrated, no second connection. + // no second connection. if (!splitEnabled) { const v = await input.readNew(newClient); return v != null ? { source: "new", value: v } : { source: "not-found" }; @@ -93,13 +92,6 @@ export async function readThroughRun( return { source: "new", value: v }; } - // Known-migrated short-circuit: the row is on new but the new read missed it - // (lag / select shape). Do NOT re-probe legacy. - const isMigrated = deps?.isKnownMigrated ?? 
defaultIsKnownMigrated; - if (await isMigrated(runId)) { - return { source: "not-found" }; - } - // Legacy READ REPLICA only — never a legacy writer/primary (no such handle exists). const lv = await input.readLegacy(legacyReplica); if (lv != null) { diff --git a/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts index ce4ae5a1d4a..74baee1bb6e 100644 --- a/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts +++ b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts @@ -1,64 +1,15 @@ -import { describe, expect, it, vi } from "vitest"; +import { describe, expect, it } from "vitest"; import { resolveInheritedMintKind } from "./resolveInheritedMintKind.server"; const NEW_PARENT = `run_${"a".repeat(27)}`; // ksuid id-shape -> NEW const LEGACY_PARENT = `run_${"b".repeat(25)}`; // cuid id-shape -> LEGACY -describe("resolveInheritedMintKind (pure, shared across all mint paths)", () => { - it("inherits a ksuid (NEW) parent by id-shape, split off, marker never read", async () => { - const isKnownMigrated = vi.fn(); - const kind = await resolveInheritedMintKind(NEW_PARENT, { - isSplitEnabled: async () => false, - isKnownMigrated, - }); - expect(kind).toBe("ksuid"); - expect(isKnownMigrated).not.toHaveBeenCalled(); +describe("resolveInheritedMintKind (pure id-shape, shared across all mint paths)", () => { + it("inherits a ksuid (NEW) parent by id-shape -> ksuid", () => { + expect(resolveInheritedMintKind(NEW_PARENT)).toBe("ksuid"); }); - it("inherits a cuid (LEGACY) parent by id-shape, split off, marker never read", async () => { - const isKnownMigrated = vi.fn(); - const kind = await resolveInheritedMintKind(LEGACY_PARENT, { - isSplitEnabled: async () => false, - isKnownMigrated, - }); - expect(kind).toBe("cuid"); - expect(isKnownMigrated).not.toHaveBeenCalled(); - }); - - // The gap this helper closes: split OFF = one physical DB, 
and a probeNew-backed - // isKnownMigrated returns true for any extant parent. The guard must skip the marker - // when split is off so a cuid parent keeps minting cuid children (byte-identical to today). - it("does NOT consult the marker when split is OFF (hot-path zero-IO; byte-identical to today)", async () => { - const isKnownMigrated = vi.fn().mockResolvedValue(true); - const kind = await resolveInheritedMintKind(LEGACY_PARENT, { - isSplitEnabled: async () => false, - isKnownMigrated, - }); - expect(kind).toBe("cuid"); - expect(isKnownMigrated).not.toHaveBeenCalled(); - }); - - it("split ON + legacy-by-shape parent already migrated (marker true) -> ksuid (co-resident on NEW)", async () => { - const kind = await resolveInheritedMintKind(LEGACY_PARENT, { - isSplitEnabled: async () => true, - isKnownMigrated: async () => true, - }); - expect(kind).toBe("ksuid"); - }); - - it("split ON + legacy-by-shape parent NOT migrated (marker false) -> cuid (stays LEGACY)", async () => { - const kind = await resolveInheritedMintKind(LEGACY_PARENT, { - isSplitEnabled: async () => true, - isKnownMigrated: async () => false, - }); - expect(kind).toBe("cuid"); - }); - - it("split ON + ksuid parent -> ksuid regardless of marker (already NEW)", async () => { - const kind = await resolveInheritedMintKind(NEW_PARENT, { - isSplitEnabled: async () => true, - isKnownMigrated: async () => false, - }); - expect(kind).toBe("ksuid"); + it("inherits a cuid (LEGACY) parent by id-shape -> cuid", () => { + expect(resolveInheritedMintKind(LEGACY_PARENT)).toBe("cuid"); }); }); diff --git a/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts index 79e41b41dff..e43c3a8e33c 100644 --- a/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts @@ -1,21 +1,10 @@ import { ownerEngine } from "@trigger.dev/core/v3/isomorphic"; 
import type { RunIdMintKind } from "./runOpsMintKind.server"; -type InheritedMintKindDeps = { - isSplitEnabled: () => Promise; - isKnownMigrated: (runId: string) => Promise; -}; - // Mint a child in the SAME physical store as its anchor (parent run / owning batch), // regardless of the org's current mint flag — keeps a subgraph co-resident across a -// flip. Marker-aware inheritance only matters with split on; split off is a pure -// id-shape check (zero hot-path I/O, byte-identical to today). -export async function resolveInheritedMintKind( - parentRunFriendlyId: string, - deps: InheritedMintKindDeps -): Promise { - if ((await deps.isSplitEnabled()) && (await deps.isKnownMigrated(parentRunFriendlyId))) { - return "ksuid"; - } +// flip. With no migration/drain, residency is a pure id-shape check (zero hot-path +// I/O): a ksuid (NEW) parent mints ksuid children, a cuid (LEGACY) parent mints cuid. +export function resolveInheritedMintKind(parentRunFriendlyId: string): RunIdMintKind { return ownerEngine(parentRunFriendlyId) === "NEW" ? "ksuid" : "cuid"; } diff --git a/apps/webapp/test/runEngineHandlers.test.ts b/apps/webapp/test/runEngineHandlers.test.ts index 5fff3cba5b1..d57ebf069e0 100644 --- a/apps/webapp/test/runEngineHandlers.test.ts +++ b/apps/webapp/test/runEngineHandlers.test.ts @@ -278,8 +278,6 @@ describe("runEngineHandlers read-through cross-version", () => { newReplica: prisma17, legacyReplica: prisma14, splitEnabled: true, - // Pure boundary: this legacy run was never migrated, so don't short-circuit. 
- isKnownMigrated: async () => false, }; const run = await readRunForEvent( From 90e7a509c95687b877ae0e8570000b7c36dd698e Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 2 Jul 2026 13:55:31 +0100 Subject: [PATCH 03/14] test(run-ops): add cache + distinct-db sentinel tests; drop test enumeration labels Add a pure unit test for ControlPlaneCache covering per-slot round-trips, null-vs-miss distinction, epoch-based invalidation, per-slot key isolation, bounded eviction, and TTL expiry. Add a testcontainer test for probeDistinctDatabases covering distinct clusters, same physical database (with reason), same-cluster-different-database, and fail-closed probe failure. Strip developer-enumeration labels from three existing test files (readThrough step numbers, runEngineHandlers Test-X comments) and rename the run-detail loader read-through test to drop the non-domain "shape 1" name. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../controlPlaneCache.server.test.ts | 143 ++++++++++++++++++ .../readThrough.server.test.ts | 8 +- ...lLoaders.controlPlane.readthrough.test.ts} | 12 +- apps/webapp/test/runEngineHandlers.test.ts | 19 ++- .../distinctDbSentinel.server.test.ts | 62 ++++++++ 5 files changed, 224 insertions(+), 20 deletions(-) create mode 100644 apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts rename apps/webapp/test/{shape1RunDetailLoaders.controlPlane.readthrough.test.ts => runDetailLoaders.controlPlane.readthrough.test.ts} (93%) create mode 100644 apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts new file mode 100644 index 00000000000..f83d534c904 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts @@ -0,0 +1,143 @@ +import { describe, expect, it } from "vitest"; +import { + ControlPlaneCache, + type ResolvedAuthenticatedEnv, + type 
ResolvedEnv, + type ResolvedRunLockedWorker, + type ResolvedWorkerVersion, +} from "./controlPlaneCache.server"; + +// Minimal, structurally-irrelevant stand-ins: the cache stores and returns opaque values by +// reference, so these only need to be distinguishable objects — the slot types are exercised for +// key routing, not field shape. +const anEnv = { id: "env_1" } as unknown as ResolvedEnv; +const aVersion = { worker: { id: "bw_1" } } as unknown as ResolvedWorkerVersion; +const anAuthEnv = { id: "env_1", slug: "prod" } as unknown as ResolvedAuthenticatedEnv; +const aLockedWorker = { lockedBy: null, lockedToVersion: null } as ResolvedRunLockedWorker; + +describe("ControlPlaneCache", () => { + it("round-trips a value through every slot", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_1", anEnv); + cache.setWorkerVersion("env_1:current", aVersion); + cache.setEnvExists("env_1", true); + cache.setAuthEnv("env_1", anAuthEnv); + cache.setLockedWorker("bw_1:v_1", aLockedWorker); + + expect(cache.getEnv("env_1")).toBe(anEnv); + expect(cache.getWorkerVersion("env_1:current")).toBe(aVersion); + expect(cache.getEnvExists("env_1")).toBe(true); + expect(cache.getAuthEnv("env_1")).toBe(anAuthEnv); + expect(cache.getLockedWorker("bw_1:v_1")).toBe(aLockedWorker); + }); + + it("returns undefined for a key that was never set, in every slot", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + expect(cache.getEnv("missing")).toBeUndefined(); + expect(cache.getWorkerVersion("missing")).toBeUndefined(); + expect(cache.getEnvExists("missing")).toBeUndefined(); + expect(cache.getAuthEnv("missing")).toBeUndefined(); + expect(cache.getLockedWorker("missing")).toBeUndefined(); + }); + + it("distinguishes a cached null (confirmed absence) from an unset miss", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + expect(cache.getEnv("env_2")).toBeUndefined(); + 
cache.setEnv("env_2", null); + expect(cache.getEnv("env_2")).toBeNull(); + + expect(cache.getAuthEnv("env_2")).toBeUndefined(); + cache.setAuthEnv("env_2", null); + expect(cache.getAuthEnv("env_2")).toBeNull(); + + expect(cache.getWorkerVersion("env_2:current")).toBeUndefined(); + cache.setWorkerVersion("env_2:current", null); + expect(cache.getWorkerVersion("env_2:current")).toBeNull(); + + expect(cache.getLockedWorker("_:_")).toBeUndefined(); + cache.setLockedWorker("_:_", null); + expect(cache.getLockedWorker("_:_")).toBeNull(); + }); + + it("caches a false env-existence result distinctly from an unset miss", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + expect(cache.getEnvExists("env_3")).toBeUndefined(); + cache.setEnvExists("env_3", false); + expect(cache.getEnvExists("env_3")).toBe(false); + }); + + it("invalidateEnv forces the next getEnv to miss", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_4", anEnv); + expect(cache.getEnv("env_4")).toBe(anEnv); + + cache.invalidateEnv("env_4"); + expect(cache.getEnv("env_4")).toBeUndefined(); + }); + + it("makes a re-setEnv after invalidation readable again", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const replacement = { id: "env_5b" } as unknown as ResolvedEnv; + + cache.setEnv("env_5", anEnv); + cache.invalidateEnv("env_5"); + expect(cache.getEnv("env_5")).toBeUndefined(); + + cache.setEnv("env_5", replacement); + expect(cache.getEnv("env_5")).toBe(replacement); + }); + + it("invalidateEnv is scoped to its own id", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const other = { id: "env_keep" } as unknown as ResolvedEnv; + + cache.setEnv("env_drop", anEnv); + cache.setEnv("env_keep", other); + cache.invalidateEnv("env_drop"); + + expect(cache.getEnv("env_drop")).toBeUndefined(); + expect(cache.getEnv("env_keep")).toBe(other); + }); + + 
it("does not collide keys across slots for the same id", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("x", anEnv); + cache.setEnvExists("x", true); + cache.setAuthEnv("x", anAuthEnv); + + expect(cache.getEnv("x")).toBe(anEnv); + expect(cache.getEnvExists("x")).toBe(true); + expect(cache.getAuthEnv("x")).toBe(anAuthEnv); + + // Invalidating the env slot leaves the sibling slots for the same id intact. + cache.invalidateEnv("x"); + expect(cache.getEnv("x")).toBeUndefined(); + expect(cache.getEnvExists("x")).toBe(true); + expect(cache.getAuthEnv("x")).toBe(anAuthEnv); + }); + + it("evicts the oldest entry once maxEntries is exceeded", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 2 }); + + cache.setEnv("first", { id: "first" } as unknown as ResolvedEnv); + cache.setEnv("second", { id: "second" } as unknown as ResolvedEnv); + cache.setEnv("third", { id: "third" } as unknown as ResolvedEnv); + + expect(cache.getEnv("first")).toBeUndefined(); + expect(cache.getEnv("second")).toMatchObject({ id: "second" }); + expect(cache.getEnv("third")).toMatchObject({ id: "third" }); + }); + + it("treats a zero-TTL entry as immediately expired", () => { + const cache = new ControlPlaneCache({ ttlMs: 0, maxEntries: 100 }); + + cache.setEnv("env_ttl", anEnv); + expect(cache.getEnv("env_ttl")).toBeUndefined(); + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts b/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts index 3d8dc914761..1fe52189c83 100644 --- a/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts +++ b/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts @@ -40,7 +40,7 @@ function toHttpish(result: ReadThroughResult): { status: number; value?: T describe("readThroughRun (legacy replica + new DB)", () => { heteroPostgresTest( - "Step 1: old in-retention run is served from the legacy REPLICA, never a primary", + "old in-retention run 
is served from the legacy REPLICA, never a primary", async ({ prisma14, prisma17 }) => { // legacy hit, new miss. The layer has NO legacy-writer handle at all — the // read resolving through `legacyReplica` (prisma14) IS the structural guarantee @@ -63,7 +63,7 @@ describe("readThroughRun (legacy replica + new DB)", () => { ); heteroPostgresTest( - "Step 3: post-termination past-retention returns the normal not-found surface", + "post-termination past-retention returns the normal not-found surface", async ({ prisma14, prisma17 }) => { const pastRetentionResult = await readThroughRun({ runId: LEGACY_RUN_ID, @@ -102,7 +102,7 @@ describe("readThroughRun (legacy replica + new DB)", () => { ); heteroPostgresTest( - "Step 4: single-DB passthrough — only readNew runs, legacy never touched", + "single-DB passthrough — only readNew runs, legacy never touched", async ({ prisma14, prisma17 }) => { const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { throw new Error("readLegacy must never run in single-DB mode"); @@ -128,7 +128,7 @@ describe("readThroughRun (legacy replica + new DB)", () => { ); heteroPostgresTest( - "Step 5: new-residency fast-path — legacy replica is never touched", + "new-residency fast-path — legacy replica is never touched", async ({ prisma14, prisma17 }) => { const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { throw new Error("readLegacy must never run for a NEW-residency id"); diff --git a/apps/webapp/test/shape1RunDetailLoaders.controlPlane.readthrough.test.ts b/apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts similarity index 93% rename from apps/webapp/test/shape1RunDetailLoaders.controlPlane.readthrough.test.ts rename to apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts index 29fe5972ac9..cc2e4bc02c8 100644 --- a/apps/webapp/test/shape1RunDetailLoaders.controlPlane.readthrough.test.ts +++ b/apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts @@ -1,4 
+1,4 @@ -// Dedicated run-ops proof: Shape-1 run-detail loaders read the run by friendlyId on the dedicated +// Dedicated run-ops proof: the run-detail page loaders read the run by friendlyId on the dedicated // run-ops client (PG17, subset schema with no control-plane tables), then authorize membership + // resolve env on PG14. Neither DB joins the other. import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; @@ -48,8 +48,8 @@ async function seedAll(prisma: PrismaClient) { return { organization, project, environment, member, stranger }; } -// [TEST-NEWSEED] The run lives on the dedicated run-ops client; its control-plane FKs are synthetic -// scalar ids pointing at rows that exist only on PG14 (the dedicated DB has no such tables). +// The run lives on the dedicated run-ops client; its control-plane FKs are synthetic scalar ids +// pointing at rows that exist only on PG14 (the dedicated DB has no such tables). async function seedKsuidRun(prisma17: RunOpsPrismaClient, cp: Awaited>) { const k = n++; return prisma17.taskRun.create({ @@ -61,10 +61,10 @@ async function seedKsuidRun(prisma17: RunOpsPrismaClient, cp: Awaited { +describe("run-detail loaders cross-DB read-through (dedicated run-ops client)", () => { heteroRunOpsPostgresTest( "ksuid run resolves: friendlyId read on the dedicated run-ops DB + membership/env auth on PG14 (resources.runs.$runParam shape)", async ({ prisma14, prisma17 }) => { diff --git a/apps/webapp/test/runEngineHandlers.test.ts b/apps/webapp/test/runEngineHandlers.test.ts index d57ebf069e0..2c57d87506e 100644 --- a/apps/webapp/test/runEngineHandlers.test.ts +++ b/apps/webapp/test/runEngineHandlers.test.ts @@ -151,7 +151,7 @@ function failure(index: number, errorCode: string, extra?: Record { - // Test A: a NEW run resolves via read-through against the new store. + // A NEW run resolves via read-through against the new store. 
containerTest("event read resolves a NEW run via read-through", async ({ prisma }) => { const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); const { organization, project, environment } = await seedEnvironment(prisma, "a"); @@ -181,7 +181,7 @@ describe("runEngineHandlers read-through", () => { expect(run!.taskEventStore).toBe("taskEvent"); }); - // Test C: single-DB short-circuit — readLegacy must never be invoked. + // Single-DB short-circuit — readLegacy must never be invoked. containerTest("single-DB short-circuit never touches a legacy handle", async ({ prisma }) => { const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); const { organization, project, environment } = await seedEnvironment(prisma, "c"); @@ -237,8 +237,8 @@ describe("runEngineHandlers read-through", () => { }); describe("runEngineHandlers read-through cross-version", () => { - // Test B (heterogeneous recast): an OLD in-retention run is served off the LEGACY - // REPLICA only, and the legacy primary/writer is structurally absent. + // An OLD in-retention run is served off the LEGACY REPLICA only, and the legacy + // primary/writer is structurally absent. heteroPostgresTest( "event read resolves an OLD in-retention run via the legacy replica", async ({ prisma14, prisma17 }) => { @@ -378,7 +378,7 @@ describe("runEngineHandlers batch completion", () => { expect(batch.processingCompletedAt).toBeNull(); }); - // Test E: callback retry is idempotent via skipDuplicates. + // Callback retry is idempotent via skipDuplicates. containerTest("batch txn is idempotent on callback retry", async ({ prisma }) => { const { environment } = await seedEnvironment(prisma, "e"); const batchId = "e".repeat(25); @@ -404,7 +404,7 @@ describe("runEngineHandlers batch completion", () => { expect(errors).toHaveLength(2); }); - // Test I: aggregate fast-path collapses same-errorCode failures to one row. + // Aggregate fast-path collapses same-errorCode failures to one row. 
containerTest("aggregate fast-path collapses queue-size-limit failures", async ({ prisma }) => { const { environment } = await seedEnvironment(prisma, "i"); const batchId = "f".repeat(25); @@ -510,8 +510,8 @@ describe("runEngineHandlers batch residency routing", () => { expect(writer).toBe(prisma); }); - // Test G (heterogeneous recast): a legacy-resident batch (row only on the legacy DB) commits on - // the LEGACY writer; the NEW DB is left with zero rows for the batch. + // A legacy-resident batch (row only on the legacy DB) commits on the LEGACY writer; + // the NEW DB is left with zero rows for the batch. heteroPostgresTest( "legacy-resident batch routes to the LEGACY writer, new DB untouched", async ({ prisma14, prisma17 }) => { @@ -617,8 +617,7 @@ describe("runEngineHandlers batch residency routing", () => { } ); - // Test H (heterogeneous recast): a new batch (row only on the new DB) commits on the NEW - // writer; the LEGACY DB is untouched. + // A new batch (row only on the new DB) commits on the NEW writer; the LEGACY DB is untouched. heteroPostgresTest( "new batch routes to the NEW writer, legacy DB untouched", async ({ prisma14, prisma17 }) => { diff --git a/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts b/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts new file mode 100644 index 00000000000..9ab3a1e5dae --- /dev/null +++ b/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts @@ -0,0 +1,62 @@ +import { heteroPostgresTest } from "@internal/testcontainers"; +import { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { probeDistinctDatabases } from "~/v3/runOpsMigration/distinctDbSentinel.server"; + +// Spinning up two separate postgres clusters and probing each can exceed the 5s default. 
+vi.setConfig({ testTimeout: 60_000 }); + +function urlWithDatabase(uri: string, database: string): string { + const url = new URL(uri); + url.pathname = `/${database}`; + return url.toString(); +} + +describe("probeDistinctDatabases", () => { + heteroPostgresTest( + "reports distinct for two separate physical clusters", + async ({ uri14, uri17 }) => { + const result = await probeDistinctDatabases(uri14, uri17); + expect(result).toEqual({ distinct: true }); + } + ); + + heteroPostgresTest( + "reports NOT distinct, citing the same physical database, when both URLs point at it", + async ({ uri14 }) => { + const result = await probeDistinctDatabases(uri14, uri14); + expect(result.distinct).toBe(false); + if (result.distinct === false) { + expect(result.reason).toMatch(/same physical database/i); + } + } + ); + + heteroPostgresTest( + "reports distinct for two databases in the SAME cluster", + async ({ postgresContainer14, uri14 }) => { + const otherDb = `sentinel_other_${Date.now()}`; + const admin = new PrismaClient({ + datasources: { db: { url: urlWithDatabase(postgresContainer14.getConnectionUri(), "postgres") } }, + }); + try { + await admin.$executeRawUnsafe(`CREATE DATABASE "${otherDb}"`); + } finally { + await admin.$disconnect(); + } + + const otherUrl = urlWithDatabase(uri14, otherDb); + const result = await probeDistinctDatabases(uri14, otherUrl); + expect(result).toEqual({ distinct: true }); + } + ); + + heteroPostgresTest( + "fails closed to NOT distinct when a probe cannot reach a database", + async ({ uri14 }) => { + const unreachable = "postgresql://nobody:nobody@127.0.0.1:1/does_not_exist"; + const result = await probeDistinctDatabases(uri14, unreachable); + expect(result.distinct).toBe(false); + } + ); +}); From ec0032dd908a8f876926f704e47978bb2e506e2b Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 2 Jul 2026 15:14:59 +0100 Subject: [PATCH 04/14] build(run-ops): sync pnpm-lock apps/webapp importer with its declared deps 
apps/webapp/package.json declares @internal/run-ops-database (workspace) and @testcontainers/postgresql but the lockfile importer entry was never regenerated, so pnpm install --frozen-lockfile fails for the webapp. Regenerate the importer. Co-Authored-By: Claude Opus 4.8 (1M context) --- pnpm-lock.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 034f7d76ef4..1a56a054f42 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -371,6 +371,9 @@ importers: '@internal/run-engine': specifier: workspace:* version: link:../../internal-packages/run-engine + '@internal/run-ops-database': + specifier: workspace:* + version: link:../../internal-packages/run-ops-database '@internal/run-store': specifier: workspace:* version: link:../../internal-packages/run-store @@ -936,6 +939,9 @@ importers: '@tailwindcss/typography': specifier: ^0.5.9 version: 0.5.9(tailwindcss@3.4.1) + '@testcontainers/postgresql': + specifier: ^11.14.0 + version: 11.14.0 '@total-typescript/ts-reset': specifier: ^0.4.2 version: 0.4.2 From 72ace5c8591f62acc0bd9dc16473d7a15a2d6242 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 2 Jul 2026 15:32:14 +0100 Subject: [PATCH 05/14] style(run-ops): apply oxfmt Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/webapp/app/db.server.ts | 10 +++- .../runEngineControlPlaneResolver.server.ts | 13 ++--- apps/webapp/app/v3/runStore.server.ts | 3 +- ...ilLoaders.controlPlane.readthrough.test.ts | 20 ++++++- .../controlPlaneRepoint.server.test.ts | 35 +++++------ .../distinctDbSentinel.server.test.ts | 4 +- ...nEngineControlPlaneResolver.server.test.ts | 58 ++++++++++--------- 7 files changed, 81 insertions(+), 62 deletions(-) diff --git a/apps/webapp/app/db.server.ts b/apps/webapp/app/db.server.ts index 179f9976205..9e481e53bb9 100644 --- a/apps/webapp/app/db.server.ts +++ b/apps/webapp/app/db.server.ts @@ -156,7 +156,9 @@ function tagDatasourceRunOps( // Same wrapper as captureInfrastructureErrors, bridged via double 
cast because // that helper is constrained to T extends @trigger.dev/database.PrismaClient. function captureInfraErrorsRunOps(client: RunOpsPrismaClient): RunOpsPrismaClient { - return captureInfrastructureErrors(client as unknown as PrismaClient) as unknown as RunOpsPrismaClient; + return captureInfrastructureErrors( + client as unknown as PrismaClient + ) as unknown as RunOpsPrismaClient; } export const prisma = singleton("prisma", () => @@ -261,8 +263,10 @@ export const runOpsNewPrismaClient: RunOpsPrismaClient = runOpsTopology.newRunOp export const runOpsNewReplicaClient: RunOpsPrismaClient = runOpsTopology.newRunOps.replica; // Legacy-typed aliases kept for the remaining consumers that still expect PrismaClient / // PrismaReplicaClient (idempotency residency, read-through, handlers, cascade cleanup). -export const runOpsNewPrisma: PrismaClient = runOpsTopology.newRunOps.writer as unknown as PrismaClient; -export const runOpsNewReplica: PrismaReplicaClient = runOpsTopology.newRunOps.replica as unknown as PrismaReplicaClient; +export const runOpsNewPrisma: PrismaClient = runOpsTopology.newRunOps + .writer as unknown as PrismaClient; +export const runOpsNewReplica: PrismaReplicaClient = runOpsTopology.newRunOps + .replica as unknown as PrismaReplicaClient; export const runOpsLegacyPrisma: PrismaClient = runOpsTopology.legacyRunOps.writer; export const runOpsLegacyReplica: PrismaReplicaClient = runOpsTopology.legacyRunOps.replica; diff --git a/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts index b958d6d24f7..c505e3d1081 100644 --- a/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts @@ -6,10 +6,7 @@ import type { } from "@internal/run-engine"; import type { RuntimeEnvironmentType } from "@trigger.dev/database"; import { $replica } from "~/db.server"; -import 
{ - authIncludeWithParent, - toAuthenticated, -} from "~/models/runtimeEnvironment.server"; +import { authIncludeWithParent, toAuthenticated } from "~/models/runtimeEnvironment.server"; import { ControlPlaneResolver as AppControlPlaneResolver, controlPlaneResolver, @@ -64,9 +61,7 @@ export class RunEngineControlPlaneResolver implements EngineControlPlaneResolver }); } - async resolveAuthenticatedEnv( - environmentId: string - ): Promise { + async resolveAuthenticatedEnv(environmentId: string): Promise { // Mirror findEnvironmentById's data source ($replica) and auth shape, but the // engine needs `git` too. A single findFirst with `include: authIncludeWithParent` // returns all RuntimeEnvironment scalars (including `git`) on the row, so we map @@ -94,4 +89,6 @@ export class RunEngineControlPlaneResolver implements EngineControlPlaneResolver } // Module-level singleton over the app resolver singleton. -export const runEngineControlPlaneResolver = new RunEngineControlPlaneResolver(controlPlaneResolver); +export const runEngineControlPlaneResolver = new RunEngineControlPlaneResolver( + controlPlaneResolver +); diff --git a/apps/webapp/app/v3/runStore.server.ts b/apps/webapp/app/v3/runStore.server.ts index 3cab13f9419..4173fc55eaa 100644 --- a/apps/webapp/app/v3/runStore.server.ts +++ b/apps/webapp/app/v3/runStore.server.ts @@ -76,8 +76,7 @@ export function buildRunStore(deps: BuildRunStoreDeps): RunStore { // RUN_OPS_SPLIT_ENABLED. Reads must fan out across both DBs so a run that lives on the new // DB stays visible even with the flag off (matches the db.server topology factory). The flag // governs write/mint residency + migration via isSplitEnabled(), not read visibility. 
-const ROUTING_ENABLED = - !!env.TASK_RUN_DATABASE_URL && !!env.TASK_RUN_LEGACY_DATABASE_URL; +const ROUTING_ENABLED = !!env.TASK_RUN_DATABASE_URL && !!env.TASK_RUN_LEGACY_DATABASE_URL; // Resolve the run-ops handles, tolerating contexts where they are absent — tests that mock // ~/db.server minimally omit them, and accessing a missing export under vi.mock throws. A diff --git a/apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts b/apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts index cc2e4bc02c8..d2bf3d6e193 100644 --- a/apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts +++ b/apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts @@ -118,7 +118,10 @@ describe("run-detail loaders cross-DB read-through (dedicated run-ops client)", expect(found!.id).toBe(run.id); const authorized = await cp14.project.findFirst({ - where: { id: found!.projectId, organization: { members: { some: { userId: cp.member.id } } } }, + where: { + id: found!.projectId, + organization: { members: { some: { userId: cp.member.id } } }, + }, select: { id: true }, }); expect(authorized).not.toBeNull(); @@ -148,7 +151,10 @@ describe("run-detail loaders cross-DB read-through (dedicated run-ops client)", expect(found).not.toBeNull(); const authorized = await cp14.project.findFirst({ - where: { id: found!.projectId, organization: { members: { some: { userId: cp.stranger.id } } } }, + where: { + id: found!.projectId, + organization: { members: { some: { userId: cp.stranger.id } } }, + }, select: { id: true }, }); expect(authorized).toBeNull(); @@ -165,7 +171,15 @@ describe("run-detail loaders cross-DB read-through (dedicated run-ops client)", const found = await runStore.findRun( { friendlyId: run.friendlyId }, - { select: { id: true, idempotencyKey: true, taskIdentifier: true, projectId: true, runtimeEnvironmentId: true } } + { + select: { + id: true, + idempotencyKey: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: 
true, + }, + } ); const env = await resolver.resolveAuthenticatedEnv(found!.runtimeEnvironmentId); expect(env!.slug).toBe(cp.environment.slug); diff --git a/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts b/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts index 322366125fb..fff5f848f75 100644 --- a/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts +++ b/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts @@ -127,26 +127,23 @@ heteroPostgresTest( // --- Relaxed-cache (no latency regression) ------------------------- -heteroPostgresTest( - "relaxed (longer TTL) cache still hits on the new DB", - async ({ prisma17 }) => { - const { environment } = await seedControlPlane(prisma17); - const { client: counting, reads } = countQueries(prisma17); - const resolver = new ControlPlaneResolver({ - controlPlanePrimary: counting, - controlPlaneReplica: counting, - // Relaxed: a much longer TTL than the default — same-provider resolution is cheap. - cache: new ControlPlaneCache({ ttlMs: 300_000, maxEntries: 10_000 }), - splitEnabled: () => true, - }); +heteroPostgresTest("relaxed (longer TTL) cache still hits on the new DB", async ({ prisma17 }) => { + const { environment } = await seedControlPlane(prisma17); + const { client: counting, reads } = countQueries(prisma17); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: counting, + controlPlaneReplica: counting, + // Relaxed: a much longer TTL than the default — same-provider resolution is cheap. + cache: new ControlPlaneCache({ ttlMs: 300_000, maxEntries: 10_000 }), + splitEnabled: () => true, + }); - expect(await resolver.resolveEnv(environment.id)).toMatchObject({ id: environment.id }); - expect(reads()).toBe(1); - // Second read served from the relaxed cache — no extra DB round-trip. 
- await resolver.resolveEnv(environment.id); - expect(reads()).toBe(1); - } -); + expect(await resolver.resolveEnv(environment.id)).toMatchObject({ id: environment.id }); + expect(reads()).toBe(1); + // Second read served from the relaxed cache — no extra DB round-trip. + await resolver.resolveEnv(environment.id); + expect(reads()).toBe(1); +}); // --- Cross-version transition (legacy DB -> new DB) ----------------------- diff --git a/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts b/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts index 9ab3a1e5dae..d2baaa6404a 100644 --- a/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts +++ b/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts @@ -37,7 +37,9 @@ describe("probeDistinctDatabases", () => { async ({ postgresContainer14, uri14 }) => { const otherDb = `sentinel_other_${Date.now()}`; const admin = new PrismaClient({ - datasources: { db: { url: urlWithDatabase(postgresContainer14.getConnectionUri(), "postgres") } }, + datasources: { + db: { url: urlWithDatabase(postgresContainer14.getConnectionUri(), "postgres") }, + }, }); try { await admin.$executeRawUnsafe(`CREATE DATABASE "${otherDb}"`); diff --git a/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts index c3084000ab7..0c0d2b80fa7 100644 --- a/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts +++ b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts @@ -119,25 +119,28 @@ async function seedWorker( } describe("RunEngineControlPlaneResolver adapter", () => { - heteroPostgresTest("resolveEnv maps app ResolvedEnv onto ResolvedEngineEnv", async ({ prisma14 }) => { - const { organization, project, environment } = await seedEnv(prisma14, "PRODUCTION"); - const adapter = new 
RunEngineControlPlaneResolver(buildAppResolver(prisma14)); + heteroPostgresTest( + "resolveEnv maps app ResolvedEnv onto ResolvedEngineEnv", + async ({ prisma14 }) => { + const { organization, project, environment } = await seedEnv(prisma14, "PRODUCTION"); + const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14)); - const env = await adapter.resolveEnv(environment.id); - expect(env).not.toBeNull(); - expect(env!.id).toBe(environment.id); - expect(env!.type).toBe("PRODUCTION"); - expect(env!.projectId).toBe(project.id); - expect(env!.organizationId).toBe(organization.id); - // Nested + concurrency fields the run-engine MinimalAuthenticatedEnvironment requires. - expect(env!.project.id).toBe(project.id); - expect(env!.organization.id).toBe(organization.id); - expect(env!.maximumConcurrencyLimit).toBe(9); - expect(env!.concurrencyLimitBurstFactor.toNumber()).toBe(2); - expect(env!.archivedAt).toBeNull(); + const env = await adapter.resolveEnv(environment.id); + expect(env).not.toBeNull(); + expect(env!.id).toBe(environment.id); + expect(env!.type).toBe("PRODUCTION"); + expect(env!.projectId).toBe(project.id); + expect(env!.organizationId).toBe(organization.id); + // Nested + concurrency fields the run-engine MinimalAuthenticatedEnvironment requires. 
+ expect(env!.project.id).toBe(project.id); + expect(env!.organization.id).toBe(organization.id); + expect(env!.maximumConcurrencyLimit).toBe(9); + expect(env!.concurrencyLimitBurstFactor.toNumber()).toBe(2); + expect(env!.archivedAt).toBeNull(); - expect(await adapter.resolveEnv("env_missing")).toBeNull(); - }); + expect(await adapter.resolveEnv("env_missing")).toBeNull(); + } + ); heteroPostgresTest( "resolveWorkerVersion (deployed, no workerId) resolves the promoted MANAGED deployment", @@ -156,7 +159,9 @@ describe("RunEngineControlPlaneResolver adapter", () => { }); expect(version).not.toBeNull(); expect(version!.worker.id).toBe(seeded.worker.id); - expect(version!.deployment?.id).toBe("deployment" in seeded ? seeded.deployment.id : undefined); + expect(version!.deployment?.id).toBe( + "deployment" in seeded ? seeded.deployment.id : undefined + ); expect(version!.tasks.map((t) => t.slug)).toContain("my-task"); } ); @@ -182,13 +187,14 @@ describe("RunEngineControlPlaneResolver adapter", () => { } ); - heteroPostgresTest("assertEnvExists resolves for a present env, rejects for a missing one", async ({ - prisma14, - }) => { - const { environment } = await seedEnv(prisma14, "PRODUCTION"); - const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14)); + heteroPostgresTest( + "assertEnvExists resolves for a present env, rejects for a missing one", + async ({ prisma14 }) => { + const { environment } = await seedEnv(prisma14, "PRODUCTION"); + const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14)); - await expect(adapter.assertEnvExists(environment.id)).resolves.toBeUndefined(); - await expect(adapter.assertEnvExists("env_missing")).rejects.toThrow(); - }); + await expect(adapter.assertEnvExists(environment.id)).resolves.toBeUndefined(); + await expect(adapter.assertEnvExists("env_missing")).rejects.toThrow(); + } + ); }); From c4d4f4a32431f2b969de07d97ad4b0e8e4d31279 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 2 Jul 
2026 17:24:02 +0100 Subject: [PATCH 06/14] fix(run-ops split): interlock split mode against native realtime backend Enabling RUN_OPS_SPLIT_ENABLED without REALTIME_BACKEND_NATIVE_ENABLED silently breaks realtime: Electric replicates only from the control-plane DB, so NEW-resident (ksuid) runs on the dedicated run-ops DB are invisible and every realtime subscription hangs. Add a boot-time interlock that refuses split mode in that misconfiguration, mirroring the existing distinct-DB data-loss sentinel. The check is a pure predicate (assertSplitRealtimeInterlock) run synchronously inside assertRunOpsSplitSentinel on the same eager-boot path, failing fast before the async DB probe and before any run-ops routing is wired. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../run-ops-split-realtime-interlock.md | 12 ++++++++ apps/webapp/app/db.server.ts | 12 +++++++- .../v3/runOpsMigration/splitMode.server.ts | 23 +++++++++++++++ apps/webapp/test/runOpsSplitMode.test.ts | 28 ++++++++++++++++++- 4 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 .server-changes/run-ops-split-realtime-interlock.md diff --git a/.server-changes/run-ops-split-realtime-interlock.md b/.server-changes/run-ops-split-realtime-interlock.md new file mode 100644 index 00000000000..f05bdf3744f --- /dev/null +++ b/.server-changes/run-ops-split-realtime-interlock.md @@ -0,0 +1,12 @@ +--- +area: webapp +type: fix +--- + +Add a boot-time interlock that refuses to enable the run-ops DB split +(`RUN_OPS_SPLIT_ENABLED`) unless the native realtime backend +(`REALTIME_BACKEND_NATIVE_ENABLED`) is also on. Electric replicates only from the +control-plane database, so enabling the split without the native backend would +leave NEW-resident (ksuid) runs invisible to realtime and hang every +subscription. The check runs synchronously on the same eager-boot path as the +existing distinct-DB sentinel and fails fast before any run-ops routing is wired. 
diff --git a/apps/webapp/app/db.server.ts b/apps/webapp/app/db.server.ts index 9e481e53bb9..79d86ac14ba 100644 --- a/apps/webapp/app/db.server.ts +++ b/apps/webapp/app/db.server.ts @@ -19,7 +19,10 @@ import { logTransactionInfrastructureError, } from "./utils/prismaErrors"; import { singleton } from "./utils/singleton"; -import { isSplitEnabled } from "./v3/runOpsMigration/splitMode.server"; +import { + isSplitEnabled, + assertSplitRealtimeInterlock, +} from "./v3/runOpsMigration/splitMode.server"; import { computeRunOpsSplitReadEnabled } from "./v3/runOpsMigration/runOpsSplitReadGate"; import { DATASOURCE_CONTEXT_KEY, startActiveSpan } from "./v3/tracer.server"; import type { Span } from "@opentelemetry/api"; @@ -284,6 +287,13 @@ export const runOpsSplitReadEnabled: boolean = computeRunOpsSplitReadEnabled({ // call it from the eager-boot path before any run-ops routing is wired. export async function assertRunOpsSplitSentinel(): Promise<void> { if (!env.RUN_OPS_SPLIT_ENABLED) return; + // Realtime interlock (synchronous): Electric replicates only from the control-plane + // DB, so split-on without the native realtime backend leaves NEW-resident runs + // invisible and hangs every subscription. Fail fast before the async DB probe. 
+ assertSplitRealtimeInterlock({ + splitEnabled: env.RUN_OPS_SPLIT_ENABLED, + nativeRealtimeEnabled: env.REALTIME_BACKEND_NATIVE_ENABLED === "1", + }); const ok = await isSplitEnabled(); if (!ok) { throw new Error( diff --git a/apps/webapp/app/v3/runOpsMigration/splitMode.server.ts b/apps/webapp/app/v3/runOpsMigration/splitMode.server.ts index d3048039951..8d0eb807dbf 100644 --- a/apps/webapp/app/v3/runOpsMigration/splitMode.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/splitMode.server.ts @@ -40,6 +40,29 @@ export async function computeSplitEnabled( return result.distinct === true; } +export type SplitRealtimeInterlockConfig = { + splitEnabled: boolean; + nativeRealtimeEnabled: boolean; +}; + +/** + * Boot-time realtime interlock (pure predicate). Split mode puts NEW-resident + * (ksuid) runs on the dedicated run-ops DB, but Electric replicates only from the + * control-plane DB — with the native realtime backend OFF those runs are invisible + * and every realtime subscription hangs. Refuse split unless native is on; split-off + * is always allowed regardless of the realtime backend. + */ +export function assertSplitRealtimeInterlock(config: SplitRealtimeInterlockConfig): void { + if (!config.splitEnabled) { + return; + } + if (!config.nativeRealtimeEnabled) { + throw new Error( + "RUN_OPS_SPLIT_ENABLED is on but the native realtime backend (REALTIME_BACKEND_NATIVE_ENABLED) is not enabled — Electric cannot serve NEW-resident runs; refusing to enable split." + ); + } +} + let cached: Promise<boolean> | undefined; export function isSplitEnabled(): Promise<boolean> { diff --git a/apps/webapp/test/runOpsSplitMode.test.ts b/apps/webapp/test/runOpsSplitMode.test.ts index 826dd37c09e..7ce2bec3a5d 100644 --- a/apps/webapp/test/runOpsSplitMode.test.ts +++ b/apps/webapp/test/runOpsSplitMode.test.ts @@ -1,7 +1,10 @@ import { describe, expect, it, vi } from "vitest"; // @testcontainers/postgresql resolves because it is declared in apps/webapp/package.json. 
import { PostgreSqlContainer } from "@testcontainers/postgresql"; -import { computeSplitEnabled } from "~/v3/runOpsMigration/splitMode.server"; +import { + computeSplitEnabled, + assertSplitRealtimeInterlock, +} from "~/v3/runOpsMigration/splitMode.server"; import { probeDistinctDatabases } from "~/v3/runOpsMigration/distinctDbSentinel.server"; describe("computeSplitEnabled (pure)", () => { @@ -58,6 +61,29 @@ describe("computeSplitEnabled (pure)", () => { }); }); +describe("assertSplitRealtimeInterlock (pure)", () => { + it("throws when split is on but the native realtime backend is off", () => { + expect(() => + assertSplitRealtimeInterlock({ splitEnabled: true, nativeRealtimeEnabled: false }) + ).toThrowError(/native realtime backend|REALTIME_BACKEND_NATIVE_ENABLED/i); + }); + + it("does not throw when split is on and the native realtime backend is on", () => { + expect(() => + assertSplitRealtimeInterlock({ splitEnabled: true, nativeRealtimeEnabled: true }) + ).not.toThrow(); + }); + + it("does not throw when split is off, regardless of the native realtime backend", () => { + expect(() => + assertSplitRealtimeInterlock({ splitEnabled: false, nativeRealtimeEnabled: false }) + ).not.toThrow(); + expect(() => + assertSplitRealtimeInterlock({ splitEnabled: false, nativeRealtimeEnabled: true }) + ).not.toThrow(); + }); +}); + describe("distinct-DB sentinel (real Postgres)", () => { it("reports NOT distinct when both URLs hit the same physical cluster", async () => { const pg = await new PostgreSqlContainer("docker.io/postgres:14").start(); From 4d1a69ee79479456564ac62847333ff9daf315f9 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 2 Jul 2026 20:10:40 +0100 Subject: [PATCH 07/14] fix(run-ops split): gate topology split on the opt-in flag and tighten diagnostics - gate runOpsTopology splitEnabled on RUN_OPS_SPLIT_ENABLED so provisioning both DSNs before flipping the flag cannot open a second pool or route writes ahead of the distinct-DB sentinel - rethrow the 
original UnclassifiableRunId in the cross-seam guard so its value/valueLength keep reflecting the real waitpoint id - log run-found-but-environment-unresolved distinctly from missing-run - correct the RUN_OPS_DATABASE_URL doc comment (Prisma datasource, not the webapp runtime pool) Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/webapp/app/db.server.ts | 3 ++- apps/webapp/app/env.server.ts | 3 ++- apps/webapp/app/v3/eventRepository/index.server.ts | 6 ++++++ .../v3/runOpsMigration/crossSeamGuard.server.ts | 14 +++----------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/apps/webapp/app/db.server.ts b/apps/webapp/app/db.server.ts index 79d86ac14ba..8fc033dbf03 100644 --- a/apps/webapp/app/db.server.ts +++ b/apps/webapp/app/db.server.ts @@ -238,7 +238,8 @@ export function selectRunOpsTopology( // singletons use (captureInfrastructureErrors(tagDatasource(role, raw))). const runOpsTopology: RunOpsTopology = singleton("runOpsTopology", () => { const newUrl = env.TASK_RUN_DATABASE_URL; - const splitEnabled = !!newUrl && !!env.TASK_RUN_LEGACY_DATABASE_URL; + // Gate on the opt-in flag too: the distinct-DB sentinel only runs when the flag is on. + const splitEnabled = env.RUN_OPS_SPLIT_ENABLED && !!newUrl && !!env.TASK_RUN_LEGACY_DATABASE_URL; return selectRunOpsTopology( { diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 130ca2c1494..906d3aa225b 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -132,7 +132,8 @@ const EnvironmentSchema = z // Explicit positive opt-in. Split behavior is unreachable unless this is true // AND the distinct-DB sentinel confirms the two URLs are physically distinct DBs. RUN_OPS_SPLIT_ENABLED: BoolEnv.default(false), - // Canonical URL for the dedicated run-ops DB. Takes precedence over TASK_RUN_DATABASE_URL. + // Datasource URL for the dedicated run-ops Prisma schema (migrations/generation). 
+ // The webapp runtime pool is driven by TASK_RUN_DATABASE_URL, not this var. RUN_OPS_DATABASE_URL: z .string() .refine(isValidDatabaseUrl, "RUN_OPS_DATABASE_URL is invalid") diff --git a/apps/webapp/app/v3/eventRepository/index.server.ts b/apps/webapp/app/v3/eventRepository/index.server.ts index d89d241bf22..f0687d2a7ce 100644 --- a/apps/webapp/app/v3/eventRepository/index.server.ts +++ b/apps/webapp/app/v3/eventRepository/index.server.ts @@ -287,6 +287,12 @@ async function findRunForEventCreation(runId: string) { ); if (!environment) { + // Run exists but its environment could not be resolved (e.g. a lagging replica + // under split); distinguish this from a genuinely missing run. + logger.warn("Run found but environment unresolved for event creation", { + runId, + runtimeEnvironmentId: foundRun.runtimeEnvironmentId, + }); return null; } diff --git a/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts b/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts index 0364115eab0..791a101e9e3 100644 --- a/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts @@ -1,4 +1,4 @@ -import { ownerEngine, UnclassifiableRunId } from "@trigger.dev/core/v3/isomorphic"; +import { ownerEngine } from "@trigger.dev/core/v3/isomorphic"; import { isSplitEnabled } from "./splitMode.server"; import type { CrossSeamGuardDecision, @@ -56,16 +56,8 @@ export function selectStoreForWaitpoint( const classify = deps?.classify ?? ownerEngine; - let residency: RunOpsResidency; - try { - residency = classify(input.waitpointId); - } catch (error) { - // Loud on ambiguity: rethrow with context, never catch-and-default. - if (error instanceof UnclassifiableRunId) { - throw new UnclassifiableRunId(`${input.waitpointId} (routeKind=${input.routeKind})`); - } - throw error; - } + // Loud on ambiguity: classify throws UnclassifiableRunId with the real id; never catch-and-default. 
+ const residency: RunOpsResidency = classify(input.waitpointId); const pinnedReason = applyPinningRules(input); const store: StoreTarget = pinnedReason ? "legacy" : storeForResidency(residency); From cac7c11bdb05322cfa94e7ad499decc3847493ce Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 2 Jul 2026 21:38:04 +0100 Subject: [PATCH 08/14] test(webapp): cap vitest fork concurrency to bound testcontainer memory on CI --- apps/webapp/vitest.config.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/webapp/vitest.config.ts b/apps/webapp/vitest.config.ts index 8e05aec1ebc..02d61356b89 100644 --- a/apps/webapp/vitest.config.ts +++ b/apps/webapp/vitest.config.ts @@ -23,6 +23,10 @@ export default defineConfig({ exclude: ["test/**/*.e2e.test.ts", "test/**/*.e2e.full.test.ts"], globals: true, pool: "forks", + // Each fork boots its own testcontainer set (including the two-DB run-ops fixtures), + // so cap concurrency to keep per-runner container memory in check. Sharding still + // provides cross-runner parallelism. + poolOptions: { forks: { maxForks: 4, minForks: 1 } }, setupFiles: ["./test/setup.ts"], // load apps/webapp/.env }, // @ts-ignore From 70e54d5fcf0f72ec1448d50b1306714c57bb1afd Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 2 Jul 2026 22:06:12 +0100 Subject: [PATCH 09/14] chore(webapp): drop vitest maxForks cap (broke typecheck; superseded by worker-mock fix) --- apps/webapp/vitest.config.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/apps/webapp/vitest.config.ts b/apps/webapp/vitest.config.ts index 02d61356b89..8e05aec1ebc 100644 --- a/apps/webapp/vitest.config.ts +++ b/apps/webapp/vitest.config.ts @@ -23,10 +23,6 @@ export default defineConfig({ exclude: ["test/**/*.e2e.test.ts", "test/**/*.e2e.full.test.ts"], globals: true, pool: "forks", - // Each fork boots its own testcontainer set (including the two-DB run-ops fixtures), - // so cap concurrency to keep per-runner container memory in check. 
Sharding still - // provides cross-runner parallelism. - poolOptions: { forks: { maxForks: 4, minForks: 1 } }, setupFiles: ["./test/setup.ts"], // load apps/webapp/.env }, // @ts-ignore From 009c4cf49086c358cc7f644d2864f1b40163e099 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Fri, 3 Jul 2026 08:49:28 +0100 Subject: [PATCH 10/14] chore: add server-changes for pr05 Co-Authored-By: Claude Opus 4.8 (1M context) --- .server-changes/run-ops-split-webapp-foundation.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .server-changes/run-ops-split-webapp-foundation.md diff --git a/.server-changes/run-ops-split-webapp-foundation.md b/.server-changes/run-ops-split-webapp-foundation.md new file mode 100644 index 00000000000..620b2fa7123 --- /dev/null +++ b/.server-changes/run-ops-split-webapp-foundation.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add the webapp foundation for the run-ops database split: topology/flag wiring, split-mode gating, a distinct-DB boot sentinel, and control-plane resolver read-through (all inert until `RUN_OPS_SPLIT_ENABLED`). From a01f2af69455efcca98dec4324c97ecb20c7137f Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Fri, 3 Jul 2026 09:26:47 +0100 Subject: [PATCH 11/14] fix(run-ops split): invalidate control-plane cache on writes; route auth-env through the cache-first resolver The ControlPlaneCache served env/org data with no invalidation, so admin/control-plane writes were only reflected after the TTL. 
Add two invalidation scopes to the cache (invalidateEnvironment for one env's slots; invalidateOrganization via a per-org epoch that env/authEnv values are stamped with, so all of an org's cached rows drop with no reverse index), expose them on the resolver, and call them at every write site that mutates cache-served data: pause/resume, archive, env/org concurrency + burst-factor, API-key regeneration, feature flags, API/batch rate limits, runs enable/disable, org + project delete, and stream-basin provisioning. Also extend the resolver's authenticated-env slot to carry `git` and make the run-engine adapter's resolveAuthenticatedEnv delegate to the cache-first, split-aware resolver instead of issuing its own $replica.findFirst, so it honors splitEnabled() and the cache like its siblings while still returning `git` and the deleted-project guard. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../run-ops-split-webapp-foundation.md | 2 +- .../backOffice/ApiRateLimitSection.server.ts | 3 + .../BatchRateLimitSection.server.ts | 3 + apps/webapp/app/models/api-key.server.ts | 4 + ...nvironments.$environmentId.burst-factor.ts | 3 + ...dmin.api.v1.environments.$environmentId.ts | 5 + ...api.v1.orgs.$organizationId.concurrency.ts | 4 + ...i.v1.orgs.$organizationId.feature-flags.ts | 4 + ...api.v1.orgs.$organizationId.runs.enable.ts | 5 + ...i.v2.orgs.$organizationId.feature-flags.ts | 4 + .../app/services/archiveBranch.server.ts | 4 + .../app/services/deleteOrganization.server.ts | 4 + .../app/services/deleteProject.server.ts | 6 + .../realtime/streamBasinProvisioner.server.ts | 7 + .../controlPlaneCache.server.test.ts | 98 +++++++++++- .../controlPlaneCache.server.ts | 88 +++++++++-- .../controlPlaneResolver.server.ts | 28 +++- .../runEngineControlPlaneResolver.server.ts | 21 +-- .../v3/services/allocateConcurrency.server.ts | 4 + ...billingLimitConvergeEnvironments.server.ts | 7 + .../v3/services/pauseEnvironment.server.ts | 6 + .../controlPlaneResolver.server.test.ts | 
140 ++++++++++++++++++ ...nEngineControlPlaneResolver.server.test.ts | 65 +++++++- 23 files changed, 477 insertions(+), 38 deletions(-) diff --git a/.server-changes/run-ops-split-webapp-foundation.md b/.server-changes/run-ops-split-webapp-foundation.md index 620b2fa7123..e70532c3e5e 100644 --- a/.server-changes/run-ops-split-webapp-foundation.md +++ b/.server-changes/run-ops-split-webapp-foundation.md @@ -3,4 +3,4 @@ area: webapp type: feature --- -Add the webapp foundation for the run-ops database split: topology/flag wiring, split-mode gating, a distinct-DB boot sentinel, and control-plane resolver read-through (all inert until `RUN_OPS_SPLIT_ENABLED`). +Add the webapp foundation for the run-ops database split: topology/flag wiring, split-mode gating, a distinct-DB boot sentinel, and control-plane resolver read-through (all inert until `RUN_OPS_SPLIT_ENABLED`). The control-plane cache is now invalidated at env/org write sites (pause/resume, archive, concurrency/burst-factor, API-key regen, feature flags, rate limits, runs enable/disable, org/project delete, stream-basin provisioning) so admin/control-plane changes are reflected immediately rather than after the cache TTL, and the run-engine authenticated-env resolution goes through the cache-first, split-aware resolver. 
diff --git a/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts index 7de475bda8e..7f137c61d0c 100644 --- a/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts +++ b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts @@ -2,6 +2,7 @@ import { prisma } from "~/db.server"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import { type Duration } from "~/services/rateLimiter.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { API_RATE_LIMIT_INTENT } from "./ApiRateLimitSection"; import { handleRateLimitAction, @@ -31,6 +32,8 @@ export const apiRateLimitDomain: RateLimitDomain = { where: { id: orgId }, data: { apiRateLimiterConfig: next as any }, }); + // apiRateLimiterConfig is embedded in every env of the org; drop all its cached env rows. + controlPlaneResolver.invalidateOrganization(orgId); logger.info("admin.backOffice.apiRateLimit", { adminUserId, orgId, diff --git a/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts index 3891e4fc40c..4614c5b2893 100644 --- a/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts +++ b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts @@ -2,6 +2,7 @@ import { prisma } from "~/db.server"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import { type Duration } from "~/services/rateLimiter.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { BATCH_RATE_LIMIT_INTENT } from "./BatchRateLimitSection"; import { handleRateLimitAction, @@ -31,6 +32,8 @@ export const batchRateLimitDomain: RateLimitDomain = { where: { id: orgId }, data: { batchRateLimitConfig: next 
as any }, }); + // batchRateLimitConfig is embedded in every env of the org; drop all its cached env rows. + controlPlaneResolver.invalidateOrganization(orgId); logger.info("admin.backOffice.batchRateLimit", { adminUserId, orgId, diff --git a/apps/webapp/app/models/api-key.server.ts b/apps/webapp/app/models/api-key.server.ts index b5f2bd0f7d9..19947417229 100644 --- a/apps/webapp/app/models/api-key.server.ts +++ b/apps/webapp/app/models/api-key.server.ts @@ -2,6 +2,7 @@ import type { RuntimeEnvironment } from "@trigger.dev/database"; import { prisma } from "~/db.server"; import { customAlphabet } from "nanoid"; import { RuntimeEnvironmentType } from "~/database-types"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; const apiKeyId = customAlphabet( "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", @@ -87,6 +88,9 @@ export async function regenerateApiKey({ userId, environmentId }: RegenerateAPIK }); }); + // The env's apiKey changed in the control-plane; drop any cached copy. 
+ controlPlaneResolver.invalidateEnvironment(environmentId); + return updatedEnviroment; } diff --git a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.burst-factor.ts b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.burst-factor.ts index fa197fc1694..44c7c0243c6 100644 --- a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.burst-factor.ts +++ b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.burst-factor.ts @@ -2,6 +2,7 @@ import { type ActionFunctionArgs, json } from "@remix-run/server-runtime"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server"; const ParamsSchema = z.object({ @@ -26,5 +27,7 @@ export async function action({ request, params }: ActionFunctionArgs) { await updateEnvConcurrencyLimits(environment); + controlPlaneResolver.invalidateEnvironment(environmentId); + return json({ success: true }); } diff --git a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts index ae8deb32dfa..908e8f449a0 100644 --- a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts +++ b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts @@ -3,6 +3,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { engine } from "~/v3/runEngine.server"; import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server"; @@ -45,6 +46,10 @@ export async function action({ request, params }: ActionFunctionArgs) { await 
updateEnvConcurrencyLimits(environment); + // Org max-concurrency changed too, which is embedded in every env of the org; invalidating + // the org drops the env/authEnv rows for all of them (including this env). + controlPlaneResolver.invalidateOrganization(environment.organizationId); + return json({ success: true }); } diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts index 97c5e74583f..c99637a0d10 100644 --- a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts +++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts @@ -3,6 +3,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server"; const ParamsSchema = z.object({ @@ -83,5 +84,8 @@ export async function action({ request, params }: ActionFunctionArgs) { await updateEnvConcurrencyLimits({ ...modifiedEnvironment, organization }); } + // Org + every affected env's concurrency changed; one org invalidation covers them all. 
+ controlPlaneResolver.invalidateOrganization(organizationId); + return json({ success: true }); } diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts index 779847d250f..e5fd7f7963b 100644 --- a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts +++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts @@ -3,6 +3,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { validatePartialFeatureFlags } from "~/v3/featureFlags"; const ParamsSchema = z.object({ @@ -101,6 +102,9 @@ export async function action({ request, params }: ActionFunctionArgs) { }, }); + // Org feature flags are embedded in every env of the org; drop all its cached env rows. + controlPlaneResolver.invalidateOrganization(organizationId); + const updatedFlagsResult = updatedOrganization.featureFlags ? 
validatePartialFeatureFlags(updatedOrganization.featureFlags as Record) : { success: false as const }; diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.runs.enable.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.runs.enable.ts index cb888b5b094..872900c9e2d 100644 --- a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.runs.enable.ts +++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.runs.enable.ts @@ -3,6 +3,7 @@ import { EnvironmentPauseSource } from "@trigger.dev/database"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { PauseEnvironmentService } from "~/v3/services/pauseEnvironment.server"; const ParamsSchema = z.object({ @@ -43,6 +44,10 @@ export async function action({ request, params }: ActionFunctionArgs) { return json({ error: "Organization not found" }, { status: 404 }); } + // `runsEnabled` is embedded in every env of the org; drop all its cached env rows. The + // per-env pause writes below invalidate their own envs via PauseEnvironmentService. 
+ controlPlaneResolver.invalidateOrganization(organizationId); + const environments = await prisma.runtimeEnvironment.findMany({ where: { organizationId, diff --git a/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts b/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts index ea0dd757c25..3c62d9c7a5f 100644 --- a/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts +++ b/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts @@ -4,6 +4,7 @@ import { Prisma } from "@trigger.dev/database"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireUser } from "~/services/session.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { flags as getGlobalFlags } from "~/v3/featureFlags.server"; import { FEATURE_FLAG, @@ -132,5 +133,8 @@ export async function action({ request, params }: ActionFunctionArgs) { throw e; } + // Org feature flags are embedded in every env of the org; drop all its cached env rows. + controlPlaneResolver.invalidateOrganization(organizationId); + return json({ success: true }); } diff --git a/apps/webapp/app/services/archiveBranch.server.ts b/apps/webapp/app/services/archiveBranch.server.ts index e6dc3d3325a..3372ac87229 100644 --- a/apps/webapp/app/services/archiveBranch.server.ts +++ b/apps/webapp/app/services/archiveBranch.server.ts @@ -2,6 +2,7 @@ import { type PrismaClient } from "@trigger.dev/database"; import { prisma } from "~/db.server"; import { logger } from "./logger.server"; import { nanoid } from "nanoid"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; export class ArchiveBranchService { #prismaClient: PrismaClient; @@ -88,6 +89,9 @@ export class ArchiveBranchService { data: { archivedAt: new Date(), slug, shortcode }, }); + // archivedAt/slug/shortcode changed in the control-plane; drop any cached copy. 
+ controlPlaneResolver.invalidateEnvironment(environmentId); + return { success: true as const, branch: updatedBranch, diff --git a/apps/webapp/app/services/deleteOrganization.server.ts b/apps/webapp/app/services/deleteOrganization.server.ts index 6c490b276df..9f8eb1cd37a 100644 --- a/apps/webapp/app/services/deleteOrganization.server.ts +++ b/apps/webapp/app/services/deleteOrganization.server.ts @@ -4,6 +4,7 @@ import { prisma } from "~/db.server"; import { featuresForRequest } from "~/features.server"; import { DeleteProjectService } from "./deleteProject.server"; import { getCurrentPlan } from "./platform.v3.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; export class DeleteOrganizationService { #prismaClient: PrismaClient; @@ -82,5 +83,8 @@ export class DeleteOrganizationService { deletedAt: new Date(), }, }); + + // runsEnabled + the org's projects (project.deletedAt) changed; drop all cached env rows. + controlPlaneResolver.invalidateOrganization(organization.id); } } diff --git a/apps/webapp/app/services/deleteProject.server.ts b/apps/webapp/app/services/deleteProject.server.ts index f6bc23d56a6..bbce896a57f 100644 --- a/apps/webapp/app/services/deleteProject.server.ts +++ b/apps/webapp/app/services/deleteProject.server.ts @@ -2,6 +2,7 @@ import type { PrismaClient } from "@trigger.dev/database"; import { prisma } from "~/db.server"; import { marqs } from "~/v3/marqs/index.server"; import { engine } from "~/v3/runEngine.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; type Options = ({ projectId: string } | { projectSlug: string }) & { userId: string; @@ -60,6 +61,11 @@ export class DeleteProjectService { deletedAt: new Date(), }, }); + + // project.deletedAt (which gates env resolution) changed; drop every cached env of this project. 
+ for (const environment of project.environments) { + controlPlaneResolver.invalidateEnvironment(environment.id); + } } async #getProjectId(options: Options) { diff --git a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts index 3cb30ba6a27..98316b84fe8 100644 --- a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts @@ -11,6 +11,7 @@ import type { PrismaClientOrTransaction } from "~/db.server"; import { prisma } from "~/db.server"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { parseDuration } from "./duration.server"; export function isPerOrgBasinsEnabled(): boolean { @@ -76,6 +77,9 @@ export async function provisionBasinForOrg( data: { streamBasinName: basin }, }); + // streamBasinName is embedded in every env of the org; drop all its cached env rows. + controlPlaneResolver.invalidateOrganization(org.id); + logger.info("[streamBasinProvisioner] provisioned basin for org", { orgId: org.id, basin, @@ -158,6 +162,9 @@ export async function deprovisionBasinForOrg( data: { streamBasinName: null }, }); + // streamBasinName is embedded in every env of the org; drop all its cached env rows. 
+ controlPlaneResolver.invalidateOrganization(org.id); + logger.info("[streamBasinProvisioner] deprovisioned basin for org", { orgId, previousBasin: org.streamBasinName, diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts index f83d534c904..9c90e3067e1 100644 --- a/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts @@ -10,9 +10,13 @@ import { // Minimal, structurally-irrelevant stand-ins: the cache stores and returns opaque values by // reference, so these only need to be distinguishable objects — the slot types are exercised for // key routing, not field shape. -const anEnv = { id: "env_1" } as unknown as ResolvedEnv; +const anEnv = { id: "env_1", organizationId: "org_1" } as unknown as ResolvedEnv; const aVersion = { worker: { id: "bw_1" } } as unknown as ResolvedWorkerVersion; -const anAuthEnv = { id: "env_1", slug: "prod" } as unknown as ResolvedAuthenticatedEnv; +const anAuthEnv = { + id: "env_1", + slug: "prod", + organizationId: "org_1", +} as unknown as ResolvedAuthenticatedEnv; const aLockedWorker = { lockedBy: null, lockedToVersion: null } as ResolvedRunLockedWorker; describe("ControlPlaneCache", () => { @@ -140,4 +144,94 @@ describe("ControlPlaneCache", () => { cache.setEnv("env_ttl", anEnv); expect(cache.getEnv("env_ttl")).toBeUndefined(); }); + + it("invalidateEnvironment forces the next env/authEnv/envExists read to miss", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_6", anEnv); + cache.setAuthEnv("env_6", anAuthEnv); + cache.setEnvExists("env_6", true); + expect(cache.getEnv("env_6")).toBe(anEnv); + expect(cache.getAuthEnv("env_6")).toBe(anAuthEnv); + expect(cache.getEnvExists("env_6")).toBe(true); + + cache.invalidateEnvironment("env_6"); + + expect(cache.getEnv("env_6")).toBeUndefined(); + 
expect(cache.getAuthEnv("env_6")).toBeUndefined(); + expect(cache.getEnvExists("env_6")).toBeUndefined(); + }); + + it("invalidateEnvironment is scoped to its own id", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const keepEnv = { id: "env_keep", organizationId: "org_1" } as unknown as ResolvedEnv; + + cache.setEnv("env_drop", anEnv); + cache.setEnv("env_keep", keepEnv); + cache.invalidateEnvironment("env_drop"); + + expect(cache.getEnv("env_drop")).toBeUndefined(); + expect(cache.getEnv("env_keep")).toBe(keepEnv); + }); + + it("invalidateOrganization drops env/authEnv rows for that org across every env id", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const envA = { id: "env_a", organizationId: "org_1" } as unknown as ResolvedEnv; + const envB = { id: "env_b", organizationId: "org_1" } as unknown as ResolvedEnv; + const authA = { + id: "env_a", + slug: "a", + organizationId: "org_1", + } as unknown as ResolvedAuthenticatedEnv; + + cache.setEnv("env_a", envA); + cache.setEnv("env_b", envB); + cache.setAuthEnv("env_a", authA); + expect(cache.getEnv("env_a")).toBe(envA); + expect(cache.getEnv("env_b")).toBe(envB); + expect(cache.getAuthEnv("env_a")).toBe(authA); + + cache.invalidateOrganization("org_1"); + + // Every env/authEnv row for org_1 misses — no reverse org->env index required. 
+ expect(cache.getEnv("env_a")).toBeUndefined(); + expect(cache.getEnv("env_b")).toBeUndefined(); + expect(cache.getAuthEnv("env_a")).toBeUndefined(); + }); + + it("invalidateOrganization does not affect a different org's cached envs", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const otherOrgEnv = { id: "env_other", organizationId: "org_2" } as unknown as ResolvedEnv; + + cache.setEnv("env_1", anEnv); // org_1 + cache.setEnv("env_other", otherOrgEnv); // org_2 + + cache.invalidateOrganization("org_1"); + + expect(cache.getEnv("env_1")).toBeUndefined(); + expect(cache.getEnv("env_other")).toBe(otherOrgEnv); + }); + + it("re-setting an env after an org invalidation makes it readable again", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_1", anEnv); + cache.invalidateOrganization("org_1"); + expect(cache.getEnv("env_1")).toBeUndefined(); + + // A write after the bump stamps the new org epoch, so it reads back. + cache.setEnv("env_1", anEnv); + expect(cache.getEnv("env_1")).toBe(anEnv); + }); + + it("a cached null env survives an org invalidation (a confirmed absence carries no org)", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_absent", null); + expect(cache.getEnv("env_absent")).toBeNull(); + + cache.invalidateOrganization("org_1"); + + expect(cache.getEnv("env_absent")).toBeNull(); + }); }); diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts index cc1731fd547..01fd205030a 100644 --- a/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts @@ -23,6 +23,11 @@ import type { AuthenticatedEnvironment } from "@trigger.dev/core/v3/auth/environ * stamped epoch still matches the current epoch, otherwise it is treated as a miss. 
* `invalidate*` bumps the key's epoch, forcing the next read to miss. (If a future * rebase gives `BoundedTtlCache` a public `delete`, prefer it and drop the epoch map.) + * + * Two invalidation scopes: `invalidateEnvironment(id)` bumps every env-keyed slot for one + * env; `invalidateOrganization(orgId)` bumps a per-org epoch that env/authEnv values are + * also stamped with at write time (no reverse org->env index needed), so all of that org's + * cached env/authEnv rows miss on the next read. */ export const DEFAULT_CP_CACHE_TTL_MS = 30_000; @@ -52,9 +57,11 @@ export type ResolvedWorkerVersion = { deployment: WorkerDeployment | null; }; -// The canonical authenticated-environment shape (slug/type/project/organization/orgMember/…). -// Re-aliased from the engine type so the cache slot cannot drift from `toAuthenticated()`'s output. -export type ResolvedAuthenticatedEnv = AuthenticatedEnvironment; +// The canonical authenticated-environment shape (slug/type/project/organization/orgMember/…) +// PLUS the `git` JSON column the run-engine runAttemptSystem reads. `AuthenticatedEnvironment` +// does not carry `git`, so the intersection adds it; this matches the run-engine +// `ResolvedAuthenticatedEnv` so the engine adapter can delegate to this cached slot. +export type ResolvedAuthenticatedEnv = AuthenticatedEnvironment & { git: Prisma.JsonValue | null }; /** * The slim `lockedBy` (BackgroundWorkerTask) + `lockedToVersion` (BackgroundWorker, with its @@ -94,7 +101,9 @@ export type ResolvedRunLockedWorker = { } | null; }; -type Stamped = { value: V; epoch: number }; +// `orgEpoch` is stamped only on slots that embed org config (env/authEnv); undefined slots +// are exempt from the org-epoch check. 
+type Stamped = { value: V; epoch: number; orgEpoch?: number }; export class ControlPlaneCache { readonly #env: BoundedTtlCache>; @@ -103,8 +112,9 @@ export class ControlPlaneCache { readonly #authEnv: BoundedTtlCache>; readonly #lockedWorker: BoundedTtlCache>; - // Explicit invalidation: bumping a key's epoch forces the next read to miss. + // Explicit invalidation: bumping a key's (or org's) epoch forces the next read to miss. readonly #epochs = new Map(); + readonly #orgEpochs = new Map(); constructor(opts?: { ttlMs?: number; maxEntries?: number }) { const ttl = opts?.ttlMs ?? DEFAULT_CP_CACHE_TTL_MS; @@ -120,16 +130,27 @@ export class ControlPlaneCache { return this.#epochs.get(key) ?? 0; } - #read(cache: BoundedTtlCache>, key: string): V | undefined { + #orgEpoch(orgId: string): number { + return this.#orgEpochs.get(orgId) ?? 0; + } + + #read(cache: BoundedTtlCache>, key: string, orgId?: string): V | undefined { const entry = cache.get(key); if (entry === undefined || entry.epoch !== this.#epoch(key)) { return undefined; } + if (orgId !== undefined && entry.orgEpoch !== this.#orgEpoch(orgId)) { + return undefined; + } return entry.value; } - #write(cache: BoundedTtlCache>, key: string, value: V): void { - cache.set(key, { value, epoch: this.#epoch(key) }); + #write(cache: BoundedTtlCache>, key: string, value: V, orgId?: string): void { + cache.set(key, { + value, + epoch: this.#epoch(key), + orgEpoch: orgId !== undefined ? this.#orgEpoch(orgId) : undefined, + }); } #bump(key: string): void { @@ -137,10 +158,23 @@ export class ControlPlaneCache { } getEnv(id: string): (ResolvedEnv | null) | undefined { - return this.#read(this.#env, `env:${id}`); + const entry = this.#env.get(`env:${id}`); + if (entry === undefined || entry.epoch !== this.#epoch(`env:${id}`)) { + return undefined; + } + // A cached null (or an entry written without an org) carries no org, so it can never be + // stale against an org write. 
+ if ( + entry.value !== null && + entry.value.organizationId && + entry.orgEpoch !== this.#orgEpoch(entry.value.organizationId) + ) { + return undefined; + } + return entry.value; } setEnv(id: string, value: ResolvedEnv | null): void { - this.#write(this.#env, `env:${id}`, value); + this.#write(this.#env, `env:${id}`, value, value?.organizationId); } invalidateEnv(id: string): void { this.#bump(`env:${id}`); @@ -164,10 +198,40 @@ export class ControlPlaneCache { // full authenticated environment (toAuthenticated shape) getAuthEnv(id: string): (ResolvedAuthenticatedEnv | null) | undefined { - return this.#read(this.#authEnv, `authEnv:${id}`); + const entry = this.#authEnv.get(`authEnv:${id}`); + if (entry === undefined || entry.epoch !== this.#epoch(`authEnv:${id}`)) { + return undefined; + } + if ( + entry.value !== null && + entry.value.organizationId && + entry.orgEpoch !== this.#orgEpoch(entry.value.organizationId) + ) { + return undefined; + } + return entry.value; } setAuthEnv(id: string, value: ResolvedAuthenticatedEnv | null): void { - this.#write(this.#authEnv, `authEnv:${id}`, value); + this.#write(this.#authEnv, `authEnv:${id}`, value, value?.organizationId); + } + + /** + * Invalidate every env-keyed slot for a single environment. Call this from a control-plane + * write that mutates one env's config (pause/resume, archive, concurrency/burst-factor). + */ + invalidateEnvironment(id: string): void { + this.#bump(`env:${id}`); + this.#bump(`authEnv:${id}`); + this.#bump(`envExists:${id}`); + } + + /** + * Invalidate every cached env/authEnv row belonging to an organization. Call this from a + * control-plane write that mutates org-level config (feature flags, org concurrency, runs + * enable/disable, rate limits) — it affects the org object embedded in each of the org's envs. 
+ */ + invalidateOrganization(orgId: string): void { + this.#orgEpochs.set(orgId, this.#orgEpoch(orgId) + 1); } // run-locked worker (lockedBy + lockedToVersion); key = `${lockedById ?? "_"}:${lockedToVersionId ?? "_"}` diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts index 57595e6214b..8e2779aa8b6 100644 --- a/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts @@ -7,11 +7,11 @@ import { ControlPlaneCache, DEFAULT_CP_CACHE_MAX_ENTRIES, DEFAULT_CP_CACHE_TTL_MS, + type ResolvedAuthenticatedEnv, type ResolvedEnv, type ResolvedWorkerVersion, } from "./controlPlaneCache.server"; import { authIncludeWithParent, toAuthenticated } from "~/models/runtimeEnvironment.server"; -import type { AuthenticatedEnvironment } from "@trigger.dev/core/v3/auth/environment"; import type { ResolvedRunLockedWorker } from "./controlPlaneCache.server"; /** @@ -36,7 +36,7 @@ import type { ResolvedRunLockedWorker } from "./controlPlaneCache.server"; */ export { ResolvedEnv, ResolvedWorkerVersion }; -export type { ResolvedRunLockedWorker }; +export type { ResolvedAuthenticatedEnv, ResolvedRunLockedWorker }; /** Thrown by `assertEnvExists` when a referenced control-plane env does not exist. 
*/ export class ControlPlaneReferenceError extends Error { @@ -126,7 +126,7 @@ export class ControlPlaneResolver { }; } - async resolveAuthenticatedEnv(environmentId: string): Promise { + async resolveAuthenticatedEnv(environmentId: string): Promise { if (!this.splitEnabled()) { return this.#queryAuthenticatedEnv(this.controlPlanePrimary, environmentId); } @@ -144,7 +144,7 @@ export class ControlPlaneResolver { async #queryAuthenticatedEnv( client: CpClient, environmentId: string - ): Promise { + ): Promise { const env = await client.runtimeEnvironment.findFirst({ where: { id: environmentId }, include: authIncludeWithParent, @@ -154,7 +154,9 @@ export class ControlPlaneResolver { return null; } - return toAuthenticated(env); + // `authIncludeWithParent` returns all RuntimeEnvironment scalars on the row (including + // `git`), so we map the auth shape via toAuthenticated() and add `git` from the same row. + return { ...toAuthenticated(env), git: env.git }; } async resolveRunLockedWorker(args: { @@ -424,6 +426,22 @@ export class ControlPlaneResolver { }); return env !== null; } + + /** + * Drop cached control-plane rows for one environment after a control-plane write to that + * env's config. A no-op when split is OFF (nothing is cached), so it is always safe to call. + */ + invalidateEnvironment(environmentId: string): void { + this.cache.invalidateEnvironment(environmentId); + } + + /** + * Drop cached env/authEnv rows for every environment of an organization after a + * control-plane write to that org's config. Safe under split OFF (no cache). + */ + invalidateOrganization(organizationId: string): void { + this.cache.invalidateOrganization(organizationId); + } } // Module-level singleton: wires the real control-plane clients + env split predicate. 
diff --git a/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts index c505e3d1081..8fbfb66b17d 100644 --- a/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts @@ -5,8 +5,6 @@ import type { ResolvedWorkerVersion, } from "@internal/run-engine"; import type { RuntimeEnvironmentType } from "@trigger.dev/database"; -import { $replica } from "~/db.server"; -import { authIncludeWithParent, toAuthenticated } from "~/models/runtimeEnvironment.server"; import { ControlPlaneResolver as AppControlPlaneResolver, controlPlaneResolver, @@ -62,25 +60,16 @@ export class RunEngineControlPlaneResolver implements EngineControlPlaneResolver } async resolveAuthenticatedEnv(environmentId: string): Promise { - // Mirror findEnvironmentById's data source ($replica) and auth shape, but the - // engine needs `git` too. A single findFirst with `include: authIncludeWithParent` - // returns all RuntimeEnvironment scalars (including `git`) on the row, so we map - // the auth shape via toAuthenticated() and add `git` from the same row. - const environment = await $replica.runtimeEnvironment.findFirst({ - where: { - id: environmentId, - }, - include: authIncludeWithParent, - }); + // Delegate to the cache-first, split-aware app resolver (like resolveEnv/resolveWorkerVersion): + // its authenticated-env slot now carries `git`. Keep the deleted-project guard the engine relies + // on — a deleted project's env must not resolve. 
+ const environment = await this.#resolver.resolveAuthenticatedEnv(environmentId); if (!environment || environment.project.deletedAt !== null) { return null; } - return { - ...toAuthenticated(environment), - git: environment.git, - }; + return environment; } async assertEnvExists(environmentId: string): Promise { diff --git a/apps/webapp/app/v3/services/allocateConcurrency.server.ts b/apps/webapp/app/v3/services/allocateConcurrency.server.ts index 83fb69d0623..b9a28daba60 100644 --- a/apps/webapp/app/v3/services/allocateConcurrency.server.ts +++ b/apps/webapp/app/v3/services/allocateConcurrency.server.ts @@ -2,6 +2,7 @@ import { tryCatch } from "@trigger.dev/core"; import { ManageConcurrencyPresenter } from "~/presenters/v3/ManageConcurrencyPresenter.server"; import { BaseService } from "./baseService.server"; import { updateEnvConcurrencyLimits } from "../runQueue.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; type Input = { userId: string; @@ -88,6 +89,9 @@ export class AllocateConcurrencyService extends BaseService { if (!updatedEnvironment.paused) { await updateEnvConcurrencyLimits(updatedEnvironment); } + + // maximumConcurrencyLimit changed in the control-plane; drop any cached copy. 
+ controlPlaneResolver.invalidateEnvironment(environment.id); } return { diff --git a/apps/webapp/app/v3/services/billingLimit/billingLimitConvergeEnvironments.server.ts b/apps/webapp/app/v3/services/billingLimit/billingLimitConvergeEnvironments.server.ts index 61238cd8a1c..0b59d4c7fae 100644 --- a/apps/webapp/app/v3/services/billingLimit/billingLimitConvergeEnvironments.server.ts +++ b/apps/webapp/app/v3/services/billingLimit/billingLimitConvergeEnvironments.server.ts @@ -7,6 +7,7 @@ import { } from "@trigger.dev/database"; import { prisma } from "~/db.server"; import { logger } from "~/services/logger.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { BILLABLE_ENVIRONMENT_TYPES, BILLING_LIMIT_CONVERGE_BATCH_SIZE, @@ -177,6 +178,9 @@ async function pauseEnvironmentForBillingLimit( data: { paused: false, pauseSource: null }, }); throw error; + } finally { + // The env's paused state changed (or was rolled back); drop any cached copy either way. + controlPlaneResolver.invalidateEnvironment(environment.id); } } @@ -208,5 +212,8 @@ async function resumeEnvironmentFromBillingLimit( }, }); throw error; + } finally { + // The env's paused state changed (or was rolled back); drop any cached copy either way. 
+ controlPlaneResolver.invalidateEnvironment(environment.id); } } diff --git a/apps/webapp/app/v3/services/pauseEnvironment.server.ts b/apps/webapp/app/v3/services/pauseEnvironment.server.ts index 4cafbac1405..af9edff856c 100644 --- a/apps/webapp/app/v3/services/pauseEnvironment.server.ts +++ b/apps/webapp/app/v3/services/pauseEnvironment.server.ts @@ -5,6 +5,7 @@ import { getManualPauseEnvironmentResult } from "~/v3/services/billingLimit/manu import { updateEnvConcurrencyLimits } from "../runQueue.server"; import { WithRunEngine } from "./baseService.server"; import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; export type PauseStatus = "paused" | "resumed"; @@ -127,9 +128,14 @@ export class PauseEnvironmentService extends WithRunEngine { pauseSource: previousPauseState?.pauseSource ?? null, }, }); + // Rollback still wrote the env row; drop any cached copy before rethrowing. + controlPlaneResolver.invalidateEnvironment(environment.id); throw error; } + // The env's `paused` state changed in the control-plane; drop any cached copy. 
+ controlPlaneResolver.invalidateEnvironment(environment.id); + return { success: true, state: action, diff --git a/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts b/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts index 207fb5201c2..a930ab0c328 100644 --- a/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts +++ b/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts @@ -468,6 +468,146 @@ heteroPostgresTest( } ); +heteroPostgresTest( + "resolveAuthenticatedEnv carries the `git` column (cached across calls)", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const gitMeta = { commitSha: "abc123", branchName: "main" }; + await prisma14.runtimeEnvironment.update({ + where: { id: environment.id }, + data: { git: gitMeta }, + }); + + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + const first = await resolver.resolveAuthenticatedEnv(environment.id); + expect(first).not.toBeNull(); + expect(first!.git).toEqual(gitMeta); + expect(reads()).toBe(1); + + // Served from cache, still carrying `git`. 
+ const second = await resolver.resolveAuthenticatedEnv(environment.id); + expect(second!.git).toEqual(gitMeta); + expect(reads()).toBe(1); + } +); + +// --- invalidation over the DB boundary ------------------------------------- + +heteroPostgresTest( + "invalidateEnvironment forces resolveEnv/resolveAuthenticatedEnv to re-read after a write", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: prisma14, + controlPlanePrimary: prisma14, + cache, + splitEnabled: () => true, + }); + + // Warm both env-scoped slots. + expect((await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit).not.toBe(999); + expect((await resolver.resolveAuthenticatedEnv(environment.id))!.paused).toBe(false); + + // Control-plane write + invalidation (as a write site would do). + await prisma14.runtimeEnvironment.update({ + where: { id: environment.id }, + data: { maximumConcurrencyLimit: 999, paused: true }, + }); + resolver.invalidateEnvironment(environment.id); + + expect((await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit).toBe(999); + expect((await resolver.resolveAuthenticatedEnv(environment.id))!.paused).toBe(true); + } +); + +heteroPostgresTest( + "without invalidation a cached env stays stale after a control-plane write (fail-before contrast)", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: prisma14, + controlPlanePrimary: prisma14, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + const before = (await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit; + await prisma14.runtimeEnvironment.update({ + where: { id: environment.id }, + data: { maximumConcurrencyLimit: 777 }, + }); + + // No invalidation: the 
cache still serves the pre-write value (this is the bug the + // write-site invalidation fixes). + expect((await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit).toBe(before); + + // And with invalidation it re-reads. + resolver.invalidateEnvironment(environment.id); + expect((await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit).toBe(777); + } +); + +heteroPostgresTest( + "invalidateOrganization forces every env of the org to re-read after an org write", + async ({ prisma14 }) => { + const { org: organization, project } = await seedControlPlane(prisma14); + // A second env in the same org. + const m = seedCounter++; + const secondEnv = await prisma14.runtimeEnvironment.create({ + data: { + type: "STAGING", + slug: `env-second-${m}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_stg_${m}`, + pkApiKey: `pk_stg_${m}`, + shortcode: `short_stg_${m}`, + }, + }); + const firstEnv = await prisma14.runtimeEnvironment.findFirstOrThrow({ + where: { projectId: project.id, type: "PRODUCTION" }, + }); + + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: prisma14, + controlPlanePrimary: prisma14, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + // Warm both envs' authEnv slots. + expect((await resolver.resolveAuthenticatedEnv(firstEnv.id))!.organization.runsEnabled).toBe( + true + ); + expect((await resolver.resolveAuthenticatedEnv(secondEnv.id))!.organization.runsEnabled).toBe( + true + ); + + // Org-level write (runsEnabled) + a single org invalidation. + await prisma14.organization.update({ + where: { id: organization.id }, + data: { runsEnabled: false }, + }); + resolver.invalidateOrganization(organization.id); + + // BOTH envs re-read and now observe the org change, with no reverse org->env index. 
+ expect((await resolver.resolveAuthenticatedEnv(firstEnv.id))!.organization.runsEnabled).toBe( + false + ); + expect((await resolver.resolveAuthenticatedEnv(secondEnv.id))!.organization.runsEnabled).toBe( + false + ); + } +); + // --- resolveRunLockedWorker ------------------------------------------------- heteroPostgresTest( diff --git a/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts index 0c0d2b80fa7..e5c93418115 100644 --- a/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts +++ b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts @@ -14,15 +14,32 @@ vi.setConfig({ testTimeout: 60_000 }); let n = 0; -function buildAppResolver(controlPlane: PrismaClient) { +function buildAppResolver(controlPlane: PrismaClient, opts?: { splitEnabled?: boolean }) { return new ControlPlaneResolver({ controlPlanePrimary: controlPlane, controlPlaneReplica: controlPlane, cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), - splitEnabled: () => false, + splitEnabled: () => opts?.splitEnabled ?? false, }); } +/** + * Wraps a real testcontainer PrismaClient with a `$extends` query hook counting DB operations. + * Not a mock — the real query still runs; we only observe the boundary to prove cache hits. 
+ */ +function countQueries(client: PrismaClient): { client: PrismaClient; reads: () => number } { + let count = 0; + const extended = client.$extends({ + query: { + async $allOperations({ args, query }) { + count++; + return query(args); + }, + }, + }) as unknown as PrismaClient; + return { client: extended, reads: () => count }; +} + async function seedEnv(prisma: PrismaClient, type: "PRODUCTION" | "DEVELOPMENT") { const suffix = `re-${n++}`; const organization = await prisma.organization.create({ @@ -197,4 +214,48 @@ describe("RunEngineControlPlaneResolver adapter", () => { await expect(adapter.assertEnvExists("env_missing")).rejects.toThrow(); } ); + + heteroPostgresTest( + "resolveAuthenticatedEnv delegates to the app resolver, returns `git`, and is cached", + async ({ prisma14 }) => { + const { environment } = await seedEnv(prisma14, "PRODUCTION"); + const gitMeta = { commitSha: "deadbeef", branchName: "main" }; + await prisma14.runtimeEnvironment.update({ + where: { id: environment.id }, + data: { git: gitMeta }, + }); + + // split ON so the delegated app resolver caches; the counter proves the second call + // is a cache hit rather than re-querying $replica directly (the pre-fix behavior). 
+ const { client: counting, reads } = countQueries(prisma14); + const adapter = new RunEngineControlPlaneResolver( + buildAppResolver(counting, { splitEnabled: true }) + ); + + const first = await adapter.resolveAuthenticatedEnv(environment.id); + expect(first).not.toBeNull(); + expect(first!.id).toBe(environment.id); + expect(first!.git).toEqual(gitMeta); + expect(reads()).toBe(1); + + const second = await adapter.resolveAuthenticatedEnv(environment.id); + expect(second!.git).toEqual(gitMeta); + expect(reads()).toBe(1); + } + ); + + heteroPostgresTest( + "resolveAuthenticatedEnv returns null for a deleted project", + async ({ prisma14 }) => { + const { environment, project } = await seedEnv(prisma14, "PRODUCTION"); + await prisma14.project.update({ + where: { id: project.id }, + data: { deletedAt: new Date() }, + }); + + const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14)); + + expect(await adapter.resolveAuthenticatedEnv(environment.id)).toBeNull(); + } + ); }); From bd8cd5ee4e07aacbf69ba67dc04792141c092a13 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Fri, 3 Jul 2026 09:47:42 +0100 Subject: [PATCH 12/14] chore(server-changes): consolidate pr05 run-ops split entries into one --- .server-changes/run-ops-split-realtime-interlock.md | 12 ------------ .server-changes/run-ops-split-webapp-foundation.md | 3 +-- 2 files changed, 1 insertion(+), 14 deletions(-) delete mode 100644 .server-changes/run-ops-split-realtime-interlock.md diff --git a/.server-changes/run-ops-split-realtime-interlock.md b/.server-changes/run-ops-split-realtime-interlock.md deleted file mode 100644 index f05bdf3744f..00000000000 --- a/.server-changes/run-ops-split-realtime-interlock.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -area: webapp -type: fix ---- - -Add a boot-time interlock that refuses to enable the run-ops DB split -(`RUN_OPS_SPLIT_ENABLED`) unless the native realtime backend -(`REALTIME_BACKEND_NATIVE_ENABLED`) is also on. 
Electric replicates only from the -control-plane database, so enabling the split without the native backend would -leave NEW-resident (ksuid) runs invisible to realtime and hang every -subscription. The check runs synchronously on the same eager-boot path as the -existing distinct-DB sentinel and fails fast before any run-ops routing is wired. diff --git a/.server-changes/run-ops-split-webapp-foundation.md b/.server-changes/run-ops-split-webapp-foundation.md index e70532c3e5e..d1d8111b68e 100644 --- a/.server-changes/run-ops-split-webapp-foundation.md +++ b/.server-changes/run-ops-split-webapp-foundation.md @@ -2,5 +2,4 @@ area: webapp type: feature --- - -Add the webapp foundation for the run-ops database split: topology/flag wiring, split-mode gating, a distinct-DB boot sentinel, and control-plane resolver read-through (all inert until `RUN_OPS_SPLIT_ENABLED`). The control-plane cache is now invalidated at env/org write sites (pause/resume, archive, concurrency/burst-factor, API-key regen, feature flags, rate limits, runs enable/disable, org/project delete, stream-basin provisioning) so admin/control-plane changes are reflected immediately rather than after the cache TTL, and the run-engine authenticated-env resolution goes through the cache-first, split-aware resolver. +Add the run-ops database split webapp foundation — DB topology/flag wiring, split-mode gating, distinct-DB and native-realtime boot interlocks, and a cache-first control-plane resolver with cache invalidation on env/org writes — all inert until the split is enabled. 
From 32830f5bdbc8e2cf50133c82989af273bf849d1a Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Fri, 3 Jul 2026 11:19:00 +0100 Subject: [PATCH 13/14] chore(run-ops): fix lint/format for main lint rules Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/webapp/app/v3/runEngineHandlers.server.ts | 1 - apps/webapp/app/v3/runEngineHandlersShared.server.ts | 2 +- .../v3/runOpsMigration/controlPlaneResolver.server.ts | 9 ++++++--- .../runEngineControlPlaneResolver.server.ts | 6 ++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index 021b04d5822..d6c4dd5fe3f 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -3,7 +3,6 @@ import { SpanKind } from "@internal/tracing"; import { tryCatch } from "@trigger.dev/core/utils"; import { createJsonErrorObject, sanitizeError, TaskRunErrorCodes } from "@trigger.dev/core/v3"; import { RunId } from "@trigger.dev/core/v3/isomorphic"; -import type { BatchTaskRunStatus, Prisma } from "@trigger.dev/database"; import { $replica, prisma, diff --git a/apps/webapp/app/v3/runEngineHandlersShared.server.ts b/apps/webapp/app/v3/runEngineHandlersShared.server.ts index f4c07f730be..338bbd2f9fc 100644 --- a/apps/webapp/app/v3/runEngineHandlersShared.server.ts +++ b/apps/webapp/app/v3/runEngineHandlersShared.server.ts @@ -6,7 +6,7 @@ */ import type { CompleteBatchResult } from "@internal/run-engine"; import type { RunStore } from "@internal/run-store"; -import { BatchTaskRunStatus, Prisma } from "@trigger.dev/database"; +import type { BatchTaskRunStatus, Prisma } from "@trigger.dev/database"; import type { PrismaClient, PrismaReplicaClient } from "~/db.server"; import { logger } from "~/services/logger.server"; import { readThroughRun } from "~/v3/runOpsMigration/readThrough.server"; diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts 
b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts index 8e2779aa8b6..72aeb3f55f8 100644 --- a/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts @@ -1,6 +1,9 @@ import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/isomorphic"; -import { RuntimeEnvironmentType } from "@trigger.dev/database"; -import type { PrismaClient, PrismaReplicaClient } from "@trigger.dev/database"; +import type { + PrismaClient, + PrismaReplicaClient, + RuntimeEnvironmentType, +} from "@trigger.dev/database"; import { prisma, $replica } from "~/db.server"; import { env } from "~/env.server"; import { @@ -10,9 +13,9 @@ import { type ResolvedAuthenticatedEnv, type ResolvedEnv, type ResolvedWorkerVersion, + type ResolvedRunLockedWorker, } from "./controlPlaneCache.server"; import { authIncludeWithParent, toAuthenticated } from "~/models/runtimeEnvironment.server"; -import type { ResolvedRunLockedWorker } from "./controlPlaneCache.server"; /** * App-level control-plane resolution + cache layer. 
Replaces the run-ops -> control-plane diff --git a/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts index 8fbfb66b17d..014e446464f 100644 --- a/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts @@ -5,10 +5,8 @@ import type { ResolvedWorkerVersion, } from "@internal/run-engine"; import type { RuntimeEnvironmentType } from "@trigger.dev/database"; -import { - ControlPlaneResolver as AppControlPlaneResolver, - controlPlaneResolver, -} from "./controlPlaneResolver.server"; +import type { ControlPlaneResolver as AppControlPlaneResolver } from "./controlPlaneResolver.server"; +import { controlPlaneResolver } from "./controlPlaneResolver.server"; /** * Adapter that presents the webapp's cross-DB cached ControlPlaneResolver as the From 071cdc163d31399d704ab5bb68f07092b27746a0 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Fri, 3 Jul 2026 15:42:55 +0100 Subject: [PATCH 14/14] fix(run-ops split): make webapp assertEnvExists a no-op when split is OFF With the split OFF there is a single DB, so a run and its environment are co-located and there is no cross-seam FK/check to replace (matches main). Skip the always-on hot-path read in that branch; the split-ON branch is unchanged (cache-first, throws on a genuinely missing env). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../v3/runOpsMigration/controlPlaneResolver.server.ts | 8 ++------ .../controlPlaneResolver.server.test.ts | 11 ++++++----- .../runEngineControlPlaneResolver.server.test.ts | 8 ++++++-- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts index 72aeb3f55f8..ce83c632abf 100644 --- a/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts @@ -394,12 +394,8 @@ export class ControlPlaneResolver { async assertEnvExists(environmentId: string): Promise<void> { if (!this.splitEnabled()) { - const exists = await this.#queryEnvExists(this.controlPlanePrimary, environmentId); - if (!exists) { - throw new ControlPlaneReferenceError( - `Referenced environment does not exist: ${environmentId}` - ); - } + // Split OFF = single DB, so run and env are co-located and there is no FK/check + // to replace (matches main). Skip the hot-path read entirely. 
return; } diff --git a/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts b/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts index a930ab0c328..e85843114db 100644 --- a/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts +++ b/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts @@ -335,10 +335,10 @@ heteroPostgresTest( ); heteroPostgresTest( - "assertEnvExists passthrough (split OFF) still validates a real env", + "assertEnvExists passthrough (split OFF) is a no-op: never reads, never throws", async ({ prisma14 }) => { const { environment } = await seedControlPlane(prisma14); - const { client: counting } = countQueries(prisma14); + const { client: counting, reads } = countQueries(prisma14); const resolver = new ControlPlaneResolver({ controlPlaneReplica: counting, controlPlanePrimary: counting, @@ -346,10 +346,11 @@ heteroPostgresTest( splitEnabled: () => false, }); + // Split OFF = single DB, run and env co-located, so there is nothing to assert + // and the hot-path read is skipped entirely — resolves for present and missing. 
await expect(resolver.assertEnvExists(environment.id)).resolves.toBeUndefined(); - await expect(resolver.assertEnvExists("env_missing")).rejects.toBeInstanceOf( - ControlPlaneReferenceError - ); + await expect(resolver.assertEnvExists("env_missing")).resolves.toBeUndefined(); + expect(reads()).toBe(0); } ); diff --git a/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts index e5c93418115..5e1c6902f7c 100644 --- a/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts +++ b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts @@ -205,10 +205,14 @@ describe("RunEngineControlPlaneResolver adapter", () => { ); heteroPostgresTest( - "assertEnvExists resolves for a present env, rejects for a missing one", + "assertEnvExists (split ON) resolves for a present env, rejects for a missing one", async ({ prisma14 }) => { const { environment } = await seedEnv(prisma14, "PRODUCTION"); - const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14)); + // split ON: the only mode where assertEnvExists asserts (split OFF is a no-op, + // covered in controlPlaneResolver.server.test.ts). + const adapter = new RunEngineControlPlaneResolver( + buildAppResolver(prisma14, { splitEnabled: true }) + ); await expect(adapter.assertEnvExists(environment.id)).resolves.toBeUndefined(); await expect(adapter.assertEnvExists("env_missing")).rejects.toThrow();