diff --git a/apps/webapp/app/runEngine/concerns/waitpointCompletionPacket.server.ts b/apps/webapp/app/runEngine/concerns/waitpointCompletionPacket.server.ts index 5e029221344..e59ede59f18 100644 --- a/apps/webapp/app/runEngine/concerns/waitpointCompletionPacket.server.ts +++ b/apps/webapp/app/runEngine/concerns/waitpointCompletionPacket.server.ts @@ -2,6 +2,7 @@ import { type IOPacket, packetRequiresOffloading, tryCatch } from "@trigger.dev/ import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { env } from "~/env.server"; import { uploadPacketToObjectStore } from "~/v3/objectStore.server"; +import { logger } from "~/services/logger.server"; import { ServiceValidationError } from "~/v3/services/common.server"; function packetExtensionForDataType(dataType: string): string { @@ -53,6 +54,11 @@ export async function processWaitpointCompletionPacket( ); if (uploadError) { + logger.error("Failed to upload large waitpoint to object store", { + error: uploadError, + filename, + environmentId: environment.id, + }); throw new ServiceValidationError("Failed to upload large waitpoint to object store", 500); } diff --git a/apps/webapp/app/services/platform.v3.server.ts b/apps/webapp/app/services/platform.v3.server.ts index 9f8037a151e..51075c1b87d 100644 --- a/apps/webapp/app/services/platform.v3.server.ts +++ b/apps/webapp/app/services/platform.v3.server.ts @@ -31,6 +31,7 @@ import { newProjectPath, organizationBillingPath } from "~/utils/pathBuilder"; import { singleton } from "~/utils/singleton"; import { RedisCacheStore } from "./unkey/redisCacheStore.server"; import { $replica } from "~/db.server"; +import { metrics } from "@opentelemetry/api"; function initializeClient() { if (isCloud() && process.env.BILLING_API_URL && process.env.BILLING_API_KEY) { @@ -43,6 +44,23 @@ function initializeClient() { } const client = singleton("billingClient", initializeClient); +// Failures from @trigger.dev/platform billing client calls are tracked via +// this metric (with low-cardinality {function, kind} labels) rather than +// logged. Every task invocation hits these paths, so per-call logs were too +// noisy; dashboard the counter for visibility instead. +const platformClientMeter = metrics.getMeter("trigger.dev/platform-client"); +const platformClientFailuresCounter = platformClientMeter.createCounter( + "platform_client.failures_total", + { + description: + "Failures returned or thrown by @trigger.dev/platform billing client calls", + } +); + +function recordPlatformFailure(fn: string, kind: "caught" | "no_success") { + platformClientFailuresCounter.add(1, { function: fn, kind }); +} + function initializePlatformCache() { const ctx = new DefaultStatefulContext(); @@ -206,7 +224,7 @@ export async function getCurrentPlan(orgId: string) { firstDayOfNextMonth.setUTCHours(0, 0, 0, 0); if (!result.success) { - logger.error("Error getting current plan - no success", { orgId, error: result.error }); + recordPlatformFailure("getCurrentPlan", "no_success"); return undefined; } @@ -222,7 +240,7 @@ export async function getCurrentPlan(orgId: string) { return { ...result, usage }; } catch (e) { - logger.error("Error getting current plan - caught error", { orgId, error: e }); + recordPlatformFailure("getCurrentPlan", "caught"); return undefined; } } @@ -233,13 +251,13 @@ export async function getLimits(orgId: string) { try { const result = await client.currentPlan(orgId); if (!result.success) { - logger.error("Error getting limits - no success", { orgId, error: result.error }); + recordPlatformFailure("getLimits", "no_success"); return undefined; } return result.v3Subscription?.plan?.limits; } catch (e) { - logger.error("Error getting limits - caught error", { orgId, error: e }); + recordPlatformFailure("getLimits", "caught"); return undefined; } } @@ -315,7 +333,7 @@ export async function customerPortalUrl(orgId: string, orgSlug: string) { returnUrl: `${env.APP_ORIGIN}${organizationBillingPath({ slug: orgSlug })}`, }); } catch (e) { - logger.error("Error getting customer portal Url", { orgId, error: e }); + recordPlatformFailure("customerPortalUrl", "caught"); return undefined; } } @@ -326,12 +344,12 @@ export async function getPlans() { try { const result = await client.plans(); if (!result.success) { - logger.error("Error getting plans - no success", { error: result.error }); + recordPlatformFailure("getPlans", "no_success"); return undefined; } return result; } catch (e) { - logger.error("Error getting plans - caught error", { error: e }); + recordPlatformFailure("getPlans", "caught"); return undefined; } } @@ -408,12 +426,12 @@ export async function setConcurrencyAddOn(organizationId: string, amount: number try { const result = await client.setAddOn(organizationId, { type: "concurrency", amount }); if (!result.success) { - logger.error("Error setting concurrency add on - no success", { error: result.error }); + recordPlatformFailure("setConcurrencyAddOn", "no_success"); return undefined; } return result; } catch (e) { - logger.error("Error setting concurrency add on - caught error", { error: e }); + recordPlatformFailure("setConcurrencyAddOn", "caught"); return undefined; } } @@ -424,12 +442,12 @@ export async function setSeatsAddOn(organizationId: string, amount: number) { try { const result = await client.setAddOn(organizationId, { type: "seats", amount }); if (!result.success) { - logger.error("Error setting seats add on - no success", { error: result.error }); + recordPlatformFailure("setSeatsAddOn", "no_success"); return undefined; } return result; } catch (e) { - logger.error("Error setting seats add on - caught error", { error: e }); + recordPlatformFailure("setSeatsAddOn", "caught"); return undefined; } } @@ -440,12 +458,12 @@ export async function setBranchesAddOn(organizationId: string, amount: number) { try { const result = await client.setAddOn(organizationId, { type: "branches", amount }); if (!result.success) { - logger.error("Error setting branches add on - no success", { error: result.error }); + recordPlatformFailure("setBranchesAddOn", "no_success"); return undefined; } return result; } catch (e) { - logger.error("Error setting branches add on - caught error", { error: e }); + recordPlatformFailure("setBranchesAddOn", "caught"); return undefined; } } @@ -456,12 +474,12 @@ export async function getUsage(organizationId: string, { from, to }: { from: Dat try { const result = await client.usage(organizationId, { from, to }); if (!result.success) { - logger.error("Error getting usage - no success", { error: result.error }); + recordPlatformFailure("getUsage", "no_success"); return undefined; } return result; } catch (e) { - logger.error("Error getting usage - caught error", { error: e }); + recordPlatformFailure("getUsage", "caught"); return undefined; } } @@ -490,12 +508,12 @@ export async function getUsageSeries(organizationId: string, params: UsageSeries try { const result = await client.usageSeries(organizationId, params); if (!result.success) { - logger.error("Error getting usage series - no success", { error: result.error }); + recordPlatformFailure("getUsageSeries", "no_success"); return undefined; } return result; } catch (e) { - logger.error("Error getting usage series - caught error", { error: e }); + recordPlatformFailure("getUsageSeries", "caught"); return undefined; } } @@ -514,12 +532,12 @@ export async function reportInvocationUsage( additionalData, }); if (!result.success) { - logger.error("Error reporting invocation - no success", { error: result.error }); + recordPlatformFailure("reportInvocationUsage", "no_success"); return undefined; } return result; } catch (e) { - logger.error("Error reporting invocation - caught error", { error: e }); + recordPlatformFailure("reportInvocationUsage", "caught"); return undefined; } } @@ -550,12 +568,12 @@ export async function getEntitlement( try { const response = await client.getEntitlement(organizationId); if (!response.success) { - logger.error("Error getting entitlement - no success", { error: response.error }); + recordPlatformFailure("getEntitlement", "no_success"); return undefined; } return response; } catch (e) { - logger.error("Error getting entitlement - caught error", { error: e }); + recordPlatformFailure("getEntitlement", "caught"); return undefined; } }); @@ -602,7 +620,7 @@ export async function getBillingAlerts( if (!client) return undefined; const result = await client.getBillingAlerts(organizationId); if (!result.success) { - logger.error("Error getting billing alert", { error: result.error, organizationId }); + recordPlatformFailure("getBillingAlert", "no_success"); throw new Error("Error getting billing alert"); } return result; @@ -615,7 +633,7 @@ export async function setBillingAlert( if (!client) return undefined; const result = await client.updateBillingAlerts(organizationId, alert); if (!result.success) { - logger.error("Error setting billing alert", { error: result.error, organizationId }); + recordPlatformFailure("setBillingAlert", "no_success"); throw new Error("Error setting billing alert"); } return result; @@ -628,11 +646,7 @@ export async function generateRegistryCredentials( if (!client) return undefined; const result = await client.generateRegistryCredentials(projectId, region); if (!result.success) { - logger.error("Error generating registry credentials", { - error: result.error, - projectId, - region, - }); + recordPlatformFailure("generateRegistryCredentials", "no_success"); throw new Error("Failed to generate registry credentials"); } @@ -651,13 +665,7 @@ export async function enqueueBuild( if (!client) return undefined; const result = await client.enqueueBuild(projectId, { deploymentId, artifactKey, options }); if (!result.success) { - logger.error("Error enqueuing build", { - error: result.error, - projectId, - deploymentId, - artifactKey, - options, - }); + recordPlatformFailure("enqueueBuild", "no_success"); throw new Error("Failed to enqueue build"); } @@ -672,12 +680,12 @@ export async function getPrivateLinks( const [error, result] = await tryCatch(client.getPrivateLinks(organizationId)); if (error) { - logger.error("Error getting private links", { organizationId, error }); + recordPlatformFailure("getPrivateLinks", "caught"); return undefined; } if (!result.success) { - logger.error("Error getting private links - no success", { organizationId, error: result.error }); + recordPlatformFailure("getPrivateLinks", "no_success"); return undefined; } @@ -693,12 +701,12 @@ export async function createPrivateLink( const [error, result] = await tryCatch(client.createPrivateLink(organizationId, body)); if (error) { - logger.error("Error creating private link", { organizationId, error }); + recordPlatformFailure("createPrivateLink", "caught"); throw error; } if (!result.success) { - logger.error("Error creating private link - no success", { organizationId, error: result.error }); + recordPlatformFailure("createPrivateLink", "no_success"); throw new Error(result.error ?? "Failed to create private link"); } @@ -714,12 +722,12 @@ export async function deletePrivateLink( const [error, result] = await tryCatch(client.deletePrivateLink(organizationId, connectionId)); if (error) { - logger.error("Error deleting private link", { organizationId, connectionId, error }); + recordPlatformFailure("deletePrivateLink", "caught"); throw error; } if (!result.success) { - logger.error("Error deleting private link - no success", { organizationId, connectionId, error: result.error }); + recordPlatformFailure("deletePrivateLink", "no_success"); throw new Error(result.error ?? "Failed to delete private link"); } } @@ -732,12 +740,12 @@ export async function getPrivateLinkRegions( const [error, result] = await tryCatch(client.getPrivateLinkRegions(organizationId)); if (error) { - logger.error("Error getting private link regions", { organizationId, error }); + recordPlatformFailure("getPrivateLinkRegions", "caught"); return undefined; } if (!result.success) { - logger.error("Error getting private link regions - no success", { organizationId, error: result.error }); + recordPlatformFailure("getPrivateLinkRegions", "no_success"); return undefined; } diff --git a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts index aae3c7ff54e..9e439938d0d 100644 --- a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts +++ b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts @@ -26,6 +26,30 @@ import { WORKER_HEADERS } from "@trigger.dev/core/v3/runEngineWorker"; import { ServiceValidationError } from "~/v3/services/common.server"; import { EngineServiceValidationError } from "@internal/run-engine"; +// Client aborts and service-level validation errors aren't bugs — they're +// expected at API boundaries. Log them at `warn` so they stay in stdout +// without flowing to Sentry via Logger.onError. +function logBoundaryError( + message: "Error in loader" | "Error in action", + error: unknown, + url: string +) { + const formatted = + error instanceof Error + ? { name: error.name, message: error.message, stack: error.stack } + : String(error); + const isExpected = + error instanceof Error && + (error.name === "AbortError" || + error instanceof ServiceValidationError || + error instanceof EngineServiceValidationError); + if (isExpected) { + logger.warn(message, { error: formatted, url }); + } else { + logger.error(message, { error: formatted, url }); + } +} + type AnyZodSchema = z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion; type ApiKeyRouteBuilderOptions< @@ -260,17 +284,7 @@ export function createLoaderApiRoute< return await wrapResponse(request, error, corsStrategy !== "none"); } - logger.error("Error in loader", { - error: - error instanceof Error - ? { - name: error.name, - message: error.message, - stack: error.stack, - } - : String(error), - url: request.url, - }); + logBoundaryError("Error in loader", error, request.url); return await wrapResponse( request, @@ -756,17 +770,7 @@ export function createActionApiRoute< return await wrapResponse(request, error, corsStrategy !== "none"); } - logger.error("Error in action", { - error: - error instanceof Error - ? { - name: error.name, - message: error.message, - stack: error.stack, - } - : String(error), - url: request.url, - }); + logBoundaryError("Error in action", error, request.url); return await wrapResponse( request, @@ -1044,13 +1048,7 @@ export function createMultiMethodApiRoute< return await wrapResponse(request, error, corsStrategy !== "none"); } - logger.error("Error in action", { - error: - error instanceof Error - ? { name: error.name, message: error.message, stack: error.stack } - : String(error), - url: request.url, - }); + logBoundaryError("Error in action", error, request.url); return await wrapResponse( request, @@ -1188,17 +1186,7 @@ export function createLoaderWorkerApiRoute< return error; } - logger.error("Error in loader", { - error: - error instanceof Error - ? { - name: error.name, - message: error.message, - stack: error.stack, - } - : String(error), - url: request.url, - }); + logBoundaryError("Error in loader", error, request.url); return json({ error: "Internal Server Error" }, { status: 500 }); } @@ -1363,17 +1351,7 @@ export function createActionWorkerApiRoute< return json({ error: error.message }, { status: error.status ?? 422 }); } - logger.error("Error in action", { - error: - error instanceof Error - ? { - name: error.name, - message: error.message, - stack: error.stack, - } - : String(error), - url: request.url, - }); + logBoundaryError("Error in action", error, request.url); return json({ error: "Internal Server Error" }, { status: 500 }); } diff --git a/apps/webapp/app/utils/queryPerformanceMonitor.server.ts b/apps/webapp/app/utils/queryPerformanceMonitor.server.ts index 69a11f22b3f..2398a49da10 100644 --- a/apps/webapp/app/utils/queryPerformanceMonitor.server.ts +++ b/apps/webapp/app/utils/queryPerformanceMonitor.server.ts @@ -32,15 +32,16 @@ export class QueryPerformanceMonitor { const { duration, query, params, target, timestamp } = log; - // Only log very slow queries as errors + // Slow queries are an observability signal (DB load, missing indexes, + // etc.), not an application error. Logged at warn so it lands in stdout + // without flowing to Sentry — track via metrics/dashboards instead. if (duration > this.config.verySlowQueryThreshold) { - // Truncate long queries for readability const truncatedQuery = query.length > this.config.maxQueryLogLength ? query.substring(0, this.config.maxQueryLogLength) + "..." : query; - logger.error("Prisma: very slow database query", { + logger.warn("Prisma: very slow database query", { clientType, durationMs: duration, query: truncatedQuery, diff --git a/apps/webapp/app/v3/handleSocketIo.server.ts b/apps/webapp/app/v3/handleSocketIo.server.ts index fbc50428407..7a2c0c3e366 100644 --- a/apps/webapp/app/v3/handleSocketIo.server.ts +++ b/apps/webapp/app/v3/handleSocketIo.server.ts @@ -489,7 +489,10 @@ function createWorkerNamespace({ next(); } catch (error) { - logger.error("Worker authentication failed", { + // System handles auth failure by disconnecting the socket — not an + // error. Most volume is V1 /dev-worker reconnect churn from outdated + // CLIs anyway. + logger.warn("Worker authentication failed", { namespace, error: error instanceof Error ? error.message : error, }); diff --git a/apps/webapp/app/v3/services/timeoutDeployment.server.ts b/apps/webapp/app/v3/services/timeoutDeployment.server.ts index efb2569a94e..79d1fd9e332 100644 --- a/apps/webapp/app/v3/services/timeoutDeployment.server.ts +++ b/apps/webapp/app/v3/services/timeoutDeployment.server.ts @@ -27,7 +27,10 @@ export class TimeoutDeploymentService extends BaseService { } if (deployment.status !== fromStatus) { - logger.error("Deployment is not in the correct state to be timed out", { + // Race: timeout job fired after the deployment moved out of the + // expected state (already deployed/failed). System handles it by + // returning early — not an error. + logger.warn("Deployment is not in the correct state to be timed out", { currentStatus: deployment.status, fromStatus, }); diff --git a/apps/webapp/scripts/test-sentry-filter.mts b/apps/webapp/scripts/test-sentry-filter.mts new file mode 100644 index 00000000000..e08bc78c8e4 --- /dev/null +++ b/apps/webapp/scripts/test-sentry-filter.mts @@ -0,0 +1,27 @@ +import "dotenv/config"; +import "../sentry.server"; +import * as Sentry from "@sentry/remix"; + +const tag = `filter-test-${Date.now()}`; + +const cases = [ + { name: "ServiceValidationError", expect: "DROPPED" }, + { name: "QueueSizeLimitExceededError", expect: "KEPT" }, + { name: "MetadataTooLargeError", expect: "KEPT" }, + { name: "Error", expect: "KEPT" }, +]; + +for (const { name, expect } of cases) { + const err = new Error(`[FILTER TEST ${tag}] ${name} expect=${expect}`); + err.name = name; + Sentry.withScope((scope) => { + scope.setTag("filter_test", tag); + scope.setTag("expect", expect); + Sentry.captureException(err); + }); +} + +const ok = await Sentry.flush(10_000); +console.log(`flushed=${ok} tag=${tag}`); +console.log(`Sentry search: https://triggerdev.sentry.io/issues/?query=filter_test%3A${tag}`); +process.exit(0); diff --git a/apps/webapp/sentry.server.ts b/apps/webapp/sentry.server.ts index a50efcfe85b..d34e676dca5 100644 --- a/apps/webapp/sentry.server.ts +++ b/apps/webapp/sentry.server.ts @@ -21,7 +21,12 @@ if (process.env.SENTRY_DSN) { serverName: process.env.SERVICE_NAME, environment: process.env.APP_ENV, - ignoreErrors: ["queryRoute() call aborted"], + // ServiceValidationError is thrown deliberately for user-facing + // validation failures (quota, parent run state, invalid input). Anchored + // regex matches the exception type exactly; subclasses + // (QueueSizeLimitExceededError, MetadataTooLargeError) override `.name` + // and stay visible. + ignoreErrors: ["queryRoute() call aborted", /^ServiceValidationError(?::|$)/], includeLocalVariables: false, }); } diff --git a/internal-packages/run-engine/src/batch-queue/index.ts b/internal-packages/run-engine/src/batch-queue/index.ts index 6b49795da52..1f7a6221f48 100644 --- a/internal-packages/run-engine/src/batch-queue/index.ts +++ b/internal-packages/run-engine/src/batch-queue/index.ts @@ -929,14 +929,30 @@ export class BatchQueue { errorCode: result.errorCode, }); - this.logger.error("Batch item processing failed after all attempts", { - batchId, - itemIndex, - error: result.error, - processedCount, - expectedCount: meta.runCount, - attempts: attempt, - }); + // skipRetries=true means the callback knew the failure was + // intentional/non-retryable (e.g. customer hit queue size limit). + // Don't promote it back to error — the callback already logged + // appropriately and a permanent-failure record was written. + if (result.skipRetries) { + this.logger.warn("Batch item processing failed (non-retryable)", { + batchId, + itemIndex, + error: result.error, + errorCode: result.errorCode, + processedCount, + expectedCount: meta.runCount, + attempts: attempt, + }); + } else { + this.logger.error("Batch item processing failed after all attempts", { + batchId, + itemIndex, + error: result.error, + processedCount, + expectedCount: meta.runCount, + attempts: attempt, + }); + } } } catch (error) { span?.setAttribute("batch.result", "unexpected_error"); diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts index 27ddedde006..fdcf1a6f89a 100644 --- a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -1980,30 +1980,25 @@ export class RunAttemptSystem { } if (completion.flushedMetadata) { - const [packetError, packet] = await tryCatch(parsePacket(completion.flushedMetadata)); + const [, packet] = await tryCatch(parsePacket(completion.flushedMetadata)); if (!packet) { return; } - if (packetError) { - this.$.logger.error("RunEngine.completeRunAttempt(): failed to parse flushed metadata", { - runId, - flushedMetadata: completion.flushedMetadata, - error: packetError, - }); - - return; - } - const metadata = FlushedRunMetadata.safeParse(packet); if (!metadata.success) { - this.$.logger.error("RunEngine.completeRunAttempt(): failed to parse flushed metadata", { - runId, - flushedMetadata: completion.flushedMetadata, - error: metadata.error, - }); + // Customer's metadata operations don't match the schema (typically + // non-JSON values in `operations[].value`). System ignores it. + this.$.logger.warn( + "RunEngine.completeRunAttempt(): failed to validate flushed metadata", + { + runId, + flushedMetadata: completion.flushedMetadata, + error: metadata.error, + } + ); return; } diff --git a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts index d650c1c6d08..8b8d4f82fcf 100644 --- a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts @@ -837,6 +837,20 @@ export class WaitpointSystem { } case "SUSPENDED": { if (!snapshot.checkpointId) { + // A run canceled mid-suspend has its checkpoint cleared by the + // cancel path; reaching here just means cancel won the race. + // Skip rather than throw — there's nothing to resume. + if (snapshot.runStatus === "CANCELED") { + this.$.logger.warn( + `continueRunIfUnblocked: run was canceled while suspended, skipping`, + { runId, snapshot } + ); + return { + status: "skipped", + reason: "run was canceled while suspended", + }; + } + this.$.logger.error(`continueRunIfUnblocked: run is suspended, but has no checkpoint`, { runId, snapshot,