fix(schedule): don't fault trigger run on error-recovery failures

TheodoreSpeaks · claude · TheodoreSpeaks · commit 682477a434ce · 2026-06-02T22:37:24.000-07:00
The schedule task already treats workflow-execution failures as recorded
errors rather than trigger faults, but the outermost catch's own recovery
code (the infra-retry and releaseClaim calls) was unguarded. A secondary DB
blip while releasing the claim re-threw and escaped run(), faulting the
trigger.dev run and firing an alert — a double-fault during cleanup.

Wrap the recovery path in a try/catch: log and record the exception on the
span without re-throwing. The claim expires on its TTL and the next tick
re-claims the schedule, so swallowing the cleanup failure is safe.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/apps/sim/background/schedule-execution.ts b/apps/sim/background/schedule-execution.ts
@@ -1,3 +1,4 @@
+import { trace } from '@opentelemetry/api'
 import {
   db,
   jobExecutionLogs,
@@ -947,16 +948,28 @@ export async function executeScheduleJob(payload: ScheduleExecutionPayload) {
         )
       }
     } catch (error: unknown) {
-      if (isRetryableInfrastructureError(error)) {
-        await retryScheduleAfterInfraFailure({ payload, requestId, claimedAt, error })
-        return
-      }
+      try {
+        if (isRetryableInfrastructureError(error)) {
+          await retryScheduleAfterInfraFailure({ payload, requestId, claimedAt, error })
+          return
+        }
 
-      logger.error(`[${requestId}] Error processing schedule ${payload.scheduleId}`, error)
-      await releaseClaim(
-        now,
-        `Failed to release schedule ${payload.scheduleId} after unhandled error`
-      )
+        logger.error(`[${requestId}] Error processing schedule ${payload.scheduleId}`, error)
+        await releaseClaim(
+          now,
+          `Failed to release schedule ${payload.scheduleId} after unhandled error`
+        )
+      } catch (recoveryError: unknown) {
+        // A secondary failure during error recovery (e.g. a transient DB blip while
+        // releasing the claim or scheduling an infra retry) must not fault the run. The
+        // claim expires on its TTL and the next tick re-claims the schedule. Record the
+        // exception on the span so it stays visible in traces without faulting the run.
+        logger.error(
+          `[${requestId}] Failed to recover schedule ${payload.scheduleId} after error`,
+          recoveryError
+        )
+        trace.getActiveSpan()?.recordException(toError(recoveryError))
+      }
     }
   })
 }