From da46de628af78b4fa1b8041e7c4ebd5c2bedb61b Mon Sep 17 00:00:00 2001
From: James <james.russo@heygen.com>
Date: Tue, 19 May 2026 02:05:46 +0000
Subject: [PATCH] fix(distributed): gate per-worker SwiftShader probe to worker
 0 only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After #916 moved `assertSwiftShader` from `renderChunk()`'s eager probe
session into `executeWorkerTask`, every parallel worker began running its
own `chrome://gpu` / canvas-WebGL probe. At `chunkWorkerCount=6` (texture
launch at chunks=3) that's 6 concurrent CDP page-loads per chunk × 3
chunks = 18 simultaneous probes. Bench data on dev (12 producer pods × 22
vCPU) showed the c=3 (chunks=3) worst-case wall-clock at 67.3s, 24.7s
above the c=6 worst case (42.6s). On each slow iteration, pod_total
inflates from 100s to 147s uniformly across all three chunks — the
signature of cluster-level CDP contention rather than within-pod
contention.

Workers within a chunk share the same Chrome binary, flags, and OS/driver
state on a single pod, so worker 0's success is representative of the
rest. Gate the probe via `shouldVerifyWorkerGpu(workerId, config)` so
only worker 0 navigates to the probe page; workers 1..N-1 skip it. The
fail-fast contract still holds at the chunk level (worker 0 still aborts
the chunk if SwiftShader didn't load) — just without the concurrent CDP
traffic.

Expected wall-clock impact: the c=3 worst case should drop from ~67s to
roughly the c=6 worst case (~42-44s). c=6 (3 workers/pod) and c=8
(2 workers/pod) should see smaller wins; c=12 (1 worker/pod, sequential
branch) is unaffected.

Closes #955.
---
 .../src/services/parallelCoordinator.test.ts  | 33 ++++++++++++++++-
 .../src/services/parallelCoordinator.ts       | 35 +++++++++++++------
 .../src/services/distributed/renderChunk.ts   | 10 +++---
 3 files changed, 63 insertions(+), 15 deletions(-)
diff --git a/packages/engine/src/services/parallelCoordinator.test.ts b/packages/engine/src/services/parallelCoordinator.test.ts
index 696343618..3cf266c1f 100644
--- a/packages/engine/src/services/parallelCoordinator.test.ts
+++ b/packages/engine/src/services/parallelCoordinator.test.ts
@@ -1,5 +1,10 @@
 import { describe, it, expect } from "vitest";
-import { calculateOptimalWorkers, distributeFrames } from "./parallelCoordinator.js";
+import {
+  calculateOptimalWorkers,
+  distributeFrames,
+  shouldVerifyWorkerGpu,
+} from "./parallelCoordinator.js";
+import type { EngineConfig } from "../config.js";
 
 describe("distributeFrames", () => {
   it("distributes frames evenly across workers", () => {
@@ -68,3 +73,29 @@ describe("calculateOptimalWorkers", () => {
     expect(workers).toBe(4);
   });
 });
+
+describe("shouldVerifyWorkerGpu", () => {
+  const softwareConfig: Partial<EngineConfig> = { browserGpuMode: "software" };
+
+  it("returns true for worker 0 when GPU mode is software", () => {
+    expect(shouldVerifyWorkerGpu(0, softwareConfig)).toBe(true);
+  });
+
+  it("returns false for non-zero workers when GPU mode is software", () => {
+    expect(shouldVerifyWorkerGpu(1, softwareConfig)).toBe(false);
+    expect(shouldVerifyWorkerGpu(5, softwareConfig)).toBe(false);
+    expect(shouldVerifyWorkerGpu(17, softwareConfig)).toBe(false);
+  });
+
+  it("returns false for any worker when GPU mode is not software", () => {
+    expect(shouldVerifyWorkerGpu(0, { browserGpuMode: "hardware" } as Partial<EngineConfig>)).toBe(
+      false,
+    );
+    expect(shouldVerifyWorkerGpu(0, {})).toBe(false);
+  });
+
+  it("returns false when config is undefined", () => {
+    expect(shouldVerifyWorkerGpu(0, undefined)).toBe(false);
+    expect(shouldVerifyWorkerGpu(3, undefined)).toBe(false);
+  });
+});
diff --git a/packages/engine/src/services/parallelCoordinator.ts b/packages/engine/src/services/parallelCoordinator.ts
index 6800ff918..236b8f147 100644
--- a/packages/engine/src/services/parallelCoordinator.ts
+++ b/packages/engine/src/services/parallelCoordinator.ts
@@ -181,6 +181,16 @@ export function distributeFrames(
   return tasks;
 }
 
+/**
+ * Decide whether a parallel worker should run the per-worker SwiftShader
+ * assertion. Gated to worker 0 only: workers within a chunk share the same
+ * Chrome binary, flags, and OS/driver state, so one verification per chunk
+ * is sufficient. See `heygen-com/hyperframes#955`.
+ */
+export function shouldVerifyWorkerGpu(workerId: number, config?: Partial<EngineConfig>): boolean {
+  return config?.browserGpuMode === "software" && workerId === 0;
+}
+
 async function executeWorkerTask(
   task: WorkerTask,
   serverUrl: string,
@@ -207,17 +217,22 @@ async function executeWorkerTask(
       createBeforeCaptureHook(),
       config,
     );
-    // Per-worker SwiftShader assertion: when the caller declares
-    // `browserGpuMode: "software"`, every worker session must verify Chrome's
-    // WebGL backend is actually SwiftShader before the first frame. Hosts
-    // that fall back to a hardware GL backend (or silently fail to load
+    // Per-worker SwiftShader assertion, gated to worker 0 only.
+    // When `browserGpuMode: "software"` is declared, the chunk's GL backend
+    // must be verified as SwiftShader before the first frame — a host that
+    // falls back to a hardware GL backend (or silently fails to load
     // SwiftShader) would otherwise produce non-deterministic pixels and
-    // break the distributed byte-identical-retry contract — the parallel
-    // branch wouldn't catch it via the pre-warmup probe (renderChunk now
-    // skips that when chunkWorkerCount > 1). The canvas-based reader works
-    // on both regular Chrome and chrome-headless-shell (which serves
-    // `chrome://gpu` as an empty document).
-    if (config?.browserGpuMode === "software") {
+    // break the distributed byte-identical-retry contract. Running this
+    // probe on every worker means N concurrent navigations to a WebGL
+    // probe page per chunk; with `chunkWorkerCount=6` × 3 chunks, that's
+    // 18 simultaneous CDP page-loads, which inflated c=3 worst-case wall
+    // by ~24s vs c=6/c=8 on the texture-launch bench. Workers in the same
+    // chunk share the same Chrome binary, flags, and OS/driver state, so
+    // worker 0's success is representative — gate it there and skip the
+    // rest. See `heygen-com/hyperframes#955` for the bench data and the
+    // pre-warmup probe interaction (which `renderChunk` already skips
+    // when `chunkWorkerCount > 1`).
+    if (shouldVerifyWorkerGpu(task.workerId, config)) {
       await assertSwiftShader(session.page, readWebGlVendorInfoFromCanvas);
     }
     await initializeSession(session);
diff --git a/packages/producer/src/services/distributed/renderChunk.ts b/packages/producer/src/services/distributed/renderChunk.ts
index 84fd9ac9d..5309cf52f 100644
--- a/packages/producer/src/services/distributed/renderChunk.ts
+++ b/packages/producer/src/services/distributed/renderChunk.ts
@@ -469,10 +469,12 @@ export async function renderChunk(
     // Resolve worker count up-front so we can decide whether to bother
     // pre-warming a probe session at all. The parallel branch
     // (chunkWorkerCount > 1) closes the probe immediately and creates fresh
-    // per-worker sessions; `executeWorkerTask` now runs its own
-    // `assertSwiftShader` against each worker session (gated on
-    // `cfg.browserGpuMode === "software"`), so the safety contract holds
-    // without the eager pre-probe.
+    // per-worker sessions; `executeWorkerTask` runs `assertSwiftShader`
+    // on worker 0 only (gated on `cfg.browserGpuMode === "software"`), so
+    // the safety contract holds without the eager pre-probe and without
+    // every worker concurrently navigating to the GL probe page. See
+    // `heygen-com/hyperframes#955` for the worst-case wall regression that
+    // motivated gating the probe to worker 0.
     //
     // Capture-cost calibration based on shader transitions / renderModeHints
     // is not threaded through to chunks yet; the in-process renderer's