From da46de628af78b4fa1b8041e7c4ebd5c2bedb61b Mon Sep 17 00:00:00 2001 From: James Date: Tue, 19 May 2026 02:05:46 +0000 Subject: [PATCH] fix(distributed): gate per-worker SwiftShader probe to worker 0 only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After #916 moved `assertSwiftShader` from `renderChunk()`'s eager probe session into `executeWorkerTask`, every parallel worker began running its own `chrome://gpu` / canvas-WebGL probe. At `chunkWorkerCount=6` (texture launch at chunks=3) that's 6 concurrent CDP page-loads per chunk × 3 chunks = 18 simultaneous probes. Bench data on dev (12 producer pods × 22 vCPU) showed c=3 worst-case wall-clock at 67.3s, 24.7s above c=6 worst (42.6s) — pod_total inflates 100s → 147s uniformly across all three chunks per slow iter, the signature of cluster-level CDP contention rather than within-pod contention. Workers within a chunk share the same Chrome binary, flags, and OS/driver state on a single pod, so worker 0's success is representative for the rest. Gate the probe via `shouldVerifyWorkerGpu(workerId, config)` so only worker 0 navigates to the probe page; workers 1..N-1 skip it. The fail-fast contract still holds at the chunk level (worker 0 still aborts the chunk if SwiftShader didn't load) — just without the concurrent CDP traffic. Expected wall-clock impact: c=3 worst drops from ~67s to in line with c=6 worst (~42-44s). c=6 (3 workers/pod) and c=8 (2 workers/pod) should see smaller wins; c=12 (1 worker/pod, sequential branch) is unaffected. Closes #955. 
--- .../src/services/parallelCoordinator.test.ts | 33 ++++++++++++++++- .../src/services/parallelCoordinator.ts | 35 +++++++++++++------ .../src/services/distributed/renderChunk.ts | 10 +++--- 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/packages/engine/src/services/parallelCoordinator.test.ts b/packages/engine/src/services/parallelCoordinator.test.ts index 696343618..3cf266c1f 100644 --- a/packages/engine/src/services/parallelCoordinator.test.ts +++ b/packages/engine/src/services/parallelCoordinator.test.ts @@ -1,5 +1,10 @@ import { describe, it, expect } from "vitest"; -import { calculateOptimalWorkers, distributeFrames } from "./parallelCoordinator.js"; +import { + calculateOptimalWorkers, + distributeFrames, + shouldVerifyWorkerGpu, +} from "./parallelCoordinator.js"; +import type { EngineConfig } from "../config.js"; describe("distributeFrames", () => { it("distributes frames evenly across workers", () => { @@ -68,3 +73,29 @@ describe("calculateOptimalWorkers", () => { expect(workers).toBe(4); }); }); + +describe("shouldVerifyWorkerGpu", () => { + const softwareConfig: Partial<EngineConfig> = { browserGpuMode: "software" }; + + it("returns true for worker 0 when GPU mode is software", () => { + expect(shouldVerifyWorkerGpu(0, softwareConfig)).toBe(true); + }); + + it("returns false for non-zero workers when GPU mode is software", () => { + expect(shouldVerifyWorkerGpu(1, softwareConfig)).toBe(false); + expect(shouldVerifyWorkerGpu(5, softwareConfig)).toBe(false); + expect(shouldVerifyWorkerGpu(17, softwareConfig)).toBe(false); + }); + + it("returns false for any worker when GPU mode is not software", () => { + expect(shouldVerifyWorkerGpu(0, { browserGpuMode: "hardware" } as Partial<EngineConfig>)).toBe( + false, + ); + expect(shouldVerifyWorkerGpu(0, {})).toBe(false); + }); + + it("returns false when config is undefined", () => { + expect(shouldVerifyWorkerGpu(0, undefined)).toBe(false); + expect(shouldVerifyWorkerGpu(3, undefined)).toBe(false); + }); +}); diff --git 
a/packages/engine/src/services/parallelCoordinator.ts b/packages/engine/src/services/parallelCoordinator.ts index 6800ff918..236b8f147 100644 --- a/packages/engine/src/services/parallelCoordinator.ts +++ b/packages/engine/src/services/parallelCoordinator.ts @@ -181,6 +181,16 @@ export function distributeFrames( return tasks; } +/** + * Decide whether a parallel worker should run the per-worker SwiftShader + * assertion. Gated to worker 0 only: workers within a chunk share the same + * Chrome binary, flags, and OS/driver state, so one verification per chunk + * is sufficient. See `heygen-com/hyperframes#955`. + */ +export function shouldVerifyWorkerGpu(workerId: number, config?: Partial<EngineConfig>): boolean { + return config?.browserGpuMode === "software" && workerId === 0; +} + async function executeWorkerTask( task: WorkerTask, serverUrl: string, @@ -207,17 +217,22 @@ async function executeWorkerTask( createBeforeCaptureHook(), config, ); - // Per-worker SwiftShader assertion: when the caller declares - // `browserGpuMode: "software"`, every worker session must verify Chrome's - // WebGL backend is actually SwiftShader before the first frame. Hosts - // that fall back to a hardware GL backend (or silently fail to load // SwiftShader) would otherwise produce non-deterministic pixels and - // break the distributed byte-identical-retry contract — the parallel - // branch wouldn't catch it via the pre-warmup probe (renderChunk now - // skips that when chunkWorkerCount > 1). The canvas-based reader works - // on both regular Chrome and chrome-headless-shell (which serves - // `chrome://gpu` as an empty document). 
- if (config?.browserGpuMode === "software") { + // break the distributed byte-identical-retry contract. Running this + // probe on every worker means N concurrent navigations to a WebGL + // probe page per chunk; with `chunkWorkerCount=6` × 3 chunks, that's + // 18 simultaneous CDP page-loads, which inflated c=3 worst-case wall + // by ~24s vs c=6/c=8 on the texture-launch bench. Workers in the same + // chunk share the same Chrome binary, flags, and OS/driver state, so + // worker 0's success is representative — gate it there and skip the + // rest. See `heygen-com/hyperframes#955` for the bench data and the + // pre-warmup probe interaction (which `renderChunk` already skips + // when `chunkWorkerCount > 1`). + if (shouldVerifyWorkerGpu(task.workerId, config)) { await assertSwiftShader(session.page, readWebGlVendorInfoFromCanvas); } await initializeSession(session); diff --git a/packages/producer/src/services/distributed/renderChunk.ts b/packages/producer/src/services/distributed/renderChunk.ts index 84fd9ac9d..5309cf52f 100644 --- a/packages/producer/src/services/distributed/renderChunk.ts +++ b/packages/producer/src/services/distributed/renderChunk.ts @@ -469,10 +469,12 @@ export async function renderChunk( // Resolve worker count up-front so we can decide whether to bother // pre-warming a probe session at all. The parallel branch // (chunkWorkerCount > 1) closes the probe immediately and creates fresh - // per-worker sessions; `executeWorkerTask` now runs its own - // `assertSwiftShader` against each worker session (gated on - // `cfg.browserGpuMode === "software"`), so the safety contract holds - // without the eager pre-probe. + // per-worker sessions; `executeWorkerTask` runs `assertSwiftShader` + // on worker 0 only (gated on `cfg.browserGpuMode === "software"`), so + // the safety contract holds without the eager pre-probe and without + // every worker concurrently navigating to the GL probe page. 
See + // `heygen-com/hyperframes#955` for the worst-case wall regression that + // motivated gating the probe to worker 0. // // Capture-cost calibration based on shader transitions / renderModeHints // is not threaded through to chunks yet; the in-process renderer's