From 484294f838c0afba014aed828de64a4ee8ec8187 Mon Sep 17 00:00:00 2001
From: James <james.russo@heygen.com>
Date: Sat, 16 May 2026 02:08:44 +0000
Subject: [PATCH 1/2] feat(producer): add Rio-style residual-RMS check to
 regression harness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing audio comparison in the regression harness measures the
Pearson correlation between RMS envelopes of the rendered and snapshot
streams. That catches shape-level drift but is insensitive to level
shifts, phase offsets, or codec-quantization noise — two streams can
correlate >0.9 while differing audibly.

Rio's approach (rio/tests/checksum.py:compare_audio_files_ffmpeg) is
sample-level: subtract the snapshot from the rendered stream, run
`astats`, read the residual Overall RMS in dBFS. Identical streams
cancel to silence (-inf, or sub -90 dBFS for AAC-vs-AAC); anything
above -50 dBFS is considered drift (exactly -50 still passes).

This commit adds the same check as an optional secondary gate:

  - utils/audioRegression.ts: new `computeAudioResidualRmsDb()` that
    spawns ffmpeg with the same filter graph Rio uses (aresample +
    pan + volume=-1 + amix + astats) and returns the parsed Overall
    RMS plus a pass/fail flag.
  - utils/audioRegression.test.ts: 3 new tests covering identical
    streams (-inf result), drifted streams (440Hz vs 880Hz sine),
    and missing-audio-stream input.
  - regression-harness.ts: optional `maxAudioResidualRmsDb` field in
    meta.json. Default is undefined (skip the check) so legacy
    fixtures aren't retroactively gated; new fixtures opt in by
    setting a threshold (e.g. -50). Harness emits `residualRmsDb` in
    the audio_comparison_complete JSON event and the pretty log line.

The existing correlation check stays in place; the new residual check
is independent. They measure complementary properties (shape vs
sample-cancellation) and both should hold for a faithful render.
---
 packages/producer/src/regression-harness.ts   |  51 ++++++-
 .../src/utils/audioRegression.test.ts         |  92 +++++++++++-
 .../producer/src/utils/audioRegression.ts     | 131 ++++++++++++++++++
 3 files changed, 269 insertions(+), 5 deletions(-)

diff --git a/packages/producer/src/regression-harness.ts b/packages/producer/src/regression-harness.ts
index fee1bd8a0..b06e0cfe9 100644
--- a/packages/producer/src/regression-harness.ts
+++ b/packages/producer/src/regression-harness.ts
@@ -18,7 +18,11 @@ import { createRenderJob, executeRenderJob } from "./services/renderOrchestrator
 import { compileForRender } from "./services/htmlCompiler.js";
 import { validateCompilation } from "./services/compilationTester.js";
 import { extractMediaMetadata } from "./utils/ffprobe.js";
-import { buildRmsEnvelope, compareAudioEnvelopes } from "./utils/audioRegression.js";
+import {
+  buildRmsEnvelope,
+  compareAudioEnvelopes,
+  computeAudioResidualRmsDb,
+} from "./utils/audioRegression.js";
 import { parseFps, fpsToNumber } from "@hyperframes/core";
 import {
   checkDistributedSupport,
@@ -38,6 +42,15 @@ type TestMetadata = {
   maxFrameFailures: number;
   minAudioCorrelation: number;
   maxAudioLagWindows: number;
+  /**
+   * Optional Rio-style residual-RMS check. Subtracts the rendered audio
+   * from the baseline and reads the residual Overall RMS via `astats`.
+   * A value of `-50` (Rio's convention) treats residuals at-or-below
+   * -50 dBFS as effectively-silent — i.e. the streams are sample-level
+   * equivalent. Omit (undefined) to skip the check; in-process renders
+   * authored before this field was introduced have implicit `undefined`.
+   */
+  maxAudioResidualRmsDb?: number;
   renderConfig: {
     /**
      * Frame rate. Stored on disk as a JSON number (integer fps, e.g. `30`)
@@ -229,6 +242,12 @@ function validateMetadata(meta: unknown): TestMetadata {
   if (typeof m.maxAudioLagWindows !== "number" || m.maxAudioLagWindows < 1) {
     throw new Error("meta.json: 'maxAudioLagWindows' must be >= 1");
   }
+  if (
+    m.maxAudioResidualRmsDb !== undefined &&
+    (typeof m.maxAudioResidualRmsDb !== "number" || !Number.isFinite(m.maxAudioResidualRmsDb))
+  ) {
+    throw new Error("meta.json: 'maxAudioResidualRmsDb' must be a finite number when present");
+  }
   if (!m.renderConfig || typeof m.renderConfig !== "object") {
     throw new Error("meta.json: 'renderConfig' must be an object");
   }
@@ -1051,6 +1070,7 @@ async function runTestSuite(
     let audioPassed = true;
     let audioCorrelation = 1;
     let audioLagWindows = 0;
+    let audioResidualRmsDb: number | null = null;
 
     if (!isPngSequence) {
       logPretty("Comparing audio quality...", "🔊");
@@ -1068,6 +1088,24 @@ async function runTestSuite(
         audioCorrelation = audio.correlation;
         audioLagWindows = audio.lagWindows;
         audioPassed = audio.correlation >= suite.meta.minAudioCorrelation;
+
+        // Rio-style residual RMS check, sample-level rather than
+        // envelope-level. Only runs when the fixture opts in by
+        // setting `maxAudioResidualRmsDb`; the envelope-correlation
+        // gate above stays in place either way for legacy fixtures
+        // (correlation is shape similarity; residual RMS is exact
+        // cancellation — both surface different drift classes).
+        if (suite.meta.maxAudioResidualRmsDb !== undefined) {
+          const residual = computeAudioResidualRmsDb(
+            renderedOutputPath,
+            snapshotVideoPath,
+            suite.meta.maxAudioResidualRmsDb,
+          );
+          audioResidualRmsDb = residual.overallDb;
+          if (!residual.ok) {
+            audioPassed = false;
+          }
+        }
       }
     }
 
@@ -1084,17 +1122,24 @@ async function runTestSuite(
         passed: audioPassed,
         correlation: audioCorrelation,
         lagWindows: audioLagWindows,
+        residualRmsDb: audioResidualRmsDb,
       }),
     );
 
+    const residualSuffix =
+      audioResidualRmsDb === null
+        ? ""
+        : `, residualRMS: ${
+            Number.isFinite(audioResidualRmsDb) ? audioResidualRmsDb.toFixed(2) : "-inf"
+          } dBFS`;
     if (audioPassed) {
       logPretty(
-        `Audio quality: PASSED (correlation: ${audioCorrelation.toFixed(3)}, lag: ${audioLagWindows})`,
+        `Audio quality: PASSED (correlation: ${audioCorrelation.toFixed(3)}, lag: ${audioLagWindows}${residualSuffix})`,
         "✓",
       );
     } else {
       logPretty(
-        `Audio quality: FAILED (correlation: ${audioCorrelation.toFixed(3)}, threshold: ${suite.meta.minAudioCorrelation})`,
+        `Audio quality: FAILED (correlation: ${audioCorrelation.toFixed(3)}, threshold: ${suite.meta.minAudioCorrelation}${residualSuffix})`,
         "✗",
       );
     }
diff --git a/packages/producer/src/utils/audioRegression.test.ts b/packages/producer/src/utils/audioRegression.test.ts
index ba7c6521e..814bd622e 100644
--- a/packages/producer/src/utils/audioRegression.test.ts
+++ b/packages/producer/src/utils/audioRegression.test.ts
@@ -1,5 +1,13 @@
-import { describe, expect, it } from "vitest";
-import { buildRmsEnvelope, compareAudioEnvelopes } from "./audioRegression.js";
+import { spawnSync } from "node:child_process";
+import { mkdtempSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { afterAll, beforeAll, describe, expect, it } from "vitest";
+import {
+  buildRmsEnvelope,
+  compareAudioEnvelopes,
+  computeAudioResidualRmsDb,
+} from "./audioRegression.js";
 
 describe("compareAudioEnvelopes", () => {
   it("treats silent-vs-silent audio as a perfect match", () => {
@@ -14,3 +22,83 @@ describe("compareAudioEnvelopes", () => {
     });
   });
 });
+
+// Skip the spawn-based tests entirely on hosts without ffmpeg. The
+// regression harness only runs in environments where ffmpeg is present
+// (`Dockerfile.test`, dev boxes with apt's ffmpeg), so an absent ffmpeg
+// is a developer-laptop fact, not a producer regression.
+const HAS_FFMPEG = spawnSync("ffmpeg", ["-version"], { encoding: "utf-8" }).status === 0;
+
+describe.skipIf(!HAS_FFMPEG)("computeAudioResidualRmsDb", () => {
+  let tmp: string;
+
+  beforeAll(() => {
+    tmp = mkdtempSync(join(tmpdir(), "hf-audio-residual-test-"));
+    // Three test wavs: two identical 1-second 440 Hz sines, and an 880 Hz
+    // sine that's audibly different from the 440 Hz reference.
+    for (const [name, freq] of [
+      ["sine-440-a.wav", 440],
+      ["sine-440-b.wav", 440],
+      ["sine-880.wav", 880],
+    ] as const) {
+      const result = spawnSync(
+        "ffmpeg",
+        [
+          "-nostdin",
+          "-v",
+          "error",
+          "-f",
+          "lavfi",
+          "-i",
+          `sine=frequency=${freq}:duration=1:sample_rate=48000`,
+          "-ac",
+          "2",
+          "-c:a",
+          "pcm_s16le",
+          join(tmp, name),
+        ],
+        { encoding: "utf-8" },
+      );
+      if (result.status !== 0) {
+        throw new Error(`ffmpeg setup failed for ${name}: ${result.stderr}`);
+      }
+    }
+  });
+
+  afterAll(() => {
+    rmSync(tmp, { recursive: true, force: true });
+  });
+
+  it("returns -inf (or very low dBFS) for two identical streams", () => {
+    const result = computeAudioResidualRmsDb(
+      join(tmp, "sine-440-a.wav"),
+      join(tmp, "sine-440-b.wav"),
+    );
+    expect(result.ok).toBe(true);
+    // 440-vs-440 PCM cancels to silence; ffmpeg reports -inf which we
+    // normalize to NEGATIVE_INFINITY, OR a value well below -90 if the
+    // resampler introduces sub-bit-quantization noise.
+    expect(result.overallDb).toBeLessThan(-80);
+  });
+
+  it("fails when streams are audibly different (440 Hz vs 880 Hz)", () => {
+    const result = computeAudioResidualRmsDb(
+      join(tmp, "sine-440-a.wav"),
+      join(tmp, "sine-880.wav"),
+    );
+    expect(result.ok).toBe(false);
+    // Sines at different frequencies are uncorrelated, so nothing
+    // cancels and the residual keeps the energy of both tones. ffmpeg's
+    // `sine` source has amplitude 1/8, so expect roughly -20 dBFS.
+    expect(result.overallDb).toBeGreaterThan(-30);
+  });
+
+  it("reports ok=false when an input has no audio stream", () => {
+    // A bare empty file: ffmpeg can't probe it, so the function reports
+    // a parse failure (ok=false, NaN). Callers decide whether to treat
+    // that as a pass (no-audio fixture) or a fail (audio expected).
+    const result = computeAudioResidualRmsDb("/dev/null", join(tmp, "sine-440-a.wav"));
+    expect(result.ok).toBe(false);
+    expect(Number.isNaN(result.overallDb)).toBe(true);
+  });
+});
diff --git a/packages/producer/src/utils/audioRegression.ts b/packages/producer/src/utils/audioRegression.ts
index 5acc06af8..30fe3f4d7 100644
--- a/packages/producer/src/utils/audioRegression.ts
+++ b/packages/producer/src/utils/audioRegression.ts
@@ -77,3 +77,134 @@ export function compareAudioEnvelopes(
 
   return bestEnvelopeCorrelation(rendered, snapshot, maxLagWindows);
 }
+
+// ── Sample-level residual RMS ───────────────────────────────────────────────
+//
+// Rio-style precise equivalence check: subtract one audio stream from
+// the other, run `astats`, read the residual Overall RMS in dBFS.
+// Perfectly-equivalent streams produce silence (≤ -90 dBFS in practice
+// for AAC-vs-AAC); the Rio convention is `≤ -50 dBFS = effectively
+// identical`.
+//
+// This catches level/phase drift the envelope-correlation check cannot.
+// Correlation measures shape similarity at envelope granularity (2048-
+// sample windows by default); residual RMS measures sample-level
+// cancellation, so it falls out as soon as the two streams disagree by
+// a fraction of a sample in alignment or by a fraction of a dB in
+// level.
+//
+// `astats` is invoked via `ffmpeg` spawned in-process. We require ffmpeg
+// on PATH — the regression harness already requires it for encode +
+// envelope extraction.
+
+import { spawnSync } from "node:child_process";
+
+/**
+ * Result of {@link computeAudioResidualRmsDb}.
+ *
+ * `overallDb` is the residual Overall RMS reading from astats. For
+ * exact-cancellation (truly identical streams), ffmpeg returns `-inf`;
+ * this helper normalizes that to `Number.NEGATIVE_INFINITY` so callers
+ * don't have to special-case the literal string.
+ */
+export interface AudioResidualRms {
+  overallDb: number;
+  ok: boolean;
+  /** Raw stderr lines that mention `RMS level` (one per channel + overall). Useful for debugging unexpected drift. */
+  rmsLines: string[];
+}
+
+/**
+ * Compute the residual Overall RMS (dBFS) of `rendered - snapshot`.
+ *
+ * Both inputs are paths to media files containing an audio stream.
+ * They're resampled to 48 kHz stereo, the snapshot is phase-inverted,
+ * the two are summed via `amix`, and `astats` reports the residual
+ * level.
+ *
+ * Returns `{ ok: false, overallDb: NaN }` if either input lacks an
+ * audio stream, or if ffmpeg's output didn't contain a parseable RMS
+ * line — the caller decides whether that's a pass (no-audio fixture)
+ * or a fail (audio expected but missing).
+ *
+ * `maxResidualRmsDb` defaults to `-50` (Rio convention). Pass `-Infinity`
+ * to compute the value without gating it.
+ */
+export function computeAudioResidualRmsDb(
+  rendered: string,
+  snapshot: string,
+  maxResidualRmsDb = -50,
+): AudioResidualRms {
+  const proc = spawnSync(
+    "ffmpeg",
+    [
+      "-nostdin",
+      "-v",
+      "info",
+      "-i",
+      rendered,
+      "-i",
+      snapshot,
+      "-filter_complex",
+      // Align both streams (resample + stereo + zero-based PTS), invert the
+      // snapshot, sum via amix, run astats. Avoids amix's `normalize`
+      // option (not available on ffmpeg 4.x) — we use volume=-1 + amix to
+      // subtract.
+      [
+        "[0:a]aresample=48000,pan=stereo|c0=c0|c1=c1,asetpts=N/SR/TB[a0]",
+        "[1:a]aresample=48000,pan=stereo|c0=c0|c1=c1,asetpts=N/SR/TB,volume=-1[a1]",
+        "[a0][a1]amix=inputs=2:duration=shortest:dropout_transition=0,astats=metadata=1:reset=1[out]",
+      ].join(";"),
+      "-map",
+      "[out]",
+      "-f",
+      "null",
+      "-",
+    ],
+    { encoding: "utf-8" },
+  );
+
+  const stderr = proc.stderr || "";
+  // Per-channel + overall RMS lines look like:
+  //   [Parsed_astats_8 @ 0x...] Overall RMS level dB: -90.32
+  //   [Parsed_astats_8 @ 0x...] RMS level dB: -90.36         (per-channel; no "Overall" prefix)
+  // Older ffmpeg builds use `Overall RMS level: -inf dB` — handle both shapes.
+  const rmsLines = stderr.split(/\r?\n/).filter((line) => /RMS level/.test(line));
+
+  // Prefer the "Overall" line if it appears; otherwise take the max
+  // per-channel RMS (the most pessimistic channel — that's what Rio
+  // does as its fallback path).
+  const overall = pickRms(rmsLines, /Overall RMS level(?:\s*dB)?:\s*(-?inf|[-\d.]+)/i);
+  const channelMax =
+    pickRms(rmsLines, /RMS level\s*dB:\s*(-?inf|[-\d.]+)/i, "max") ??
+    pickRms(rmsLines, /RMS level:\s*(-?inf|[-\d.]+)/i, "max");
+
+  const value = overall ?? channelMax;
+  if (value === null) {
+    return { overallDb: Number.NaN, ok: false, rmsLines };
+  }
+  return {
+    overallDb: value,
+    ok: value <= maxResidualRmsDb,
+    rmsLines,
+  };
+}
+
+function pickRms(lines: string[], re: RegExp, mode: "first" | "max" = "first"): number | null {
+  const values: number[] = [];
+  for (const line of lines) {
+    const m = re.exec(line);
+    if (!m) continue;
+    const raw = m[1];
+    if (raw === "-inf" || raw === "inf") {
+      values.push(Number.NEGATIVE_INFINITY);
+    } else {
+      const n = Number.parseFloat(raw ?? "");
+      if (!Number.isNaN(n)) values.push(n);
+    }
+    if (mode === "first") break;
+  }
+  if (values.length === 0) return null;
+  if (mode === "max") return Math.max(...values);
+  return values[0] ?? null;
+}

From 54e88e36c92d3e46562dd7f2c4109594ecb852f4 Mon Sep 17 00:00:00 2001
From: James <james.russo@heygen.com>
Date: Sat, 16 May 2026 17:09:28 +0000
Subject: [PATCH 2/2] fix(producer): harden residual-RMS check (parser,
 duration guard, error surfacing)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses review feedback on PR #882:

- Stateful astats parse: modern ffmpeg emits `Overall` on its own line
  followed by per-stat lines, so the single-line `Overall RMS level dB:`
  regex never fires on 6.x/7.x/8.x. Find the `Overall` header, take the
  next `RMS level dB:` line. Single-line fallback preserved for 4.x.
- Pre-probe both inputs' audio durations and fail up-front if they differ
  by >5 ms — `amix=duration=shortest` was silently masking trailing
  audio differences.
- Surface ffmpeg/ffprobe spawn errors, signal kills, and non-zero exits
  with a stderr tail. Previously every failure mode collapsed into
  "NaN, fail" with no diagnostic.
- Extend `TestResult.audio` with `residualRmsDb` + `residualError`,
  propagate to `audio-failures.json`.
- Fix `residualSuffix` formatter: NaN (real failure) was being rendered
  as "-inf dBFS" (perfect match). Split the branch on `Number.isNaN`
  separately from `Number.isFinite` and add an explicit error label.
---
 packages/producer/src/regression-harness.ts   |  81 ++++++--
 .../producer/src/utils/audioRegression.ts     | 196 ++++++++++++++++--
 2 files changed, 242 insertions(+), 35 deletions(-)

diff --git a/packages/producer/src/regression-harness.ts b/packages/producer/src/regression-harness.ts
index b06e0cfe9..fbbf4e2ba 100644
--- a/packages/producer/src/regression-harness.ts
+++ b/packages/producer/src/regression-harness.ts
@@ -43,12 +43,12 @@ type TestMetadata = {
   minAudioCorrelation: number;
   maxAudioLagWindows: number;
   /**
-   * Optional Rio-style residual-RMS check. Subtracts the rendered audio
-   * from the baseline and reads the residual Overall RMS via `astats`.
-   * A value of `-50` (Rio's convention) treats residuals at-or-below
-   * -50 dBFS as effectively-silent — i.e. the streams are sample-level
-   * equivalent. Omit (undefined) to skip the check; in-process renders
-   * authored before this field was introduced have implicit `undefined`.
+   * Optional residual-RMS check. Subtracts the rendered audio from the
+   * baseline and reads the residual Overall RMS via `astats`. A value
+   * of `-50` treats residuals at-or-below -50 dBFS as effectively-
+   * silent — i.e. the streams are sample-level equivalent. Omit
+   * (undefined) to skip the check; fixtures authored before this field
+   * was introduced have implicit `undefined`.
    */
   maxAudioResidualRmsDb?: number;
   renderConfig: {
@@ -153,6 +153,15 @@ type TestResult = {
     passed: boolean;
     correlation: number;
     lagWindows: number;
+    /**
+     * Residual Overall RMS (dBFS) of `rendered - snapshot`. Present only
+     * when the fixture opts in via `meta.maxAudioResidualRmsDb`.
+     * `Number.NEGATIVE_INFINITY` ⇒ perfect cancellation. `NaN` ⇒ residual
+     * check could not run (missing ffmpeg, duration mismatch, ...); see
+     * `audio.residualError` for the reason.
+     */
+    residualRmsDb?: number;
+    residualError?: string;
   };
   renderedOutputPath?: string;
 };
@@ -166,6 +175,28 @@ function logPretty(message: string, emoji = "•") {
   console.error(`${emoji} ${message}`);
 }
 
+/**
+ * Format the residual-RMS suffix used in the audio-quality log line.
+ *
+ * Four states must surface distinctly:
+ *   • `null`            → fixture didn't opt into residual RMS         → "" (no suffix)
+ *   • `NaN`             → check ran but produced no parseable reading  → "error (...)"
+ *   • `-Infinity`       → perfect cancellation (identical streams)     → "-inf dBFS"
+ *   • finite number     → measured residual                            → "<value> dBFS"
+ *
+ * Pre-fix this branched on `Number.isFinite()` only, collapsing NaN
+ * (a real-failure signal) into the `-inf` label (a perfect-match signal).
+ */
+function formatResidualSuffix(residualRmsDb: number | null, error: string | undefined): string {
+  if (residualRmsDb === null && !error) return "";
+  if (error) return `, residualRMS: error (${error})`;
+  if (residualRmsDb === null || Number.isNaN(residualRmsDb)) {
+    return ", residualRMS: error (no parseable reading)";
+  }
+  if (!Number.isFinite(residualRmsDb)) return ", residualRMS: -inf dBFS";
+  return `, residualRMS: ${residualRmsDb.toFixed(2)} dBFS`;
+}
+
 function parseArgs(argv: string[]): CliOptions {
   const testNames: string[] = [];
   const excludeTags: string[] = [];
@@ -690,16 +721,29 @@ function saveFailureDetails(
 
   // Save audio failures
   if (result.audio && !result.audio.passed) {
+    const residualRmsDb = result.audio.residualRmsDb;
+    const residualError = result.audio.residualError;
+    const residualThreshold = suite.meta.maxAudioResidualRmsDb;
+    const residualExceeds =
+      residualThreshold !== undefined &&
+      typeof residualRmsDb === "number" &&
+      Number.isFinite(residualRmsDb) &&
+      residualRmsDb > residualThreshold;
     const audioReport = {
       summary: {
         correlation: result.audio.correlation,
         lagWindows: result.audio.lagWindows,
         threshold: suite.meta.minAudioCorrelation,
         maxLagWindows: suite.meta.maxAudioLagWindows,
+        ...(residualRmsDb !== undefined ? { residualRmsDb } : {}),
+        ...(residualThreshold !== undefined ? { residualThreshold } : {}),
+        ...(residualError ? { residualError } : {}),
       },
       analysis: {
         correlationBelowThreshold: result.audio.correlation < suite.meta.minAudioCorrelation,
         lagExceedsLimit: Math.abs(result.audio.lagWindows) > suite.meta.maxAudioLagWindows,
+        residualExceedsThreshold: residualExceeds,
+        residualCheckFailed: residualError !== undefined,
       },
     };
 
@@ -1071,6 +1115,7 @@ async function runTestSuite(
     let audioCorrelation = 1;
     let audioLagWindows = 0;
     let audioResidualRmsDb: number | null = null;
+    let audioResidualError: string | undefined;
 
     if (!isPngSequence) {
       logPretty("Comparing audio quality...", "🔊");
@@ -1089,12 +1134,13 @@ async function runTestSuite(
         audioLagWindows = audio.lagWindows;
         audioPassed = audio.correlation >= suite.meta.minAudioCorrelation;
 
-        // Rio-style residual RMS check, sample-level rather than
-        // envelope-level. Only runs when the fixture opts in by
-        // setting `maxAudioResidualRmsDb`; the envelope-correlation
-        // gate above stays in place either way for legacy fixtures
-        // (correlation is shape similarity; residual RMS is exact
-        // cancellation — both surface different drift classes).
+        // Sample-level residual-RMS check (complementary to the
+        // envelope-correlation gate above). Only runs when the fixture
+        // opts in via `maxAudioResidualRmsDb`; the correlation gate
+        // stays in place either way for legacy fixtures. Correlation
+        // measures shape similarity at envelope granularity; residual
+        // RMS measures sample-level cancellation — both surface
+        // different drift classes.
         if (suite.meta.maxAudioResidualRmsDb !== undefined) {
           const residual = computeAudioResidualRmsDb(
             renderedOutputPath,
@@ -1102,6 +1148,7 @@ async function runTestSuite(
             suite.meta.maxAudioResidualRmsDb,
           );
           audioResidualRmsDb = residual.overallDb;
+          audioResidualError = residual.error;
           if (!residual.ok) {
             audioPassed = false;
           }
@@ -1113,6 +1160,8 @@ async function runTestSuite(
       passed: audioPassed,
       correlation: audioCorrelation,
       lagWindows: audioLagWindows,
+      ...(audioResidualRmsDb !== null ? { residualRmsDb: audioResidualRmsDb } : {}),
+      ...(audioResidualError ? { residualError: audioResidualError } : {}),
     };
 
     console.log(
@@ -1123,15 +1172,11 @@ async function runTestSuite(
         correlation: audioCorrelation,
         lagWindows: audioLagWindows,
         residualRmsDb: audioResidualRmsDb,
+        residualError: audioResidualError,
       }),
     );
 
-    const residualSuffix =
-      audioResidualRmsDb === null
-        ? ""
-        : `, residualRMS: ${
-            Number.isFinite(audioResidualRmsDb) ? audioResidualRmsDb.toFixed(2) : "-inf"
-          } dBFS`;
+    const residualSuffix = formatResidualSuffix(audioResidualRmsDb, audioResidualError);
     if (audioPassed) {
       logPretty(
         `Audio quality: PASSED (correlation: ${audioCorrelation.toFixed(3)}, lag: ${audioLagWindows}${residualSuffix})`,
diff --git a/packages/producer/src/utils/audioRegression.ts b/packages/producer/src/utils/audioRegression.ts
index 30fe3f4d7..4659d1523 100644
--- a/packages/producer/src/utils/audioRegression.ts
+++ b/packages/producer/src/utils/audioRegression.ts
@@ -80,11 +80,11 @@ export function compareAudioEnvelopes(
 
 // ── Sample-level residual RMS ───────────────────────────────────────────────
 //
-// Rio-style precise equivalence check: subtract one audio stream from
-// the other, run `astats`, read the residual Overall RMS in dBFS.
-// Perfectly-equivalent streams produce silence (≤ -90 dBFS in practice
-// for AAC-vs-AAC); the Rio convention is `≤ -50 dBFS = effectively
-// identical`.
+// Precise sample-cancellation equivalence check: subtract one audio
+// stream from the other, run `astats`, read the residual Overall RMS in
+// dBFS. Perfectly-equivalent streams produce silence (≤ -90 dBFS in
+// practice for AAC-vs-AAC); ≤ -50 dBFS is the conventional threshold
+// for treating two streams as effectively identical.
 //
 // This catches level/phase drift the envelope-correlation check cannot.
 // Correlation measures shape similarity at envelope granularity (2048-
@@ -112,6 +112,13 @@ export interface AudioResidualRms {
   ok: boolean;
   /** Raw stderr lines that mention `RMS level` (one per channel + overall). Useful for debugging unexpected drift. */
   rmsLines: string[];
+  /**
+   * Diagnostic when the helper could not produce a residual reading
+   * (ffmpeg missing, ffprobe duration mismatch, astats output unparseable,
+   * etc.). When set, callers should treat it as a hard failure even though
+   * `overallDb` may be `NaN`.
+   */
+  error?: string;
 }
 
 /**
@@ -127,7 +134,7 @@ export interface AudioResidualRms {
  * line — the caller decides whether that's a pass (no-audio fixture)
  * or a fail (audio expected but missing).
  *
- * `maxResidualRmsDb` defaults to `-50` (Rio convention). Pass `-Infinity`
+ * `maxResidualRmsDb` defaults to `-50`. Pass `Infinity`
  * to compute the value without gating it.
  */
 export function computeAudioResidualRmsDb(
@@ -135,6 +142,38 @@ export function computeAudioResidualRmsDb(
   snapshot: string,
   maxResidualRmsDb = -50,
 ): AudioResidualRms {
+  // Pre-probe both inputs' audio durations. `amix=duration=shortest`
+  // truncates at the shorter input, which means trailing audio on the
+  // longer side never enters astats — a fixture that drops the last
+  // half-second of audio would still report a clean residual. Fail
+  // up-front instead. A single PCM sample @ 48 kHz lasts ≈ 20.83 µs;
+  // we allow 5 ms (~240 samples) of slack so trivial container/muxer
+  // rounding doesn't trip the gate.
+  const renderedDur = probeAudioDuration(rendered);
+  const snapshotDur = probeAudioDuration(snapshot);
+  if (renderedDur.error || snapshotDur.error) {
+    return {
+      overallDb: Number.NaN,
+      ok: false,
+      rmsLines: [],
+      error: renderedDur.error ?? snapshotDur.error,
+    };
+  }
+  const delta = Math.abs(renderedDur.seconds - snapshotDur.seconds);
+  const TOLERANCE_SECONDS = 0.005;
+  if (delta > TOLERANCE_SECONDS) {
+    return {
+      overallDb: Number.NaN,
+      ok: false,
+      rmsLines: [],
+      error: `audio duration mismatch: rendered=${renderedDur.seconds.toFixed(
+        4,
+      )}s, snapshot=${snapshotDur.seconds.toFixed(4)}s (Δ=${delta.toFixed(
+        4,
+      )}s > ${TOLERANCE_SECONDS}s) — amix=duration=shortest would hide the trailing difference`,
+    };
+  }
+
   const proc = spawnSync(
     "ffmpeg",
     [
@@ -164,22 +203,63 @@ export function computeAudioResidualRmsDb(
     { encoding: "utf-8" },
   );
 
+  // `spawnSync` swallows `ENOENT`, signal kills, and non-zero exits
+  // silently — without surfacing them, every failure mode collapses
+  // into "no RMS line found, NaN, fail". Surface the actual cause so
+  // CI logs are actionable.
+  if (proc.error) {
+    return {
+      overallDb: Number.NaN,
+      ok: false,
+      rmsLines: [],
+      error: `ffmpeg spawn failed: ${(proc.error as NodeJS.ErrnoException).code ?? proc.error.message}`,
+    };
+  }
+  if (proc.signal) {
+    return {
+      overallDb: Number.NaN,
+      ok: false,
+      rmsLines: [],
+      error: `ffmpeg killed by signal ${proc.signal}`,
+    };
+  }
+  if (typeof proc.status === "number" && proc.status !== 0) {
+    return {
+      overallDb: Number.NaN,
+      ok: false,
+      rmsLines: [],
+      error: `ffmpeg exited with status ${proc.status}: ${tailStderr(proc.stderr ?? "")}`,
+    };
+  }
+
   const stderr = proc.stderr || "";
-  // Per-channel + overall RMS lines look like:
-  //   [Parsed_astats_8 @ 0x...] Overall RMS level dB: -90.32
-  //   [Parsed_astats_8 @ 0x...] RMS level dB: -90.36         (per-channel; no "Overall" prefix)
-  // Older ffmpeg builds use `Overall RMS level: -inf dB` — handle both shapes.
-  const rmsLines = stderr.split(/\r?\n/).filter((line) => /RMS level/.test(line));
-
-  // Prefer the "Overall" line if it appears; otherwise take the max
-  // per-channel RMS (the most pessimistic channel — that's what Rio
-  // does as its fallback path).
-  const overall = pickRms(rmsLines, /Overall RMS level(?:\s*dB)?:\s*(-?inf|[-\d.]+)/i);
+  // Modern ffmpeg's astats emits per-channel stats first, then an
+  // `Overall` section header on its own line, then overall stats.
+  // Example (ffmpeg 6.x / 7.x / 8.x):
+  //   [Parsed_astats_0 @ 0x...] RMS level dB: -21.43         ← channel 1
+  //   [Parsed_astats_0 @ 0x...] ...
+  //   [Parsed_astats_0 @ 0x...] Overall                       ← section header (no value)
+  //   [Parsed_astats_0 @ 0x...] DC offset: ...
+  //   [Parsed_astats_0 @ 0x...] RMS level dB: -21.43         ← overall value
+  // A single-line `Overall RMS level dB:` regex never fires on these
+  // builds — the `Overall` token and `RMS level` token are on different
+  // lines. We do a stateful scan: find the `Overall` header, take the
+  // first `RMS level dB:` line that follows. Older ffmpeg builds (4.x)
+  // do emit `Overall RMS level dB:` on a single line; the
+  // single-line fallback regex covers those.
+  const lines = stderr.split(/\r?\n/);
+  const rmsLines = lines.filter((line) => /RMS level/.test(line));
+
+  const overallDb = parseOverallRms(lines) ?? parseInlineOverallRms(rmsLines);
+  // Fallback to per-channel max if the Overall section is missing
+  // (unusual ffmpeg build, or astats truncated). For a 2-channel mix
+  // this is the more pessimistic of the two channels, which is a
+  // strictly tighter gate than Overall.
   const channelMax =
     pickRms(rmsLines, /RMS level\s*dB:\s*(-?inf|[-\d.]+)/i, "max") ??
     pickRms(rmsLines, /RMS level:\s*(-?inf|[-\d.]+)/i, "max");
 
-  const value = overall ?? channelMax;
+  const value = overallDb ?? channelMax;
   if (value === null) {
     return { overallDb: Number.NaN, ok: false, rmsLines };
   }
@@ -190,6 +270,88 @@ export function computeAudioResidualRmsDb(
   };
 }
 
+/** Stateful parse: find an `Overall` header line, return the first `RMS level dB:` value after it. */
+function parseOverallRms(lines: string[]): number | null {
+  let inOverall = false;
+  for (const line of lines) {
+    // The `Overall` header is the last token on its line, after the
+    // astats prefix; the `\s*$` anchor keeps `Overall RMS level...`
+    // (the inline form for older ffmpeg) from being consumed here.
+    if (!inOverall && /\bOverall\s*$/.test(line)) {
+      inOverall = true;
+      continue;
+    }
+    if (inOverall) {
+      const m = /RMS level\s*dB:\s*(-?inf|[-\d.]+)/i.exec(line);
+      if (m && m[1] !== undefined) {
+        return m[1] === "-inf" || m[1] === "inf"
+          ? Number.NEGATIVE_INFINITY
+          : Number.parseFloat(m[1]);
+      }
+    }
+  }
+  return null;
+}
+
+/** Single-line `Overall RMS level dB: <value>` parser for older ffmpeg builds (4.x). */
+function parseInlineOverallRms(rmsLines: string[]): number | null {
+  return pickRms(rmsLines, /Overall RMS level(?:\s*dB)?:\s*(-?inf|[-\d.]+)/i);
+}
+
+/**
+ * Probe the duration of a file's first audio stream via `ffprobe`.
+ * Returns `{ seconds: NaN, error }` if `ffprobe` can't be spawned, exits
+ * non-zero, or prints no finite duration (empty, `N/A`, unparseable).
+ */
+function probeAudioDuration(file: string): { seconds: number; error?: string } {
+  const proc = spawnSync(
+    "ffprobe",
+    [
+      "-v",
+      "error",
+      "-select_streams",
+      "a:0",
+      "-show_entries",
+      "stream=duration",
+      "-of",
+      "default=noprint_wrappers=1:nokey=1",
+      file,
+    ],
+    { encoding: "utf-8" },
+  );
+  if (proc.error) {
+    return {
+      seconds: Number.NaN,
+      error: `ffprobe spawn failed for ${file}: ${(proc.error as NodeJS.ErrnoException).code ?? proc.error.message}`,
+    };
+  }
+  if (typeof proc.status === "number" && proc.status !== 0) {
+    return {
+      seconds: Number.NaN,
+      error: `ffprobe exited ${proc.status} for ${file}: ${tailStderr(proc.stderr ?? "")}`,
+    };
+  }
+  const raw = (proc.stdout ?? "").trim();
+  if (!raw || raw === "N/A") {
+    return { seconds: Number.NaN, error: `no audio stream in ${file}` };
+  }
+  const seconds = Number.parseFloat(raw);
+  if (!Number.isFinite(seconds)) {
+    return {
+      seconds: Number.NaN,
+      error: `ffprobe returned unparseable duration "${raw}" for ${file}`,
+    };
+  }
+  return { seconds };
+}
+
+function tailStderr(stderr: string, lines = 5): string {
+  const trimmed = stderr.trim();
+  if (!trimmed) return "<empty>";
+  const tail = trimmed.split(/\r?\n/).slice(-lines).join(" | ");
+  return tail.length > 500 ? `${tail.slice(0, 500)}…` : tail;
+}
+
 function pickRms(lines: string[], re: RegExp, mode: "first" | "max" = "first"): number | null {
   const values: number[] = [];
   for (const line of lines) {