From 484294f838c0afba014aed828de64a4ee8ec8187 Mon Sep 17 00:00:00 2001 From: James Date: Sat, 16 May 2026 02:08:44 +0000 Subject: [PATCH 1/2] feat(producer): add Rio-style residual-RMS check to regression harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing audio comparison in the regression harness measures the Pearson correlation between RMS envelopes of the rendered and snapshot streams. That catches shape-level drift but is insensitive to level shifts, phase offsets, or codec-quantization noise — two streams can correlate >0.9 while differing audibly. Rio's approach (rio/tests/checksum.py:compare_audio_files_ffmpeg) is sample-level: subtract the snapshot from the rendered stream, run `astats`, read the residual Overall RMS in dBFS. Identical streams cancel to silence (-inf, or sub -90 dBFS for AAC-vs-AAC); anything above -50 dBFS is considered drift. This commit adds the same check as an optional secondary gate: - utils/audioRegression.ts: new `computeAudioResidualRmsDb()` that spawns ffmpeg with the same filter graph Rio uses (aresample + pan + volume=-1 + amix + astats) and returns the parsed Overall RMS plus a pass/fail flag. - utils/audioRegression.test.ts: 3 new tests covering identical streams (-inf result), drifted streams (440Hz vs 880Hz sine), and missing-audio-stream input. - regression-harness.ts: optional `maxAudioResidualRmsDb` field in meta.json. Default is undefined (skip the check) so legacy fixtures aren't retroactively gated; new fixtures opt in by setting a threshold (e.g. -50). Harness emits `residualRmsDb` in the audio_comparison_complete JSON event and the pretty log line. The existing correlation check stays in place; the new residual check is independent. They measure complementary properties (shape vs sample-cancellation) and both should hold for a faithful render. 
--- packages/producer/src/regression-harness.ts | 51 ++++++- .../src/utils/audioRegression.test.ts | 92 +++++++++++- .../producer/src/utils/audioRegression.ts | 131 ++++++++++++++++++ 3 files changed, 269 insertions(+), 5 deletions(-) diff --git a/packages/producer/src/regression-harness.ts b/packages/producer/src/regression-harness.ts index fee1bd8a0..b06e0cfe9 100644 --- a/packages/producer/src/regression-harness.ts +++ b/packages/producer/src/regression-harness.ts @@ -18,7 +18,11 @@ import { createRenderJob, executeRenderJob } from "./services/renderOrchestrator import { compileForRender } from "./services/htmlCompiler.js"; import { validateCompilation } from "./services/compilationTester.js"; import { extractMediaMetadata } from "./utils/ffprobe.js"; -import { buildRmsEnvelope, compareAudioEnvelopes } from "./utils/audioRegression.js"; +import { + buildRmsEnvelope, + compareAudioEnvelopes, + computeAudioResidualRmsDb, +} from "./utils/audioRegression.js"; import { parseFps, fpsToNumber } from "@hyperframes/core"; import { checkDistributedSupport, @@ -38,6 +42,15 @@ type TestMetadata = { maxFrameFailures: number; minAudioCorrelation: number; maxAudioLagWindows: number; + /** + * Optional Rio-style residual-RMS check. Subtracts the rendered audio + * from the baseline and reads the residual Overall RMS via `astats`. + * A value of `-50` (Rio's convention) treats residuals at-or-below + * -50 dBFS as effectively-silent โ€” i.e. the streams are sample-level + * equivalent. Omit (undefined) to skip the check; in-process renders + * authored before this field was introduced have implicit `undefined`. + */ + maxAudioResidualRmsDb?: number; renderConfig: { /** * Frame rate. Stored on disk as a JSON number (integer fps, e.g. 
`30`) @@ -229,6 +242,12 @@ function validateMetadata(meta: unknown): TestMetadata { if (typeof m.maxAudioLagWindows !== "number" || m.maxAudioLagWindows < 1) { throw new Error("meta.json: 'maxAudioLagWindows' must be >= 1"); } + if ( + m.maxAudioResidualRmsDb !== undefined && + (typeof m.maxAudioResidualRmsDb !== "number" || !Number.isFinite(m.maxAudioResidualRmsDb)) + ) { + throw new Error("meta.json: 'maxAudioResidualRmsDb' must be a finite number when present"); + } if (!m.renderConfig || typeof m.renderConfig !== "object") { throw new Error("meta.json: 'renderConfig' must be an object"); } @@ -1051,6 +1070,7 @@ async function runTestSuite( let audioPassed = true; let audioCorrelation = 1; let audioLagWindows = 0; + let audioResidualRmsDb: number | null = null; if (!isPngSequence) { logPretty("Comparing audio quality...", "๐Ÿ”Š"); @@ -1068,6 +1088,24 @@ async function runTestSuite( audioCorrelation = audio.correlation; audioLagWindows = audio.lagWindows; audioPassed = audio.correlation >= suite.meta.minAudioCorrelation; + + // Rio-style residual RMS check, sample-level rather than + // envelope-level. Only runs when the fixture opts in by + // setting `maxAudioResidualRmsDb`; the envelope-correlation + // gate above stays in place either way for legacy fixtures + // (correlation is shape similarity; residual RMS is exact + // cancellation โ€” both surface different drift classes). + if (suite.meta.maxAudioResidualRmsDb !== undefined) { + const residual = computeAudioResidualRmsDb( + renderedOutputPath, + snapshotVideoPath, + suite.meta.maxAudioResidualRmsDb, + ); + audioResidualRmsDb = residual.overallDb; + if (!residual.ok) { + audioPassed = false; + } + } } } @@ -1084,17 +1122,24 @@ async function runTestSuite( passed: audioPassed, correlation: audioCorrelation, lagWindows: audioLagWindows, + residualRmsDb: audioResidualRmsDb, }), ); + const residualSuffix = + audioResidualRmsDb === null + ? "" + : `, residualRMS: ${ + Number.isFinite(audioResidualRmsDb) ? 
audioResidualRmsDb.toFixed(2) : "-inf" + } dBFS`; if (audioPassed) { logPretty( - `Audio quality: PASSED (correlation: ${audioCorrelation.toFixed(3)}, lag: ${audioLagWindows})`, + `Audio quality: PASSED (correlation: ${audioCorrelation.toFixed(3)}, lag: ${audioLagWindows}${residualSuffix})`, "โœ“", ); } else { logPretty( - `Audio quality: FAILED (correlation: ${audioCorrelation.toFixed(3)}, threshold: ${suite.meta.minAudioCorrelation})`, + `Audio quality: FAILED (correlation: ${audioCorrelation.toFixed(3)}, threshold: ${suite.meta.minAudioCorrelation}${residualSuffix})`, "โœ—", ); } diff --git a/packages/producer/src/utils/audioRegression.test.ts b/packages/producer/src/utils/audioRegression.test.ts index ba7c6521e..814bd622e 100644 --- a/packages/producer/src/utils/audioRegression.test.ts +++ b/packages/producer/src/utils/audioRegression.test.ts @@ -1,5 +1,13 @@ -import { describe, expect, it } from "vitest"; -import { buildRmsEnvelope, compareAudioEnvelopes } from "./audioRegression.js"; +import { spawnSync } from "node:child_process"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterAll, beforeAll, describe, expect, it } from "vitest"; +import { + buildRmsEnvelope, + compareAudioEnvelopes, + computeAudioResidualRmsDb, +} from "./audioRegression.js"; describe("compareAudioEnvelopes", () => { it("treats silent-vs-silent audio as a perfect match", () => { @@ -14,3 +22,83 @@ describe("compareAudioEnvelopes", () => { }); }); }); + +// Skip the spawn-based tests entirely on hosts without ffmpeg. The +// regression harness only runs in environments where ffmpeg is present +// (`Dockerfile.test`, dev boxes with apt's ffmpeg), so an absent ffmpeg +// is a developer-laptop fact, not a producer regression. 
+const HAS_FFMPEG = spawnSync("ffmpeg", ["-version"], { encoding: "utf-8" }).status === 0; + +describe.skipIf(!HAS_FFMPEG)("computeAudioResidualRmsDb", () => { + let tmp: string; + + beforeAll(() => { + tmp = mkdtempSync(join(tmpdir(), "hf-audio-residual-test-")); + // Two test wavs: identical 1-second 440 Hz sine, and a 880 Hz sine + // that's audibly different from the 440 reference. + for (const [name, freq] of [ + ["sine-440-a.wav", 440], + ["sine-440-b.wav", 440], + ["sine-880.wav", 880], + ] as const) { + const result = spawnSync( + "ffmpeg", + [ + "-nostdin", + "-v", + "error", + "-f", + "lavfi", + "-i", + `sine=frequency=${freq}:duration=1:sample_rate=48000`, + "-ac", + "2", + "-c:a", + "pcm_s16le", + join(tmp, name), + ], + { encoding: "utf-8" }, + ); + if (result.status !== 0) { + throw new Error(`ffmpeg setup failed for ${name}: ${result.stderr}`); + } + } + }); + + afterAll(() => { + rmSync(tmp, { recursive: true, force: true }); + }); + + it("returns -inf (or very low dBFS) for two identical streams", () => { + const result = computeAudioResidualRmsDb( + join(tmp, "sine-440-a.wav"), + join(tmp, "sine-440-b.wav"), + ); + expect(result.ok).toBe(true); + // 440-vs-440 PCM cancels to silence; ffmpeg reports -inf which we + // normalize to NEGATIVE_INFINITY, OR a value well below -90 if the + // resampler introduces sub-bit-quantization noise. + expect(result.overallDb).toBeLessThan(-80); + }); + + it("fails when streams are audibly different (440 Hz vs 880 Hz)", () => { + const result = computeAudioResidualRmsDb( + join(tmp, "sine-440-a.wav"), + join(tmp, "sine-880.wav"), + ); + expect(result.ok).toBe(false); + // The residual of two uncorrelated unit-amplitude sines is roughly + // the sum of both signals at near-full level โ€” typically around + // -3 dBFS in this resampled-stereo configuration. 
+ expect(result.overallDb).toBeGreaterThan(-30); + }); + + it("reports ok=false when an input has no audio stream", () => { + // A bare empty file: ffmpeg can't probe it, so the function reports + // a parse failure (ok=false, NaN). Callers decide whether to treat + // that as a pass (no-audio fixture) or a fail (audio expected). + const result = computeAudioResidualRmsDb("/dev/null", join(tmp, "sine-440-a.wav")); + expect(result.ok).toBe(false); + expect(Number.isNaN(result.overallDb)).toBe(true); + }); +}); diff --git a/packages/producer/src/utils/audioRegression.ts b/packages/producer/src/utils/audioRegression.ts index 5acc06af8..30fe3f4d7 100644 --- a/packages/producer/src/utils/audioRegression.ts +++ b/packages/producer/src/utils/audioRegression.ts @@ -77,3 +77,134 @@ export function compareAudioEnvelopes( return bestEnvelopeCorrelation(rendered, snapshot, maxLagWindows); } + +// โ”€โ”€ Sample-level residual RMS โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// +// Rio-style precise equivalence check: subtract one audio stream from +// the other, run `astats`, read the residual Overall RMS in dBFS. +// Perfectly-equivalent streams produce silence (โ‰ค -90 dBFS in practice +// for AAC-vs-AAC); the Rio convention is `โ‰ค -50 dBFS = effectively +// identical`. +// +// This catches level/phase drift the envelope-correlation check cannot. +// Correlation measures shape similarity at envelope granularity (2048- +// sample windows by default); residual RMS measures sample-level +// cancellation, so it falls out as soon as the two streams disagree by +// a fraction of a sample in alignment or by a fraction of a dB in +// level. +// +// `astats` is invoked via `ffmpeg` spawned in-process. We require ffmpeg +// on PATH โ€” the regression harness already requires it for encode + +// envelope extraction. 
+ +import { spawnSync } from "node:child_process"; + +/** + * Result of {@link computeAudioResidualRmsDb}. + * + * `overallDb` is the residual Overall RMS reading from astats. For + * exact-cancellation (truly identical streams), ffmpeg returns `-inf`; + * this helper normalizes that to `Number.NEGATIVE_INFINITY` so callers + * don't have to special-case the literal string. + */ +export interface AudioResidualRms { + overallDb: number; + ok: boolean; + /** Raw stderr lines that mention `RMS level` (one per channel + overall). Useful for debugging unexpected drift. */ + rmsLines: string[]; +} + +/** + * Compute the residual Overall RMS (dBFS) of `rendered - snapshot`. + * + * Both inputs are paths to media files containing an audio stream. + * They're resampled to 48 kHz stereo, the snapshot is phase-inverted, + * the two are summed via `amix`, and `astats` reports the residual + * level. + * + * Returns `{ ok: false, overallDb: NaN }` if either input lacks an + * audio stream, or if ffmpeg's output didn't contain a parseable RMS + * line โ€” the caller decides whether that's a pass (no-audio fixture) + * or a fail (audio expected but missing). + * + * `maxResidualRmsDb` defaults to `-50` (Rio convention). Pass `-Infinity` + * to compute the value without gating it. + */ +export function computeAudioResidualRmsDb( + rendered: string, + snapshot: string, + maxResidualRmsDb = -50, +): AudioResidualRms { + const proc = spawnSync( + "ffmpeg", + [ + "-nostdin", + "-v", + "info", + "-i", + rendered, + "-i", + snapshot, + "-filter_complex", + // Align both streams (resample + stereo + zero-based PTS), invert the + // snapshot, sum via amix, run astats. Avoids amix's `normalize` + // option (not available on ffmpeg 4.x) โ€” we use volume=-1 + amix to + // subtract. 
+ [ + "[0:a]aresample=48000,pan=stereo|c0=c0|c1=c1,asetpts=N/SR/TB[a0]", + "[1:a]aresample=48000,pan=stereo|c0=c0|c1=c1,asetpts=N/SR/TB,volume=-1[a1]", + "[a0][a1]amix=inputs=2:duration=shortest:dropout_transition=0,astats=metadata=1:reset=1[out]", + ].join(";"), + "-map", + "[out]", + "-f", + "null", + "-", + ], + { encoding: "utf-8" }, + ); + + const stderr = proc.stderr || ""; + // Per-channel + overall RMS lines look like: + // [Parsed_astats_8 @ 0x...] Overall RMS level dB: -90.32 + // [Parsed_astats_8 @ 0x...] RMS level dB: -90.36 (per-channel; no "Overall" prefix) + // Older ffmpeg builds use `Overall RMS level: -inf dB` โ€” handle both shapes. + const rmsLines = stderr.split(/\r?\n/).filter((line) => /RMS level/.test(line)); + + // Prefer the "Overall" line if it appears; otherwise take the max + // per-channel RMS (the most pessimistic channel โ€” that's what Rio + // does as its fallback path). + const overall = pickRms(rmsLines, /Overall RMS level(?:\s*dB)?:\s*(-?inf|[-\d.]+)/i); + const channelMax = + pickRms(rmsLines, /RMS level\s*dB:\s*(-?inf|[-\d.]+)/i, "max") ?? + pickRms(rmsLines, /RMS level:\s*(-?inf|[-\d.]+)/i, "max"); + + const value = overall ?? channelMax; + if (value === null) { + return { overallDb: Number.NaN, ok: false, rmsLines }; + } + return { + overallDb: value, + ok: value <= maxResidualRmsDb, + rmsLines, + }; +} + +function pickRms(lines: string[], re: RegExp, mode: "first" | "max" = "first"): number | null { + const values: number[] = []; + for (const line of lines) { + const m = re.exec(line); + if (!m) continue; + const raw = m[1]; + if (raw === "-inf" || raw === "inf") { + values.push(Number.NEGATIVE_INFINITY); + } else { + const n = Number.parseFloat(raw ?? ""); + if (!Number.isNaN(n)) values.push(n); + } + if (mode === "first") break; + } + if (values.length === 0) return null; + if (mode === "max") return Math.max(...values); + return values[0] ?? 
null; +} From 54e88e36c92d3e46562dd7f2c4109594ecb852f4 Mon Sep 17 00:00:00 2001 From: James Date: Sat, 16 May 2026 17:09:28 +0000 Subject: [PATCH 2/2] fix(producer): harden residual-RMS check (parser, duration guard, error surfacing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review feedback on PR #882: - Stateful astats parse: modern ffmpeg emits `Overall` on its own line followed by per-stat lines, so the single-line `Overall RMS level dB:` regex never fires on 6.x/7.x/8.x. Find the `Overall` header, take the next `RMS level dB:` line. Single-line fallback preserved for 4.x. - Pre-probe both inputs' audio durations and fail up-front if they differ by >5 ms — `amix=duration=shortest` was silently masking trailing audio differences. - Surface ffmpeg/ffprobe spawn errors, signal kills, and non-zero exits with a stderr tail. Previously every failure mode collapsed into "NaN, fail" with no diagnostic. - Extend `TestResult.audio` with `residualRmsDb` + `residualError`, propagate to `audio-failures.json`. - Fix `residualSuffix` formatter: NaN (real failure) was being rendered as "-inf dBFS" (perfect match). Split the branch on `Number.isNaN` separately from `Number.isFinite` and add an explicit error label. --- packages/producer/src/regression-harness.ts | 81 ++++++-- .../producer/src/utils/audioRegression.ts | 196 ++++++++++++++++-- 2 files changed, 242 insertions(+), 35 deletions(-) diff --git a/packages/producer/src/regression-harness.ts b/packages/producer/src/regression-harness.ts index b06e0cfe9..fbbf4e2ba 100644 --- a/packages/producer/src/regression-harness.ts +++ b/packages/producer/src/regression-harness.ts @@ -43,12 +43,12 @@ type TestMetadata = { minAudioCorrelation: number; maxAudioLagWindows: number; /** - * Optional Rio-style residual-RMS check. Subtracts the rendered audio - * from the baseline and reads the residual Overall RMS via `astats`. 
- * A value of `-50` (Rio's convention) treats residuals at-or-below - * -50 dBFS as effectively-silent โ€” i.e. the streams are sample-level - * equivalent. Omit (undefined) to skip the check; in-process renders - * authored before this field was introduced have implicit `undefined`. + * Optional residual-RMS check. Subtracts the rendered audio from the + * baseline and reads the residual Overall RMS via `astats`. A value + * of `-50` treats residuals at-or-below -50 dBFS as effectively- + * silent โ€” i.e. the streams are sample-level equivalent. Omit + * (undefined) to skip the check; fixtures authored before this field + * was introduced have implicit `undefined`. */ maxAudioResidualRmsDb?: number; renderConfig: { @@ -153,6 +153,15 @@ type TestResult = { passed: boolean; correlation: number; lagWindows: number; + /** + * Residual Overall RMS (dBFS) of `rendered - snapshot`. Present only + * when the fixture opts in via `meta.maxAudioResidualRmsDb`. + * `Number.NEGATIVE_INFINITY` โ‡’ perfect cancellation. `NaN` โ‡’ residual + * check could not run (missing ffmpeg, duration mismatch, ...); see + * `audio.residualError` for the reason. + */ + residualRmsDb?: number; + residualError?: string; }; renderedOutputPath?: string; }; @@ -166,6 +175,28 @@ function logPretty(message: string, emoji = "โ€ข") { console.error(`${emoji} ${message}`); } +/** + * Format the residual-RMS suffix used in the audio-quality log line. + * + * Three states must surface distinctly: + * โ€ข `null` โ†’ fixture didn't opt into residual RMS โ†’ "" (no suffix) + * โ€ข `NaN` โ†’ check ran but produced no parseable reading โ†’ "(error: ...)" + * โ€ข `-Infinity` โ†’ perfect cancellation (identical streams) โ†’ "-inf dBFS" + * โ€ข finite number โ†’ measured residual โ†’ " dBFS" + * + * Pre-fix this branched on `Number.isFinite()` only, collapsing NaN + * (a real-failure signal) into the `-inf` label (a perfect-match signal). 
+ */ +function formatResidualSuffix(residualRmsDb: number | null, error: string | undefined): string { + if (residualRmsDb === null && !error) return ""; + if (error) return `, residualRMS: error (${error})`; + if (residualRmsDb === null || Number.isNaN(residualRmsDb)) { + return ", residualRMS: error (no parseable reading)"; + } + if (!Number.isFinite(residualRmsDb)) return ", residualRMS: -inf dBFS"; + return `, residualRMS: ${residualRmsDb.toFixed(2)} dBFS`; +} + function parseArgs(argv: string[]): CliOptions { const testNames: string[] = []; const excludeTags: string[] = []; @@ -690,16 +721,29 @@ function saveFailureDetails( // Save audio failures if (result.audio && !result.audio.passed) { + const residualRmsDb = result.audio.residualRmsDb; + const residualError = result.audio.residualError; + const residualThreshold = suite.meta.maxAudioResidualRmsDb; + const residualExceeds = + residualThreshold !== undefined && + typeof residualRmsDb === "number" && + Number.isFinite(residualRmsDb) && + residualRmsDb > residualThreshold; const audioReport = { summary: { correlation: result.audio.correlation, lagWindows: result.audio.lagWindows, threshold: suite.meta.minAudioCorrelation, maxLagWindows: suite.meta.maxAudioLagWindows, + ...(residualRmsDb !== undefined ? { residualRmsDb } : {}), + ...(residualThreshold !== undefined ? { residualThreshold } : {}), + ...(residualError ? 
{ residualError } : {}), }, analysis: { correlationBelowThreshold: result.audio.correlation < suite.meta.minAudioCorrelation, lagExceedsLimit: Math.abs(result.audio.lagWindows) > suite.meta.maxAudioLagWindows, + residualExceedsThreshold: residualExceeds, + residualCheckFailed: residualError !== undefined, }, }; @@ -1071,6 +1115,7 @@ async function runTestSuite( let audioCorrelation = 1; let audioLagWindows = 0; let audioResidualRmsDb: number | null = null; + let audioResidualError: string | undefined; if (!isPngSequence) { logPretty("Comparing audio quality...", "๐Ÿ”Š"); @@ -1089,12 +1134,13 @@ async function runTestSuite( audioLagWindows = audio.lagWindows; audioPassed = audio.correlation >= suite.meta.minAudioCorrelation; - // Rio-style residual RMS check, sample-level rather than - // envelope-level. Only runs when the fixture opts in by - // setting `maxAudioResidualRmsDb`; the envelope-correlation - // gate above stays in place either way for legacy fixtures - // (correlation is shape similarity; residual RMS is exact - // cancellation โ€” both surface different drift classes). + // Sample-level residual-RMS check (complementary to the + // envelope-correlation gate above). Only runs when the fixture + // opts in via `maxAudioResidualRmsDb`; the correlation gate + // stays in place either way for legacy fixtures. Correlation + // measures shape similarity at envelope granularity; residual + // RMS measures sample-level cancellation โ€” both surface + // different drift classes. 
if (suite.meta.maxAudioResidualRmsDb !== undefined) { const residual = computeAudioResidualRmsDb( renderedOutputPath, @@ -1102,6 +1148,7 @@ async function runTestSuite( suite.meta.maxAudioResidualRmsDb, ); audioResidualRmsDb = residual.overallDb; + audioResidualError = residual.error; if (!residual.ok) { audioPassed = false; } @@ -1113,6 +1160,8 @@ async function runTestSuite( passed: audioPassed, correlation: audioCorrelation, lagWindows: audioLagWindows, + ...(audioResidualRmsDb !== null ? { residualRmsDb: audioResidualRmsDb } : {}), + ...(audioResidualError ? { residualError: audioResidualError } : {}), }; console.log( @@ -1123,15 +1172,11 @@ async function runTestSuite( correlation: audioCorrelation, lagWindows: audioLagWindows, residualRmsDb: audioResidualRmsDb, + residualError: audioResidualError, }), ); - const residualSuffix = - audioResidualRmsDb === null - ? "" - : `, residualRMS: ${ - Number.isFinite(audioResidualRmsDb) ? audioResidualRmsDb.toFixed(2) : "-inf" - } dBFS`; + const residualSuffix = formatResidualSuffix(audioResidualRmsDb, audioResidualError); if (audioPassed) { logPretty( `Audio quality: PASSED (correlation: ${audioCorrelation.toFixed(3)}, lag: ${audioLagWindows}${residualSuffix})`, diff --git a/packages/producer/src/utils/audioRegression.ts b/packages/producer/src/utils/audioRegression.ts index 30fe3f4d7..4659d1523 100644 --- a/packages/producer/src/utils/audioRegression.ts +++ b/packages/producer/src/utils/audioRegression.ts @@ -80,11 +80,11 @@ export function compareAudioEnvelopes( // โ”€โ”€ Sample-level residual RMS โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ // -// Rio-style precise equivalence check: subtract one audio stream from -// the other, run `astats`, read the residual Overall RMS in dBFS. 
-// Perfectly-equivalent streams produce silence (โ‰ค -90 dBFS in practice -// for AAC-vs-AAC); the Rio convention is `โ‰ค -50 dBFS = effectively -// identical`. +// Precise sample-cancellation equivalence check: subtract one audio +// stream from the other, run `astats`, read the residual Overall RMS in +// dBFS. Perfectly-equivalent streams produce silence (โ‰ค -90 dBFS in +// practice for AAC-vs-AAC); โ‰ค -50 dBFS is the conventional threshold +// for treating two streams as effectively identical. // // This catches level/phase drift the envelope-correlation check cannot. // Correlation measures shape similarity at envelope granularity (2048- @@ -112,6 +112,13 @@ export interface AudioResidualRms { ok: boolean; /** Raw stderr lines that mention `RMS level` (one per channel + overall). Useful for debugging unexpected drift. */ rmsLines: string[]; + /** + * Diagnostic when the helper could not produce a residual reading + * (ffmpeg missing, ffprobe duration mismatch, astats output unparseable, + * etc.). When set, callers should treat it as a hard failure even though + * `overallDb` may be `NaN`. + */ + error?: string; } /** @@ -127,7 +134,7 @@ export interface AudioResidualRms { * line โ€” the caller decides whether that's a pass (no-audio fixture) * or a fail (audio expected but missing). * - * `maxResidualRmsDb` defaults to `-50` (Rio convention). Pass `-Infinity` + * `maxResidualRmsDb` defaults to `-50`. Pass `-Infinity` * to compute the value without gating it. */ export function computeAudioResidualRmsDb( @@ -135,6 +142,38 @@ export function computeAudioResidualRmsDb( snapshot: string, maxResidualRmsDb = -50, ): AudioResidualRms { + // Pre-probe both inputs' audio durations. `amix=duration=shortest` + // truncates at the shorter input, which means trailing audio on the + // longer side never enters astats โ€” a fixture that drops the last + // half-second of audio would still report a clean residual. Fail + // up-front instead. 
One-frame tolerance @ 48 kHz โ‰ˆ 20.83 ยตs (one + // audio frame); we widen to 5 ms (~240 samples) so trivial container + // muxer rounding doesn't trip the gate. + const renderedDur = probeAudioDuration(rendered); + const snapshotDur = probeAudioDuration(snapshot); + if (renderedDur.error || snapshotDur.error) { + return { + overallDb: Number.NaN, + ok: false, + rmsLines: [], + error: renderedDur.error ?? snapshotDur.error, + }; + } + const delta = Math.abs(renderedDur.seconds - snapshotDur.seconds); + const TOLERANCE_SECONDS = 0.005; + if (delta > TOLERANCE_SECONDS) { + return { + overallDb: Number.NaN, + ok: false, + rmsLines: [], + error: `audio duration mismatch: rendered=${renderedDur.seconds.toFixed( + 4, + )}s, snapshot=${snapshotDur.seconds.toFixed(4)}s (ฮ”=${delta.toFixed( + 4, + )}s > ${TOLERANCE_SECONDS}s) โ€” amix=duration=shortest would hide the trailing difference`, + }; + } + const proc = spawnSync( "ffmpeg", [ @@ -164,22 +203,63 @@ export function computeAudioResidualRmsDb( { encoding: "utf-8" }, ); + // `spawnSync` swallows `ENOENT`, signal kills, and non-zero exits + // silently โ€” without surfacing them, every failure mode collapses + // into "no RMS line found, NaN, fail". Surface the actual cause so + // CI logs are actionable. + if (proc.error) { + return { + overallDb: Number.NaN, + ok: false, + rmsLines: [], + error: `ffmpeg spawn failed: ${(proc.error as NodeJS.ErrnoException).code ?? proc.error.message}`, + }; + } + if (proc.signal) { + return { + overallDb: Number.NaN, + ok: false, + rmsLines: [], + error: `ffmpeg killed by signal ${proc.signal}`, + }; + } + if (typeof proc.status === "number" && proc.status !== 0) { + return { + overallDb: Number.NaN, + ok: false, + rmsLines: [], + error: `ffmpeg exited with status ${proc.status}: ${tailStderr(proc.stderr ?? "")}`, + }; + } + const stderr = proc.stderr || ""; - // Per-channel + overall RMS lines look like: - // [Parsed_astats_8 @ 0x...] 
Overall RMS level dB: -90.32 - // [Parsed_astats_8 @ 0x...] RMS level dB: -90.36 (per-channel; no "Overall" prefix) - // Older ffmpeg builds use `Overall RMS level: -inf dB` โ€” handle both shapes. - const rmsLines = stderr.split(/\r?\n/).filter((line) => /RMS level/.test(line)); - - // Prefer the "Overall" line if it appears; otherwise take the max - // per-channel RMS (the most pessimistic channel โ€” that's what Rio - // does as its fallback path). - const overall = pickRms(rmsLines, /Overall RMS level(?:\s*dB)?:\s*(-?inf|[-\d.]+)/i); + // Modern ffmpeg's astats emits per-channel stats first, then an + // `Overall` section header on its own line, then overall stats. + // Example (ffmpeg 6.x / 7.x / 8.x): + // [Parsed_astats_0 @ 0x...] RMS level dB: -21.43 โ† channel 1 + // [Parsed_astats_0 @ 0x...] ... + // [Parsed_astats_0 @ 0x...] Overall โ† section header (no value) + // [Parsed_astats_0 @ 0x...] DC offset: ... + // [Parsed_astats_0 @ 0x...] RMS level dB: -21.43 โ† overall value + // A single-line `Overall RMS level dB:` regex never fires on these + // builds โ€” the `Overall` token and `RMS level` token are on different + // lines. We do a stateful scan: find the `Overall` header, take the + // first `RMS level dB:` line that follows. Older ffmpeg builds (4.x) + // do emit `Overall RMS level dB:` on a single line; the + // single-line fallback regex covers those. + const lines = stderr.split(/\r?\n/); + const rmsLines = lines.filter((line) => /RMS level/.test(line)); + + const overallDb = parseOverallRms(lines) ?? parseInlineOverallRms(rmsLines); + // Fallback to per-channel max if the Overall section is missing + // (unusual ffmpeg build, or astats truncated). For a 2-channel mix + // this is the more pessimistic of the two channels, which is a + // strictly tighter gate than Overall. const channelMax = pickRms(rmsLines, /RMS level\s*dB:\s*(-?inf|[-\d.]+)/i, "max") ?? pickRms(rmsLines, /RMS level:\s*(-?inf|[-\d.]+)/i, "max"); - const value = overall ?? 
channelMax; + const value = overallDb ?? channelMax; if (value === null) { return { overallDb: Number.NaN, ok: false, rmsLines }; } @@ -190,6 +270,88 @@ export function computeAudioResidualRmsDb( }; } +/** Stateful parse: find an `Overall` header line, return the first `RMS level dB:` value after it. */ +function parseOverallRms(lines: string[]): number | null { + let inOverall = false; + for (const line of lines) { + // The `Overall` header is the literal token at end of an astats + // prefix; match on word boundary so `Overall RMS level...` (the + // inline form for older ffmpeg) isn't accidentally consumed here. + if (!inOverall && /\bOverall\s*$/.test(line)) { + inOverall = true; + continue; + } + if (inOverall) { + const m = /RMS level\s*dB:\s*(-?inf|[-\d.]+)/i.exec(line); + if (m && m[1] !== undefined) { + return m[1] === "-inf" || m[1] === "inf" + ? Number.NEGATIVE_INFINITY + : Number.parseFloat(m[1]); + } + } + } + return null; +} + +/** Single-line `Overall RMS level dB: ` parser for older ffmpeg builds (4.x). */ +function parseInlineOverallRms(rmsLines: string[]): number | null { + return pickRms(rmsLines, /Overall RMS level(?:\s*dB)?:\s*(-?inf|[-\d.]+)/i); +} + +/** + * Probe a media file's audio-stream duration via `ffprobe`. Returns + * `{ seconds: NaN, error }` if the file has no audio stream or + * `ffprobe` can't be invoked. + */ +function probeAudioDuration(file: string): { seconds: number; error?: string } { + const proc = spawnSync( + "ffprobe", + [ + "-v", + "error", + "-select_streams", + "a:0", + "-show_entries", + "stream=duration", + "-of", + "default=noprint_wrappers=1:nokey=1", + file, + ], + { encoding: "utf-8" }, + ); + if (proc.error) { + return { + seconds: Number.NaN, + error: `ffprobe spawn failed for ${file}: ${(proc.error as NodeJS.ErrnoException).code ?? 
proc.error.message}`, + }; + } + if (typeof proc.status === "number" && proc.status !== 0) { + return { + seconds: Number.NaN, + error: `ffprobe exited ${proc.status} for ${file}: ${tailStderr(proc.stderr ?? "")}`, + }; + } + const raw = (proc.stdout ?? "").trim(); + if (!raw || raw === "N/A") { + return { seconds: Number.NaN, error: `no audio stream in ${file}` }; + } + const seconds = Number.parseFloat(raw); + if (!Number.isFinite(seconds)) { + return { + seconds: Number.NaN, + error: `ffprobe returned unparseable duration "${raw}" for ${file}`, + }; + } + return { seconds }; +} + +function tailStderr(stderr: string, lines = 5): string { + const trimmed = stderr.trim(); + if (!trimmed) return ""; + const tail = trimmed.split(/\r?\n/).slice(-lines).join(" | "); + return tail.length > 500 ? `${tail.slice(0, 500)}โ€ฆ` : tail; +} + function pickRms(lines: string[], re: RegExp, mode: "first" | "max" = "first"): number | null { const values: number[] = []; for (const line of lines) {