diff --git a/.github/workflows/diarizer-benchmark.yml b/.github/workflows/diarizer-benchmark.yml index 0775ad844..eaf5f8fd8 100644 --- a/.github/workflows/diarizer-benchmark.yml +++ b/.github/workflows/diarizer-benchmark.yml @@ -44,6 +44,12 @@ jobs: path: ~/FluidAudioDatasets/ami_official key: ${{ runner.os }}-ami-dataset-${{ hashFiles('Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift') }} + - name: Cache AMI annotations + uses: actions/cache@v4 + with: + path: Datasets/ami_public_1.6.2 + key: ${{ runner.os }}-ami-annotations-1.6.2 + - name: Build package run: swift build -c release @@ -123,7 +129,9 @@ jobs: fi - name: Comment PR with Benchmark Results - if: always() + # Only comment when metrics were actually extracted — a failed run must + # show up as a red check, not a comment with garbage numbers (issue #752) + if: steps.extract.outcome == 'success' uses: actions/github-script@v7 with: script: | diff --git a/.github/workflows/offline-pipeline.yml b/.github/workflows/offline-pipeline.yml index 25cab0278..c35a246f7 100644 --- a/.github/workflows/offline-pipeline.yml +++ b/.github/workflows/offline-pipeline.yml @@ -24,6 +24,12 @@ jobs: with: swift-version: "6.1" + - name: Cache AMI annotations + uses: actions/cache@v4 + with: + path: Datasets/ami_public_1.6.2 + key: ${{ runner.os }}-ami-annotations-1.6.2 + - name: Build package run: swift build -c release diff --git a/.github/workflows/sortformer-benchmark.yml b/.github/workflows/sortformer-benchmark.yml index a3e04d662..8df3cadd7 100644 --- a/.github/workflows/sortformer-benchmark.yml +++ b/.github/workflows/sortformer-benchmark.yml @@ -44,6 +44,12 @@ jobs: path: ~/FluidAudioDatasets/ami_official key: ${{ runner.os }}-ami-dataset + - name: Cache AMI annotations + uses: actions/cache@v4 + with: + path: Datasets/ami_public_1.6.2 + key: ${{ runner.os }}-ami-annotations-1.6.2 + - name: Build package run: swift build -c release diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md index 
455870931..83111c5d6 100644 --- a/Documentation/Benchmarks.md +++ b/Documentation/Benchmarks.md @@ -589,43 +589,47 @@ Step Ratio 1, min duration 0 (edited) Note that the baseline pytorch version is ~11% DER, we lost some precision dropping down to fp16 precision in order to run most of the embedding model on neural engine. But as a result, we significantly out perform the baseline `mps` backend as well. the pyannote-community-1 on cpu is ~1.5-2 RTFx, on mps, it's ~20-25 RTFx. -Running on the full AMI SDM 16-meeting test set (official NeMo/pyannote evaluation split: EN2002, ES2004, IS1009, TS3003 × a-d): +Running on the full AMI SDM 16-meeting test set (official NeMo/pyannote evaluation split: EN2002, ES2004, IS1009, TS3003 × a-d). Re-run 2026-07-03 on an Apple M5 Pro: ```bash swift run -c release fluidaudiocli diarization-benchmark --mode offline \ - --dataset ami-sdm --auto-download + --dataset ami-sdm --auto-download --threshold 0.7 ``` ```text ------------------------------------------------------------------------------------------ Meeting DER % JER % Miss % FA % SE % Speakers RTFx ------------------------------------------------------------------------------------------ -IS1009c 5.1 5.9 3.1 1.5 0.6 4/4 94.6 -IS1009b 5.4 6.4 2.8 1.4 1.1 4/4 77.6 -ES2004b 6.0 7.0 2.7 2.2 1.1 4/4 70.4 -ES2004c 6.4 7.3 2.0 3.4 1.0 4/4 70.5 -EN2002c 7.8 9.7 5.1 0.5 2.2 3/3 60.3 -TS3003b 8.0 7.8 3.6 3.7 0.7 4/4 71.4 -TS3003c 9.0 8.7 6.1 1.9 0.9 4/4 70.4 -EN2002b 9.1 12.9 4.0 1.9 3.2 5/4 63.4 -IS1009d 9.2 11.7 4.5 2.6 2.2 4/4 91.6 -IS1009a 9.9 11.9 5.0 2.5 2.4 4/4 60.8 -ES2004a 10.4 13.4 7.5 1.6 1.4 4/4 60.0 -EN2002a 10.6 15.0 5.4 1.2 4.0 4/4 52.2 -ES2004d 11.4 16.4 5.3 2.6 3.5 4/4 62.1 -TS3003a 17.2 64.1 13.1 1.3 2.8 2/4 68.7 -EN2002d 18.3 38.2 4.6 1.5 12.2 3/4 78.6 -TS3003d 26.0 41.6 11.0 2.2 12.8 3/4 64.5 +IS1009c 5.1 5.9 3.1 1.5 0.6 4/4 335.3 +IS1009b 5.4 6.4 2.8 1.4 1.1 4/4 337.1 +ES2004b 6.0 7.0 2.7 2.2 1.1 4/4 314.0 +ES2004c 6.4 7.3 2.0 3.4 1.0 4/4 304.0 +EN2002c 7.8 9.7 5.1 
0.5 2.2 3/3 305.0 +TS3003b 8.0 7.8 3.6 3.7 0.7 4/4 307.3 +TS3003c 9.0 8.7 6.1 1.9 0.9 4/4 350.0 +EN2002b 9.1 12.9 4.0 1.9 3.2 5/4 318.5 +IS1009d 9.2 11.7 4.5 2.6 2.2 4/4 320.9 +IS1009a 9.9 11.9 5.0 2.5 2.4 4/4 342.9 +ES2004a 10.4 13.4 7.5 1.6 1.4 4/4 334.0 +EN2002a 10.6 15.0 5.4 1.2 4.0 4/4 298.7 +ES2004d 11.4 16.4 5.3 2.6 3.5 4/4 329.4 +TS3003a 17.2 64.1 13.1 1.3 2.8 2/4 346.4 +EN2002d 18.3 38.2 4.6 1.5 12.2 3/4 302.5 +TS3003d 26.0 41.6 11.0 2.2 12.8 3/4 324.6 ------------------------------------------------------------------------------------------ -AVERAGE 10.6 17.4 5.4 2.0 3.3 - 69.8 +AVERAGE 10.6 17.4 5.4 2.0 3.3 - 323.2 ========================================================================================== ``` -12/16 meetings detect the correct speaker count. Average DER 10.62% matches published pyannote-community-1 offline numbers on this split (~11-12%). +12/16 meetings detect the correct speaker count. Average DER 10.62% matches published pyannote-community-1 offline numbers on this split (~11-12%). Results are fully deterministic (two consecutive runs produce bit-identical metrics). + +**Clustering threshold matters on this split.** The table above uses `--threshold 0.7`. Since #616 the CLI default is the community-1 preset (0.6), which merges clusters more aggressively and undercounts speakers on 4 meetings (EN2002a 2/4 → 41.9% DER, ES2004d 3/4 → 34.8%, EN2002b 19.0%, IS1009d 16.4%), degrading the average to 15.5% DER. Pass `--threshold 0.7` (or set `clusteringThreshold: 0.7` in `OfflineDiarizerConfig`) for AMI-SDM-like meeting audio. ### Streaming/online Diarization -This is more tricky and honestly a lot more fragile to clustering. Expect +10-15% worse DER for the streaming implementation. Only use this when you critically need realtime streaming speaker diarization. In most cases, offline is more than enough for most applications. +This is more tricky and honestly a lot more fragile to clustering. 
Expect substantially worse DER than the offline pipeline — the tail meetings suffer heavy speaker confusion. Only use this when you critically need real-time streaming speaker diarization. For most applications, offline diarization is more than enough. + +All streaming tables below were re-run 2026-07-03 on an Apple M5 Pro against the official AMI SDM 16-meeting test set (same split as the offline table above; earlier revisions of these tables used a different 7-meeting subset, so numbers are not directly comparable to those). Running a near real-time diarization benchmark for 3s chunks, 1s overlap, and 0.85 clustering threshold: ```bash @@ -641,26 +645,35 @@ swift run fluidaudiocli diarization-benchmark --mode streaming \ ------------------------------------------------------------------------------------------ Meeting DER % JER % Miss % FA % SE % Speakers RTFx ------------------------------------------------------------------------------------------ -ES2004a 31.6 41.6 6.7 2.1 22.7 7/4 49.8 -ES2005a 39.7 65.0 6.9 7.3 25.5 5/4 59.1 -IS1002b 40.4 51.3 1.1 5.2 34.1 9/4 45.3 -ES2002a 41.5 56.0 5.3 10.1 26.1 6/4 48.6 -ES2003a 53.1 78.7 5.3 2.3 45.5 5/4 57.1 -IS1000a 66.7 74.0 6.1 7.6 53.0 7/4 50.7 -IS1001a 75.0 88.6 7.1 4.7 63.2 10/4 48.8 +TS3003a 19.3 37.0 7.3 3.0 9.1 5/4 26.1 +ES2004d 21.7 36.9 4.8 3.1 13.8 6/4 24.5 +IS1009d 24.4 38.4 2.0 4.6 17.8 10/4 24.7 +IS1009a 29.8 46.0 2.8 3.7 23.4 6/4 25.3 +TS3003c 31.6 45.9 4.2 3.4 24.0 5/4 25.4 +ES2004a 31.7 41.6 6.7 2.1 22.8 7/4 24.9 +TS3003b 55.3 63.5 3.0 5.3 47.0 5/4 24.4 +EN2002d 60.3 62.8 5.3 2.2 52.8 7/4 23.6 +IS1009c 61.0 70.7 1.4 3.8 55.8 6/4 24.7 +EN2002a 63.0 62.2 4.8 2.0 56.2 7/4 23.4 +EN2002c 67.0 70.9 4.7 2.0 60.4 9/3 23.7 +ES2004b 69.5 71.2 3.2 2.6 63.8 9/4 24.2 +ES2004c 70.8 68.2 2.5 3.2 65.2 7/4 24.2 +TS3003d 74.8 89.2 7.0 3.5 64.3 7/4 24.9 +EN2002b 83.9 84.7 3.6 2.3 77.9 11/4 24.2 +IS1009b 88.1 90.8 1.3 2.3 84.5 7/4 24.7 
------------------------------------------------------------------------------------------ -AVERAGE 49.7 65.0 5.5 5.6 38.6 - 51.4 +AVERAGE 53.3 61.2 4.0 3.1 46.2 - 24.6 ========================================================================================== ``` -Diarization benchmark with 10s chunks, 0s overlap, and 0.7 clustering threshold: +Diarization benchmark with 10s chunks, 0s overlap, and 0.7 clustering threshold (best streaming configuration found on this split): ```bash swift run fluidaudiocli diarization-benchmark --mode streaming \ - --dataset ami-sdm - --threshold 0.7 - --auto-download - --chunk-seconds 10.0 + --dataset ami-sdm \ + --threshold 0.7 \ + --auto-download \ + --chunk-seconds 10.0 \ --overlap-seconds 0.0 ... @@ -668,26 +681,35 @@ swift run fluidaudiocli diarization-benchmark --mode streaming \ ------------------------------------------------------------------------------------------ Meeting DER % JER % Miss % FA % SE % Speakers RTFx ------------------------------------------------------------------------------------------ -ES2003a 12.0 19.5 6.9 1.2 3.9 4/4 477.0 -ES2004a 15.1 24.8 9.2 1.2 4.7 4/4 367.4 -ES2002a 17.8 26.8 8.6 5.8 3.4 6/4 356.8 -IS1002b 38.0 41.8 3.1 3.1 31.8 5/4 361.9 -ES2005a 22.5 36.8 7.7 6.8 8.0 4/4 460.8 -IS1000a 57.7 80.6 11.9 3.9 41.9 8/4 352.1 -IS1001a 70.1 85.4 11.2 2.4 56.5 7/4 370.9 +ES2004a 15.1 24.8 9.2 1.2 4.7 4/4 208.9 +ES2004d 17.5 24.5 9.0 1.3 7.2 5/4 199.9 +TS3003a 18.8 29.3 13.0 1.2 4.5 5/4 212.2 +EN2002b 20.5 28.9 7.1 1.2 12.2 5/4 199.8 +TS3003b 20.9 26.0 7.2 2.8 10.8 4/4 214.0 +TS3003c 22.1 30.7 9.0 1.3 11.9 4/4 221.3 +IS1009c 38.2 45.6 5.7 1.4 31.1 4/4 221.0 +EN2002c 39.0 44.5 6.9 0.6 31.4 5/3 200.7 +IS1009b 39.3 45.8 4.9 1.0 33.4 6/4 215.6 +EN2002d 42.1 47.5 7.0 1.1 34.1 6/4 191.1 +IS1009d 44.1 53.5 6.7 2.0 35.3 5/4 210.5 +EN2002a 45.5 51.8 8.6 0.8 36.0 7/4 184.8 +TS3003d 52.7 68.5 13.5 1.5 37.6 4/4 197.3 +ES2004c 55.7 62.6 4.3 2.4 49.1 9/4 211.5 +IS1009a 61.0 77.5 6.0 1.9 53.1 5/4 215.5 +ES2004b 78.5 
86.9 5.1 1.8 71.6 7/4 216.3 ------------------------------------------------------------------------------------------ -AVERAGE 33.3 45.1 8.4 3.5 21.5 - 392.4 +AVERAGE 38.2 46.8 7.7 1.5 29.0 - 207.5 ========================================================================================== ``` -Diarization benchmark with 5s chunks, 0s overlap, and 0.8 clustering threshold (best configuration found): +Diarization benchmark with 5s chunks, 0s overlap, and 0.8 clustering threshold: ```bash swift run fluidaudiocli diarization-benchmark --mode streaming \ - --dataset ami-sdm - --threshold 0.8 - --auto-download - --chunk-seconds 5.0 + --dataset ami-sdm \ + --threshold 0.8 \ + --auto-download \ + --chunk-seconds 5.0 \ --overlap-seconds 0.0 ... @@ -695,15 +717,24 @@ swift run fluidaudiocli diarization-benchmark --mode streaming \ ------------------------------------------------------------------------------------------ Meeting DER % JER % Miss % FA % SE % Speakers RTFx ------------------------------------------------------------------------------------------ -IS1002b 9.8 11.7 3.5 3.8 2.6 5/4 205.2 -ES2003a 14.4 23.3 7.4 1.6 5.3 4/4 260.9 -ES2004a 17.0 26.0 9.0 1.3 6.7 7/4 218.1 -ES2005a 18.4 31.0 9.2 5.8 3.4 4/4 259.8 -ES2002a 20.8 30.5 9.5 7.4 3.9 5/4 198.0 -IS1000a 24.7 35.7 12.1 4.3 8.3 6/4 204.2 -IS1001a 78.0 94.5 13.3 3.0 61.6 6/4 215.7 +ES2004a 17.0 26.0 9.0 1.3 6.7 7/4 113.4 +IS1009a 18.1 26.5 4.7 2.7 10.8 4/4 113.2 +TS3003a 21.0 32.3 12.7 1.4 6.8 2/4 114.4 +TS3003b 21.5 26.2 7.1 4.2 10.2 4/4 109.8 +ES2004d 22.5 29.6 9.6 1.8 11.0 6/4 108.6 +IS1009c 25.5 30.7 3.4 2.5 19.6 5/4 112.9 +ES2004c 25.6 29.9 4.8 2.4 18.4 5/4 111.8 +TS3003c 33.2 44.3 8.0 2.5 22.7 4/4 114.2 +EN2002c 40.6 46.5 8.4 1.5 30.7 5/3 29.7 +EN2002b 46.4 57.2 8.2 1.2 36.9 8/4 9.9 +IS1009d 47.8 56.7 4.9 3.0 39.9 5/4 111.4 +IS1009b 56.1 63.9 2.5 1.5 52.1 5/4 111.7 +ES2004b 57.6 64.0 5.9 2.0 49.7 8/4 111.5 +EN2002d 62.4 71.3 10.1 1.4 50.9 7/4 103.4 +EN2002a 63.4 71.1 9.2 1.1 53.0 7/4 105.2 +TS3003d 66.3 
83.3 12.7 2.6 51.0 5/4 105.3 ------------------------------------------------------------------------------------------ -AVERAGE 26.2 36.1 9.2 3.9 13.1 - 223.1 +AVERAGE 39.0 47.5 7.6 2.1 29.4 - 99.1 ========================================================================================== ``` @@ -711,10 +742,10 @@ AVERAGE 26.2 36.1 9.2 3.9 13.1 - 223.1 Diarization benchmark with 5s chunks, 2s overlap, and 0.8 clustering threshold: ```bash swift run fluidaudiocli diarization-benchmark --mode streaming \ - --dataset ami-sdm - --threshold 0.8 - --auto-download - --chunk-seconds 5.0 + --dataset ami-sdm \ + --threshold 0.8 \ + --auto-download \ + --chunk-seconds 5.0 \ --overlap-seconds 2.0 ... @@ -722,18 +753,29 @@ swift run fluidaudiocli diarization-benchmark --mode streaming \ ------------------------------------------------------------------------------------------ Meeting DER % JER % Miss % FA % SE % Speakers RTFx ------------------------------------------------------------------------------------------ -ES2003a 24.5 42.1 4.7 1.9 18.0 6/4 81.4 -ES2005a 27.5 50.6 5.5 7.6 14.4 5/4 76.8 -ES2004a 31.6 54.8 6.4 2.3 23.0 5/4 66.9 -IS1002b 39.6 57.0 0.8 5.1 33.7 6/4 63.7 -ES2002a 41.1 57.2 4.7 9.8 26.7 5/4 65.5 -IS1000a 57.4 54.2 6.1 7.7 43.6 9/4 67.2 -IS1001a 79.0 86.8 7.0 5.0 66.9 10/4 64.5 +TS3003a 20.2 45.0 6.7 2.9 10.6 4/4 37.6 +ES2004c 26.8 40.7 1.7 3.5 21.6 6/4 34.9 +ES2004a 31.6 54.8 6.4 2.3 23.0 5/4 35.2 +IS1009c 38.1 39.0 1.3 4.0 32.7 7/4 36.1 +EN2002c 38.1 39.3 3.1 1.9 33.1 6/3 33.6 +ES2004d 39.0 44.3 4.4 3.1 31.5 5/4 35.1 +EN2002b 45.8 59.1 3.3 2.4 40.1 7/4 34.5 +IS1009d 55.3 62.1 2.2 4.4 48.7 8/4 35.6 +EN2002a 60.9 54.0 3.6 1.9 55.5 7/4 33.8 +ES2004b 69.5 74.6 2.4 2.9 64.1 8/4 34.9 +TS3003d 71.7 83.4 5.2 4.7 61.8 6/4 34.5 +IS1009a 72.2 75.1 1.7 3.5 67.0 5/4 36.9 +TS3003c 72.8 85.2 3.2 3.5 66.1 5/4 36.5 +IS1009b 77.6 80.3 0.8 2.1 74.6 6/4 35.9 +TS3003b 86.4 86.7 2.8 5.5 78.0 5/4 35.5 +EN2002d 88.8 92.2 3.7 2.0 83.1 9/4 33.4 
------------------------------------------------------------------------------------------ -AVERAGE 43.0 57.5 5.0 5.6 32.3 - 69.4 +AVERAGE 55.9 63.5 3.3 3.2 49.5 - 35.2 ========================================================================================== ``` +Takeaways from the 2026-07-03 sweep: on the official 16-meeting split, no-overlap configs clearly beat overlap configs (10s/0s 38.2% and 5s/0s 39.0% vs 3s/1s 53.3% and 5s/2s 55.9% average DER) — overlap increases chunk count and drives over-clustering (5-11 detected speakers vs 4 truth on the worst meetings). Note: the two overlap configs were measured one meeting per process; running many overlapping-chunk meetings back-to-back in a single process can exhaust IOSurface-backed CoreML buffers on macOS (E5RT `Failed to allocate memory IOSurface object`). + ## Sortformer Streaming Diarization NVIDIA's Sortformer model for streaming speaker diarization, converted to CoreML. diff --git a/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift b/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift index 890475ce0..1257c9400 100644 --- a/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift +++ b/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift @@ -605,6 +605,13 @@ enum StreamDiarizationBenchmark { } } + // Fail loudly if no meeting produced a result (e.g. missing ground truth + // annotations) instead of exiting cleanly with empty metrics (issue #752). 
+ guard !allResults.isEmpty else { + logger.error("❌ Benchmark produced no results — see errors above") + exit(1) + } + // Print final summary printFinalSummary(results: allResults) @@ -765,8 +772,9 @@ enum StreamDiarizationBenchmark { let totalElapsed = Date().timeIntervalSince(startTime) let finalRTFx = totalDuration / totalElapsed - // Load ground truth - let groundTruth = await AMIParser.loadAMIGroundTruth( + // Load ground truth (throws if annotations are missing — never scores + // against placeholder data, see issue #752) + let groundTruth = try AMIParser.loadAMIGroundTruth( for: meetingName, duration: Float(totalDuration) ) @@ -870,8 +878,9 @@ enum StreamDiarizationBenchmark { logger.info(" RTFx: \(String(format: "%.1f", finalRTFx))x") } - // Load ground truth - let groundTruth = await AMIParser.loadAMIGroundTruth( + // Load ground truth (throws if annotations are missing — never scores + // against placeholder data, see issue #752) + let groundTruth = try AMIParser.loadAMIGroundTruth( for: meetingName, duration: Float(totalDuration) ) diff --git a/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift b/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift index 156cdee5f..27968bf5d 100644 --- a/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift +++ b/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift @@ -487,7 +487,7 @@ enum LSEENDBenchmark { if dataset == .ami { print(" [REF] Using AMI word-aligned annotations") - referenceSegments = await AMIParser.loadWordAlignedDERReference( + referenceSegments = try AMIParser.loadWordAlignedDERReference( for: meetingName, duration: duration ) diff --git a/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift b/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift index 68bfd171c..26e7c46f1 100644 --- a/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift +++ b/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift @@ -608,7 +608,7 @@ enum SortformerBenchmark { // Fall back to AMI word-aligned annotations 
if no RTTM available (AMI only) if groundTruth.isEmpty && dataset == .ami { print(" [RTTM] No RTTM file, falling back to AMI word-aligned annotations") - groundTruth = await AMIParser.loadWordAlignedGroundTruth( + groundTruth = try AMIParser.loadWordAlignedGroundTruth( for: meetingName, duration: duration ) diff --git a/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift b/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift index 74adb1f21..3e7cb447b 100644 --- a/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift +++ b/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift @@ -2,6 +2,25 @@ import FluidAudio import Foundation +/// Errors thrown when AMI reference annotations cannot be loaded. +/// +/// Benchmarks must fail loudly on these instead of scoring against a synthetic +/// reference: a transient annotation download failure once produced a bogus +/// 80.8% DER report scored against placeholder ground truth (issue #752). +enum AMIParserError: Error, LocalizedError { + case annotationsNotFound(subdirectory: String) + + var errorDescription: String? { + switch self { + case .annotationsNotFound(let subdirectory): + return + "AMI annotations not found in any expected location. " + + "Expected structure: [path]/\(subdirectory)/ AND [path]/corpusResources/meetings.xml. " + + "Run with --auto-download or download manually from https://groups.inf.ed.ac.uk/ami/download/" + } + } +} + /// AMI annotation parser and ground truth handling struct AMIParser { private static let logger = AppLogger(category: "AMIParser") @@ -40,41 +59,31 @@ struct AMIParser { return 4 // AMI meetings typically have 4 speakers } - /// Load AMI ground truth annotations for a specific meeting + /// Load AMI ground truth annotations for a specific meeting. + /// + /// Throws if annotations are missing or unparsable — never substitutes a + /// placeholder reference, so callers cannot silently score against fake data. 
static func loadAMIGroundTruth( - for meetingId: String, duration: Float - ) async - -> [TimedSpeakerSegment] - { - guard let validAmiDir = findAnnotationRoot(requiringSubdirectory: "segments") else { - logger.warning(" AMI annotations not found in any expected location") - logger.warning( - " 📁 Expected structure: [path]/segments/ AND [path]/corpusResources/meetings.xml" - ) - logger.warning( - " 🔧 To download annotations: visit https://groups.inf.ed.ac.uk/ami/download/" - ) - logger.warning( - " 📋 Using simplified placeholder ground truth (causes poor DER performance)" - ) - return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4) + for meetingId: String, + duration: Float, + searchRoots: [URL]? = nil + ) throws -> [TimedSpeakerSegment] { + guard + let validAmiDir = findAnnotationRoot( + requiringSubdirectory: "segments", searchRoots: searchRoots) + else { + throw AMIParserError.annotationsNotFound(subdirectory: "segments") } logger.info(" 📖 Loading AMI annotations for meeting: \(meetingId)") - do { - let allSegments = try loadAMIGroundTruth( - for: meetingId, - in: validAmiDir, - duration: duration - ) - logger.info(" Total segments loaded: \(allSegments.count)") - return allSegments - } catch { - logger.warning(" Failed to parse AMI annotations: \(error)") - logger.warning(" Using simplified placeholder instead") - return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4) - } + let allSegments = try loadAMIGroundTruth( + for: meetingId, + in: validAmiDir, + duration: duration + ) + logger.info(" Total segments loaded: \(allSegments.count)") + return allSegments } /// Internal hook for tests and benchmark helpers that need deterministic parsing @@ -163,35 +172,22 @@ struct AMIParser { static func loadFrameAlignedDERReference( for meetingId: String, duration: Float, - frameStep: Double = defaultReferenceFrameStepSeconds - ) async -> [DERSpeakerSegment] { - guard let validAmiDir = findAnnotationRoot(requiringSubdirectory: 
"segments") else { - logger.warning(" AMI annotations not found in any expected location") - logger.warning( - " 📁 Expected structure: [path]/segments/ AND [path]/corpusResources/meetings.xml" - ) - logger.warning(" 📋 Falling back to simplified placeholder ground truth") - return frameAlignedDERReference( - from: generateSimplifiedGroundTruth(duration: duration, speakerCount: 4), - frameStep: frameStep - ) + frameStep: Double = defaultReferenceFrameStepSeconds, + searchRoots: [URL]? = nil + ) throws -> [DERSpeakerSegment] { + guard + let validAmiDir = findAnnotationRoot( + requiringSubdirectory: "segments", searchRoots: searchRoots) + else { + throw AMIParserError.annotationsNotFound(subdirectory: "segments") } - do { - return try loadFrameAlignedDERReference( - for: meetingId, - in: validAmiDir, - duration: duration, - frameStep: frameStep - ) - } catch { - logger.warning(" Failed to parse AMI annotations: \(error)") - logger.warning(" Falling back to simplified placeholder ground truth") - return frameAlignedDERReference( - from: generateSimplifiedGroundTruth(duration: duration, speakerCount: 4), - frameStep: frameStep - ) - } + return try loadFrameAlignedDERReference( + for: meetingId, + in: validAmiDir, + duration: duration, + frameStep: frameStep + ) } static func loadFrameAlignedDERReference( @@ -216,29 +212,22 @@ struct AMIParser { static func loadWordAlignedGroundTruth( for meetingId: String, duration: Float, - mergeGap: Double = defaultMergeGapSeconds - ) async -> [TimedSpeakerSegment] { - guard let validAmiDir = findAnnotationRoot(requiringSubdirectory: "words") else { - logger.warning(" AMI word annotations not found in any expected location") - logger.warning( - " 📁 Expected structure: [path]/words/ AND [path]/corpusResources/meetings.xml" - ) - logger.warning(" 📋 Falling back to simplified placeholder ground truth") - return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4) + mergeGap: Double = defaultMergeGapSeconds, + searchRoots: 
[URL]? = nil + ) throws -> [TimedSpeakerSegment] { + guard + let validAmiDir = findAnnotationRoot( + requiringSubdirectory: "words", searchRoots: searchRoots) + else { + throw AMIParserError.annotationsNotFound(subdirectory: "words") } - do { - return try loadWordAlignedGroundTruth( - for: meetingId, - in: validAmiDir, - duration: duration, - mergeGap: mergeGap - ) - } catch { - logger.warning(" Failed to parse AMI word annotations: \(error)") - logger.warning(" Falling back to simplified placeholder ground truth") - return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4) - } + return try loadWordAlignedGroundTruth( + for: meetingId, + in: validAmiDir, + duration: duration, + mergeGap: mergeGap + ) } /// Internal hook for tests and benchmark helpers that need deterministic parsing @@ -293,12 +282,14 @@ struct AMIParser { static func loadWordAlignedDERReference( for meetingId: String, duration: Float, - mergeGap: Double = defaultMergeGapSeconds - ) async -> [DERSpeakerSegment] { - let segments = await loadWordAlignedGroundTruth( + mergeGap: Double = defaultMergeGapSeconds, + searchRoots: [URL]? 
= nil + ) throws -> [DERSpeakerSegment] { + let segments = try loadWordAlignedGroundTruth( for: meetingId, duration: duration, - mergeGap: mergeGap + mergeGap: mergeGap, + searchRoots: searchRoots ) return segments.map { DERSpeakerSegment( @@ -330,34 +321,6 @@ struct AMIParser { } } - /// Generate simplified ground truth for testing - static func generateSimplifiedGroundTruth( - duration: Float, speakerCount: Int - ) - -> [TimedSpeakerSegment] - { - let segmentDuration = duration / Float(speakerCount * 2) - var segments: [TimedSpeakerSegment] = [] - let dummyEmbedding: [Float] = Array(repeating: 0.1, count: 512) - - for i in 0..<(speakerCount * 2) { - let speakerId = "Speaker \((i % speakerCount) + 1)" - let startTime = Float(i) * segmentDuration - let endTime = min(startTime + segmentDuration, duration) - - segments.append( - TimedSpeakerSegment( - speakerId: speakerId, - embedding: dummyEmbedding, - startTimeSeconds: startTime, - endTimeSeconds: endTime, - qualityScore: 1.0 - )) - } - - return segments - } - /// Generate consistent placeholder embeddings for each speaker static func generatePlaceholderEmbedding(for participantId: String) -> [Float] { // Generate a consistent embedding based on participant ID @@ -388,8 +351,11 @@ struct AMIParser { ] } - private static func findAnnotationRoot(requiringSubdirectory subdirectory: String) -> URL? { - for path in possibleAnnotationRoots() { + private static func findAnnotationRoot( + requiringSubdirectory subdirectory: String, + searchRoots: [URL]? = nil + ) -> URL? { + for path in searchRoots ?? 
possibleAnnotationRoots() { let requiredDir = path.appendingPathComponent(subdirectory) let meetingsFile = path.appendingPathComponent("corpusResources/meetings.xml") let hasRequiredDir = FileManager.default.fileExists(atPath: requiredDir.path) diff --git a/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift b/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift index a5e3f19e2..bc6c6b1b4 100644 --- a/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift +++ b/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift @@ -191,14 +191,27 @@ struct DatasetDownloader { return } - // Download and extract AMI manual annotations v1.6.2 + // Download and extract AMI manual annotations v1.6.2. + // The Edinburgh server is occasionally flaky, so retry with backoff — + // a single transient failure here once poisoned a CI benchmark run + // with placeholder ground truth (issue #752). let zipURL = "https://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/ami_public_manual_1.6.2.zip" let zipFile = annotationsDir.appendingPathComponent("ami_public_manual_1.6.2.zip") - let zipSuccess = await downloadAnnotationFile(from: zipURL, to: zipFile) + + var zipSuccess = false + let maxAttempts = 3 + for attempt in 1...maxAttempts { + zipSuccess = await downloadAnnotationFile(from: zipURL, to: zipFile) + if zipSuccess { break } + logger.warning("Annotation download attempt \(attempt)/\(maxAttempts) failed") + if attempt < maxAttempts { + try? 
await Task.sleep(nanoseconds: UInt64(attempt) * 2_000_000_000) + } + } if !zipSuccess { - logger.error("Failed to download AMI annotations") + logger.error("Failed to download AMI annotations after \(maxAttempts) attempts") return } diff --git a/Tests/FluidAudioTests/CLI/AMIParserTests.swift b/Tests/FluidAudioTests/CLI/AMIParserTests.swift index eb00102df..4aa034846 100644 --- a/Tests/FluidAudioTests/CLI/AMIParserTests.swift +++ b/Tests/FluidAudioTests/CLI/AMIParserTests.swift @@ -69,6 +69,53 @@ final class AMIParserTests: XCTestCase { XCTAssertEqual(segments[2].end, 1.02, accuracy: 0.0001) } + func testLoadAMIGroundTruthThrowsWhenAnnotationsMissing() throws { + let missingRoot = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString, isDirectory: true) + + XCTAssertThrowsError( + try AMIParser.loadAMIGroundTruth( + for: "ES2004a", + duration: 30, + searchRoots: [missingRoot] + ) + ) { error in + guard case AMIParserError.annotationsNotFound(let subdirectory) = error else { + return XCTFail("Expected annotationsNotFound, got \(error)") + } + XCTAssertEqual(subdirectory, "segments") + } + } + + func testLoadWordAlignedGroundTruthThrowsWhenAnnotationsMissing() throws { + let missingRoot = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString, isDirectory: true) + + XCTAssertThrowsError( + try AMIParser.loadWordAlignedGroundTruth( + for: "ES2004a", + duration: 30, + searchRoots: [missingRoot] + ) + ) { error in + guard case AMIParserError.annotationsNotFound(let subdirectory) = error else { + return XCTFail("Expected annotationsNotFound, got \(error)") + } + XCTAssertEqual(subdirectory, "words") + } + } + + func testSearchBasedLoadersResolveFixtureRoot() throws { + let fixture = try makeAMIFixture() + + let segments = try AMIParser.loadWordAlignedGroundTruth( + for: "ES2004a", + duration: 30, + searchRoots: [fixture] + ) + XCTAssertEqual(segments.count, 2) + } + private func makeAMIFixture() throws -> URL { let 
baseURL = FileManager.default.temporaryDirectory .appendingPathComponent(UUID().uuidString, isDirectory: true)