diff --git a/.github/workflows/diarizer-benchmark.yml b/.github/workflows/diarizer-benchmark.yml
index 0775ad844..eaf5f8fd8 100644
--- a/.github/workflows/diarizer-benchmark.yml
+++ b/.github/workflows/diarizer-benchmark.yml
@@ -44,6 +44,12 @@ jobs:
           path: ~/FluidAudioDatasets/ami_official
           key: ${{ runner.os }}-ami-dataset-${{ hashFiles('Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift') }}
 
+      - name: Cache AMI annotations
+        uses: actions/cache@v4
+        with:
+          path: Datasets/ami_public_1.6.2
+          key: ${{ runner.os }}-ami-annotations-1.6.2
+
       - name: Build package
         run: swift build -c release
 
@@ -123,7 +129,9 @@ jobs:
           fi
 
       - name: Comment PR with Benchmark Results
-        if: always()
+        # Comment only when metrics were extracted: a failed run must be a red check, not
+        # garbage numbers (issue #752). Implicit success() also skips this after a failed step.
+        if: steps.extract.outcome == 'success'
         uses: actions/github-script@v7
         with:
           script: |
diff --git a/.github/workflows/offline-pipeline.yml b/.github/workflows/offline-pipeline.yml
index 25cab0278..c35a246f7 100644
--- a/.github/workflows/offline-pipeline.yml
+++ b/.github/workflows/offline-pipeline.yml
@@ -24,6 +24,12 @@ jobs:
         with:
           swift-version: "6.1"
 
+      - name: Cache AMI annotations
+        uses: actions/cache@v4
+        with:
+          path: Datasets/ami_public_1.6.2
+          key: ${{ runner.os }}-ami-annotations-1.6.2
+
       - name: Build package
         run: swift build -c release
 
diff --git a/.github/workflows/sortformer-benchmark.yml b/.github/workflows/sortformer-benchmark.yml
index a3e04d662..8df3cadd7 100644
--- a/.github/workflows/sortformer-benchmark.yml
+++ b/.github/workflows/sortformer-benchmark.yml
@@ -44,6 +44,12 @@ jobs:
           path: ~/FluidAudioDatasets/ami_official
           key: ${{ runner.os }}-ami-dataset
 
+      - name: Cache AMI annotations
+        uses: actions/cache@v4
+        with:
+          path: Datasets/ami_public_1.6.2
+          key: ${{ runner.os }}-ami-annotations-1.6.2
+
       - name: Build package
         run: swift build -c release
 
diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
index 455870931..83111c5d6 100644
--- a/Documentation/Benchmarks.md
+++ b/Documentation/Benchmarks.md
@@ -589,43 +589,47 @@ Step Ratio 1, min duration 0 (edited)
 
 Note that the baseline pytorch version is ~11% DER, we lost some precision dropping down to fp16 precision in order to run most of the embedding model on neural engine. But as a result, we significantly out perform the baseline `mps` backend as well. the pyannote-community-1 on cpu is ~1.5-2 RTFx, on mps, it's ~20-25 RTFx.
 
-Running on the full AMI SDM 16-meeting test set (official NeMo/pyannote evaluation split: EN2002, ES2004, IS1009, TS3003 × a-d):
+Running on the full AMI SDM 16-meeting test set (official NeMo/pyannote evaluation split: EN2002, ES2004, IS1009, TS3003 × a-d). Re-run 2026-07-03 on an Apple M5 Pro:
 
 ```bash
 swift run -c release fluidaudiocli diarization-benchmark --mode offline \
-    --dataset ami-sdm --auto-download
+    --dataset ami-sdm --auto-download --threshold 0.7
 ```
 
 ```text
 ------------------------------------------------------------------------------------------
 Meeting        DER %    JER %    Miss %     FA %     SE %   Speakers     RTFx
 ------------------------------------------------------------------------------------------
-IS1009c           5.1      5.9      3.1      1.5      0.6     4/4        94.6
-IS1009b           5.4      6.4      2.8      1.4      1.1     4/4        77.6
-ES2004b           6.0      7.0      2.7      2.2      1.1     4/4        70.4
-ES2004c           6.4      7.3      2.0      3.4      1.0     4/4        70.5
-EN2002c           7.8      9.7      5.1      0.5      2.2     3/3        60.3
-TS3003b           8.0      7.8      3.6      3.7      0.7     4/4        71.4
-TS3003c           9.0      8.7      6.1      1.9      0.9     4/4        70.4
-EN2002b           9.1     12.9      4.0      1.9      3.2     5/4        63.4
-IS1009d           9.2     11.7      4.5      2.6      2.2     4/4        91.6
-IS1009a           9.9     11.9      5.0      2.5      2.4     4/4        60.8
-ES2004a          10.4     13.4      7.5      1.6      1.4     4/4        60.0
-EN2002a          10.6     15.0      5.4      1.2      4.0     4/4        52.2
-ES2004d          11.4     16.4      5.3      2.6      3.5     4/4        62.1
-TS3003a          17.2     64.1     13.1      1.3      2.8     2/4        68.7
-EN2002d          18.3     38.2      4.6      1.5     12.2     3/4        78.6
-TS3003d          26.0     41.6     11.0      2.2     12.8     3/4        64.5
+IS1009c           5.1      5.9       3.1      1.5      0.6     4/4        335.3
+IS1009b           5.4      6.4       2.8      1.4      1.1     4/4        337.1
+ES2004b           6.0      7.0       2.7      2.2      1.1     4/4        314.0
+ES2004c           6.4      7.3       2.0      3.4      1.0     4/4        304.0
+EN2002c           7.8      9.7       5.1      0.5      2.2     3/3        305.0
+TS3003b           8.0      7.8       3.6      3.7      0.7     4/4        307.3
+TS3003c           9.0      8.7       6.1      1.9      0.9     4/4        350.0
+EN2002b           9.1     12.9       4.0      1.9      3.2     5/4        318.5
+IS1009d           9.2     11.7       4.5      2.6      2.2     4/4        320.9
+IS1009a           9.9     11.9       5.0      2.5      2.4     4/4        342.9
+ES2004a          10.4     13.4       7.5      1.6      1.4     4/4        334.0
+EN2002a          10.6     15.0       5.4      1.2      4.0     4/4        298.7
+ES2004d          11.4     16.4       5.3      2.6      3.5     4/4        329.4
+TS3003a          17.2     64.1      13.1      1.3      2.8     2/4        346.4
+EN2002d          18.3     38.2       4.6      1.5     12.2     3/4        302.5
+TS3003d          26.0     41.6      11.0      2.2     12.8     3/4        324.6
 ------------------------------------------------------------------------------------------
-AVERAGE          10.6     17.4      5.4      2.0      3.3      -         69.8
+AVERAGE          10.6     17.4       5.4      2.0      3.3      -        323.2
 ==========================================================================================
 ```
 
-12/16 meetings detect the correct speaker count. Average DER 10.62% matches published pyannote-community-1 offline numbers on this split (~11-12%).
+12/16 meetings detect the correct speaker count. Average DER 10.62% matches published pyannote-community-1 offline numbers on this split (~11-12%). Results are fully deterministic (two consecutive runs produce bit-identical metrics).
+
+**Clustering threshold matters on this split.** The table above uses `--threshold 0.7`. Since #616 the CLI default is the community-1 preset (0.6), which merges clusters more aggressively and undercounts speakers on 4 meetings (EN2002a 2/4 → 41.9% DER, ES2004d 3/4 → 34.8%, EN2002b 19.0%, IS1009d 16.4%), degrading the average to 15.5% DER. Pass `--threshold 0.7` (or set `clusteringThreshold: 0.7` in `OfflineDiarizerConfig`) for AMI-SDM-like meeting audio.
 
 ### Streaming/online Diarization
 
-This is more tricky and honestly a lot more fragile to clustering. Expect +10-15% worse DER for the streaming implementation. Only use this when you critically need realtime streaming speaker diarization. In most cases, offline is more than enough for most applications.
+Streaming diarization is trickier and considerably more sensitive to clustering settings. Expect substantially worse DER than the offline pipeline — the worst-scoring meetings suffer heavy speaker confusion. Only use this when you genuinely need real-time streaming speaker diarization; for most applications, offline is more than enough.
+
+All streaming tables below were re-run 2026-07-03 on an Apple M5 Pro against the official AMI SDM 16-meeting test set (same split as the offline table above; earlier revisions of these tables used a different 7-meeting subset, so numbers are not directly comparable to those).
 
 Running a near real-time diarization benchmark for 3s chunks, 1s overlap, and 0.85 clustering threshold:
 ```bash
@@ -641,26 +645,35 @@ swift run fluidaudiocli diarization-benchmark --mode streaming \
 ------------------------------------------------------------------------------------------
 Meeting        DER %    JER %    Miss %     FA %     SE %   Speakers     RTFx
 ------------------------------------------------------------------------------------------
-ES2004a          31.6     41.6      6.7      2.1     22.7     7/4        49.8
-ES2005a          39.7     65.0      6.9      7.3     25.5     5/4        59.1
-IS1002b          40.4     51.3      1.1      5.2     34.1     9/4        45.3
-ES2002a          41.5     56.0      5.3     10.1     26.1     6/4        48.6
-ES2003a          53.1     78.7      5.3      2.3     45.5     5/4        57.1
-IS1000a          66.7     74.0      6.1      7.6     53.0     7/4        50.7
-IS1001a          75.0     88.6      7.1      4.7     63.2     10/4       48.8
+TS3003a          19.3     37.0       7.3      3.0      9.1     5/4         26.1
+ES2004d          21.7     36.9       4.8      3.1     13.8     6/4         24.5
+IS1009d          24.4     38.4       2.0      4.6     17.8     10/4        24.7
+IS1009a          29.8     46.0       2.8      3.7     23.4     6/4         25.3
+TS3003c          31.6     45.9       4.2      3.4     24.0     5/4         25.4
+ES2004a          31.7     41.6       6.7      2.1     22.8     7/4         24.9
+TS3003b          55.3     63.5       3.0      5.3     47.0     5/4         24.4
+EN2002d          60.3     62.8       5.3      2.2     52.8     7/4         23.6
+IS1009c          61.0     70.7       1.4      3.8     55.8     6/4         24.7
+EN2002a          63.0     62.2       4.8      2.0     56.2     7/4         23.4
+EN2002c          67.0     70.9       4.7      2.0     60.4     9/3         23.7
+ES2004b          69.5     71.2       3.2      2.6     63.8     9/4         24.2
+ES2004c          70.8     68.2       2.5      3.2     65.2     7/4         24.2
+TS3003d          74.8     89.2       7.0      3.5     64.3     7/4         24.9
+EN2002b          83.9     84.7       3.6      2.3     77.9     11/4        24.2
+IS1009b          88.1     90.8       1.3      2.3     84.5     7/4         24.7
 ------------------------------------------------------------------------------------------
-AVERAGE          49.7     65.0      5.5      5.6     38.6      -         51.4
+AVERAGE          53.3     61.2       4.0      3.1     46.2      -         24.6
 ==========================================================================================
 ```
 
 
-Diarization benchmark with 10s chunks, 0s overlap, and 0.7 clustering threshold:
+Diarization benchmark with 10s chunks, 0s overlap, and 0.7 clustering threshold (best streaming configuration found on this split):
 ```bash
 swift run fluidaudiocli diarization-benchmark --mode streaming \
-    --dataset ami-sdm
-    --threshold 0.7
-    --auto-download
-    --chunk-seconds 10.0
+    --dataset ami-sdm \
+    --threshold 0.7 \
+    --auto-download \
+    --chunk-seconds 10.0 \
     --overlap-seconds 0.0
 
 ...
@@ -668,26 +681,35 @@ swift run fluidaudiocli diarization-benchmark --mode streaming \
 ------------------------------------------------------------------------------------------
 Meeting        DER %    JER %    Miss %     FA %     SE %   Speakers     RTFx
 ------------------------------------------------------------------------------------------
-ES2003a          12.0     19.5      6.9      1.2      3.9 4/4           477.0
-ES2004a          15.1     24.8      9.2      1.2      4.7 4/4           367.4
-ES2002a          17.8     26.8      8.6      5.8      3.4 6/4           356.8
-IS1002b          38.0     41.8      3.1      3.1     31.8 5/4           361.9
-ES2005a          22.5     36.8      7.7      6.8      8.0 4/4           460.8
-IS1000a          57.7     80.6     11.9      3.9     41.9 8/4           352.1
-IS1001a          70.1     85.4     11.2      2.4     56.5 7/4           370.9
+ES2004a          15.1     24.8       9.2      1.2      4.7     4/4        208.9
+ES2004d          17.5     24.5       9.0      1.3      7.2     5/4        199.9
+TS3003a          18.8     29.3      13.0      1.2      4.5     5/4        212.2
+EN2002b          20.5     28.9       7.1      1.2     12.2     5/4        199.8
+TS3003b          20.9     26.0       7.2      2.8     10.8     4/4        214.0
+TS3003c          22.1     30.7       9.0      1.3     11.9     4/4        221.3
+IS1009c          38.2     45.6       5.7      1.4     31.1     4/4        221.0
+EN2002c          39.0     44.5       6.9      0.6     31.4     5/3        200.7
+IS1009b          39.3     45.8       4.9      1.0     33.4     6/4        215.6
+EN2002d          42.1     47.5       7.0      1.1     34.1     6/4        191.1
+IS1009d          44.1     53.5       6.7      2.0     35.3     5/4        210.5
+EN2002a          45.5     51.8       8.6      0.8     36.0     7/4        184.8
+TS3003d          52.7     68.5      13.5      1.5     37.6     4/4        197.3
+ES2004c          55.7     62.6       4.3      2.4     49.1     9/4        211.5
+IS1009a          61.0     77.5       6.0      1.9     53.1     5/4        215.5
+ES2004b          78.5     86.9       5.1      1.8     71.6     7/4        216.3
 ------------------------------------------------------------------------------------------
-AVERAGE          33.3     45.1      8.4      3.5     21.5         -     392.4
+AVERAGE          38.2     46.8       7.7      1.5     29.0      -        207.5
 ==========================================================================================
 ```
 
 
-Diarization benchmark with 5s chunks, 0s overlap, and 0.8 clustering threshold (best configuration found):
+Diarization benchmark with 5s chunks, 0s overlap, and 0.8 clustering threshold:
 ```bash
 swift run fluidaudiocli diarization-benchmark --mode streaming \
-    --dataset ami-sdm
-    --threshold 0.8
-    --auto-download
-    --chunk-seconds 5.0
+    --dataset ami-sdm \
+    --threshold 0.8 \
+    --auto-download \
+    --chunk-seconds 5.0 \
     --overlap-seconds 0.0
 
 ...
@@ -695,15 +717,24 @@ swift run fluidaudiocli diarization-benchmark --mode streaming \
 ------------------------------------------------------------------------------------------
 Meeting        DER %    JER %    Miss %     FA %     SE %   Speakers     RTFx
 ------------------------------------------------------------------------------------------
-IS1002b           9.8     11.7      3.5      3.8      2.6      5/4       205.2
-ES2003a          14.4     23.3      7.4      1.6      5.3      4/4       260.9
-ES2004a          17.0     26.0      9.0      1.3      6.7      7/4       218.1
-ES2005a          18.4     31.0      9.2      5.8      3.4      4/4       259.8
-ES2002a          20.8     30.5      9.5      7.4      3.9      5/4       198.0
-IS1000a          24.7     35.7     12.1      4.3      8.3      6/4       204.2
-IS1001a          78.0     94.5     13.3      3.0     61.6      6/4       215.7
+ES2004a          17.0     26.0       9.0      1.3      6.7     7/4        113.4
+IS1009a          18.1     26.5       4.7      2.7     10.8     4/4        113.2
+TS3003a          21.0     32.3      12.7      1.4      6.8     2/4        114.4
+TS3003b          21.5     26.2       7.1      4.2     10.2     4/4        109.8
+ES2004d          22.5     29.6       9.6      1.8     11.0     6/4        108.6
+IS1009c          25.5     30.7       3.4      2.5     19.6     5/4        112.9
+ES2004c          25.6     29.9       4.8      2.4     18.4     5/4        111.8
+TS3003c          33.2     44.3       8.0      2.5     22.7     4/4        114.2
+EN2002c          40.6     46.5       8.4      1.5     30.7     5/3         29.7
+EN2002b          46.4     57.2       8.2      1.2     36.9     8/4          9.9
+IS1009d          47.8     56.7       4.9      3.0     39.9     5/4        111.4
+IS1009b          56.1     63.9       2.5      1.5     52.1     5/4        111.7
+ES2004b          57.6     64.0       5.9      2.0     49.7     8/4        111.5
+EN2002d          62.4     71.3      10.1      1.4     50.9     7/4        103.4
+EN2002a          63.4     71.1       9.2      1.1     53.0     7/4        105.2
+TS3003d          66.3     83.3      12.7      2.6     51.0     5/4        105.3
 ------------------------------------------------------------------------------------------
-AVERAGE          26.2     36.1      9.2      3.9     13.1       -        223.1
+AVERAGE          39.0     47.5       7.6      2.1     29.4      -         99.1
 ==========================================================================================
 ```
 
@@ -711,10 +742,10 @@ AVERAGE          26.2     36.1      9.2      3.9     13.1       -        223.1
 Diarization benchmark with 5s chunks, 2s overlap, and 0.8 clustering threshold:
 ```bash
 swift run fluidaudiocli diarization-benchmark --mode streaming \
-    --dataset ami-sdm
-    --threshold 0.8
-    --auto-download
-    --chunk-seconds 5.0
+    --dataset ami-sdm \
+    --threshold 0.8 \
+    --auto-download \
+    --chunk-seconds 5.0 \
     --overlap-seconds 2.0
 
 ...
@@ -722,18 +753,29 @@ swift run fluidaudiocli diarization-benchmark --mode streaming \
 ------------------------------------------------------------------------------------------
 Meeting        DER %    JER %    Miss %     FA %     SE %   Speakers     RTFx
 ------------------------------------------------------------------------------------------
-ES2003a          24.5     42.1      4.7      1.9     18.0     6/4        81.4
-ES2005a          27.5     50.6      5.5      7.6     14.4     5/4        76.8
-ES2004a          31.6     54.8      6.4      2.3     23.0     5/4        66.9
-IS1002b          39.6     57.0      0.8      5.1     33.7     6/4        63.7
-ES2002a          41.1     57.2      4.7      9.8     26.7     5/4        65.5
-IS1000a          57.4     54.2      6.1      7.7     43.6     9/4        67.2
-IS1001a          79.0     86.8      7.0      5.0     66.9     10/4       64.5
+TS3003a          20.2     45.0       6.7      2.9     10.6     4/4         37.6
+ES2004c          26.8     40.7       1.7      3.5     21.6     6/4         34.9
+ES2004a          31.6     54.8       6.4      2.3     23.0     5/4         35.2
+IS1009c          38.1     39.0       1.3      4.0     32.7     7/4         36.1
+EN2002c          38.1     39.3       3.1      1.9     33.1     6/3         33.6
+ES2004d          39.0     44.3       4.4      3.1     31.5     5/4         35.1
+EN2002b          45.8     59.1       3.3      2.4     40.1     7/4         34.5
+IS1009d          55.3     62.1       2.2      4.4     48.7     8/4         35.6
+EN2002a          60.9     54.0       3.6      1.9     55.5     7/4         33.8
+ES2004b          69.5     74.6       2.4      2.9     64.1     8/4         34.9
+TS3003d          71.7     83.4       5.2      4.7     61.8     6/4         34.5
+IS1009a          72.2     75.1       1.7      3.5     67.0     5/4         36.9
+TS3003c          72.8     85.2       3.2      3.5     66.1     5/4         36.5
+IS1009b          77.6     80.3       0.8      2.1     74.6     6/4         35.9
+TS3003b          86.4     86.7       2.8      5.5     78.0     5/4         35.5
+EN2002d          88.8     92.2       3.7      2.0     83.1     9/4         33.4
 ------------------------------------------------------------------------------------------
-AVERAGE          43.0     57.5      5.0      5.6     32.3      -         69.4
+AVERAGE          55.9     63.5       3.3      3.2     49.5      -         35.2
 ==========================================================================================
 ```
 
+Takeaways from the 2026-07-03 sweep: on the official 16-meeting split, the no-overlap configs clearly beat the overlap configs (10s/0s at 38.2% and 5s/0s at 39.0% vs. 3s/1s at 53.3% and 5s/2s at 55.9% average DER) — overlap increases the chunk count and drives over-clustering (5-11 detected speakers vs. 3-4 actual speakers on the worst meetings). Note: the two overlap configs were measured one meeting per process; running many overlapping-chunk meetings back-to-back in a single process can exhaust IOSurface-backed CoreML buffers on macOS (E5RT `Failed to allocate memory IOSurface object`).
+
 ## Sortformer Streaming Diarization
 
 NVIDIA's Sortformer model for streaming speaker diarization, converted to CoreML.
diff --git a/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift b/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift
index 890475ce0..1257c9400 100644
--- a/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift
+++ b/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift
@@ -605,6 +605,13 @@ enum StreamDiarizationBenchmark {
             }
         }
 
+        // Fail loudly if no meeting produced a result (e.g. missing ground truth
+        // annotations) instead of exiting cleanly with empty metrics (issue #752).
+        guard !allResults.isEmpty else {
+            logger.error("❌ Benchmark produced no results — see errors above")
+            exit(1)
+        }
+
         // Print final summary
         printFinalSummary(results: allResults)
 
@@ -765,8 +772,9 @@ enum StreamDiarizationBenchmark {
             let totalElapsed = Date().timeIntervalSince(startTime)
             let finalRTFx = totalDuration / totalElapsed
 
-            // Load ground truth
-            let groundTruth = await AMIParser.loadAMIGroundTruth(
+            // Load ground truth (throws if annotations are missing — never scores
+            // against placeholder data, see issue #752)
+            let groundTruth = try AMIParser.loadAMIGroundTruth(
                 for: meetingName,
                 duration: Float(totalDuration)
             )
@@ -870,8 +878,9 @@ enum StreamDiarizationBenchmark {
                 logger.info("  RTFx: \(String(format: "%.1f", finalRTFx))x")
             }
 
-            // Load ground truth
-            let groundTruth = await AMIParser.loadAMIGroundTruth(
+            // Load ground truth (throws if annotations are missing — never scores
+            // against placeholder data, see issue #752)
+            let groundTruth = try AMIParser.loadAMIGroundTruth(
                 for: meetingName,
                 duration: Float(totalDuration)
             )
diff --git a/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift b/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift
index 156cdee5f..27968bf5d 100644
--- a/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift
+++ b/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift
@@ -487,7 +487,7 @@ enum LSEENDBenchmark {
 
             if dataset == .ami {
                 print("   [REF] Using AMI word-aligned annotations")
-                referenceSegments = await AMIParser.loadWordAlignedDERReference(
+                referenceSegments = try AMIParser.loadWordAlignedDERReference(
                     for: meetingName,
                     duration: duration
                 )
diff --git a/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift b/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift
index 68bfd171c..26e7c46f1 100644
--- a/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift
+++ b/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift
@@ -608,7 +608,7 @@ enum SortformerBenchmark {
             // Fall back to AMI word-aligned annotations if no RTTM available (AMI only)
             if groundTruth.isEmpty && dataset == .ami {
                 print("   [RTTM] No RTTM file, falling back to AMI word-aligned annotations")
-                groundTruth = await AMIParser.loadWordAlignedGroundTruth(
+                groundTruth = try AMIParser.loadWordAlignedGroundTruth(
                     for: meetingName,
                     duration: duration
                 )
diff --git a/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift b/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift
index 74adb1f21..3e7cb447b 100644
--- a/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift
+++ b/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift
@@ -2,6 +2,25 @@
 import FluidAudio
 import Foundation
 
+/// Errors thrown when AMI reference annotations cannot be loaded.
+///
+/// Benchmarks must fail loudly on these instead of scoring against a synthetic
+/// reference: a transient annotation download failure once produced a bogus
+/// 80.8% DER report scored against placeholder ground truth (issue #752).
+enum AMIParserError: Error, LocalizedError {
+    case annotationsNotFound(subdirectory: String)
+
+    var errorDescription: String? {
+        switch self {
+        case .annotationsNotFound(let subdirectory):
+            return
+                "AMI annotations not found in any expected location. "
+                + "Expected structure: [path]/\(subdirectory)/ AND [path]/corpusResources/meetings.xml. "
+                + "Run with --auto-download or download manually from https://groups.inf.ed.ac.uk/ami/download/"
+        }
+    }
+}
+
 /// AMI annotation parser and ground truth handling
 struct AMIParser {
     private static let logger = AppLogger(category: "AMIParser")
@@ -40,41 +59,31 @@ struct AMIParser {
         return 4  // AMI meetings typically have 4 speakers
     }
 
-    /// Load AMI ground truth annotations for a specific meeting
+    /// Load AMI ground truth annotations for a specific meeting.
+    ///
+    /// Throws if annotations are missing or unparsable — never substitutes a
+    /// placeholder reference, so callers cannot silently score against fake data.
     static func loadAMIGroundTruth(
-        for meetingId: String, duration: Float
-    ) async
-        -> [TimedSpeakerSegment]
-    {
-        guard let validAmiDir = findAnnotationRoot(requiringSubdirectory: "segments") else {
-            logger.warning("   AMI annotations not found in any expected location")
-            logger.warning(
-                "      📁 Expected structure: [path]/segments/ AND [path]/corpusResources/meetings.xml"
-            )
-            logger.warning(
-                "      🔧 To download annotations: visit https://groups.inf.ed.ac.uk/ami/download/"
-            )
-            logger.warning(
-                "      📋 Using simplified placeholder ground truth (causes poor DER performance)"
-            )
-            return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4)
+        for meetingId: String,
+        duration: Float,
+        searchRoots: [URL]? = nil
+    ) throws -> [TimedSpeakerSegment] {
+        guard
+            let validAmiDir = findAnnotationRoot(
+                requiringSubdirectory: "segments", searchRoots: searchRoots)
+        else {
+            throw AMIParserError.annotationsNotFound(subdirectory: "segments")
         }
 
         logger.info("   📖 Loading AMI annotations for meeting: \(meetingId)")
 
-        do {
-            let allSegments = try loadAMIGroundTruth(
-                for: meetingId,
-                in: validAmiDir,
-                duration: duration
-            )
-            logger.info("      Total segments loaded: \(allSegments.count)")
-            return allSegments
-        } catch {
-            logger.warning("      Failed to parse AMI annotations: \(error)")
-            logger.warning("      Using simplified placeholder instead")
-            return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4)
-        }
+        let allSegments = try loadAMIGroundTruth(
+            for: meetingId,
+            in: validAmiDir,
+            duration: duration
+        )
+        logger.info("      Total segments loaded: \(allSegments.count)")
+        return allSegments
     }
 
     /// Internal hook for tests and benchmark helpers that need deterministic parsing
@@ -163,35 +172,22 @@ struct AMIParser {
     static func loadFrameAlignedDERReference(
         for meetingId: String,
         duration: Float,
-        frameStep: Double = defaultReferenceFrameStepSeconds
-    ) async -> [DERSpeakerSegment] {
-        guard let validAmiDir = findAnnotationRoot(requiringSubdirectory: "segments") else {
-            logger.warning("   AMI annotations not found in any expected location")
-            logger.warning(
-                "      📁 Expected structure: [path]/segments/ AND [path]/corpusResources/meetings.xml"
-            )
-            logger.warning("      📋 Falling back to simplified placeholder ground truth")
-            return frameAlignedDERReference(
-                from: generateSimplifiedGroundTruth(duration: duration, speakerCount: 4),
-                frameStep: frameStep
-            )
+        frameStep: Double = defaultReferenceFrameStepSeconds,
+        searchRoots: [URL]? = nil
+    ) throws -> [DERSpeakerSegment] {
+        guard
+            let validAmiDir = findAnnotationRoot(
+                requiringSubdirectory: "segments", searchRoots: searchRoots)
+        else {
+            throw AMIParserError.annotationsNotFound(subdirectory: "segments")
         }
 
-        do {
-            return try loadFrameAlignedDERReference(
-                for: meetingId,
-                in: validAmiDir,
-                duration: duration,
-                frameStep: frameStep
-            )
-        } catch {
-            logger.warning("      Failed to parse AMI annotations: \(error)")
-            logger.warning("      Falling back to simplified placeholder ground truth")
-            return frameAlignedDERReference(
-                from: generateSimplifiedGroundTruth(duration: duration, speakerCount: 4),
-                frameStep: frameStep
-            )
-        }
+        return try loadFrameAlignedDERReference(
+            for: meetingId,
+            in: validAmiDir,
+            duration: duration,
+            frameStep: frameStep
+        )
     }
 
     static func loadFrameAlignedDERReference(
@@ -216,29 +212,22 @@ struct AMIParser {
     static func loadWordAlignedGroundTruth(
         for meetingId: String,
         duration: Float,
-        mergeGap: Double = defaultMergeGapSeconds
-    ) async -> [TimedSpeakerSegment] {
-        guard let validAmiDir = findAnnotationRoot(requiringSubdirectory: "words") else {
-            logger.warning("   AMI word annotations not found in any expected location")
-            logger.warning(
-                "      📁 Expected structure: [path]/words/ AND [path]/corpusResources/meetings.xml"
-            )
-            logger.warning("      📋 Falling back to simplified placeholder ground truth")
-            return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4)
+        mergeGap: Double = defaultMergeGapSeconds,
+        searchRoots: [URL]? = nil
+    ) throws -> [TimedSpeakerSegment] {
+        guard
+            let validAmiDir = findAnnotationRoot(
+                requiringSubdirectory: "words", searchRoots: searchRoots)
+        else {
+            throw AMIParserError.annotationsNotFound(subdirectory: "words")
         }
 
-        do {
-            return try loadWordAlignedGroundTruth(
-                for: meetingId,
-                in: validAmiDir,
-                duration: duration,
-                mergeGap: mergeGap
-            )
-        } catch {
-            logger.warning("      Failed to parse AMI word annotations: \(error)")
-            logger.warning("      Falling back to simplified placeholder ground truth")
-            return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4)
-        }
+        return try loadWordAlignedGroundTruth(
+            for: meetingId,
+            in: validAmiDir,
+            duration: duration,
+            mergeGap: mergeGap
+        )
     }
 
     /// Internal hook for tests and benchmark helpers that need deterministic parsing
@@ -293,12 +282,14 @@ struct AMIParser {
     static func loadWordAlignedDERReference(
         for meetingId: String,
         duration: Float,
-        mergeGap: Double = defaultMergeGapSeconds
-    ) async -> [DERSpeakerSegment] {
-        let segments = await loadWordAlignedGroundTruth(
+        mergeGap: Double = defaultMergeGapSeconds,
+        searchRoots: [URL]? = nil
+    ) throws -> [DERSpeakerSegment] {
+        let segments = try loadWordAlignedGroundTruth(
             for: meetingId,
             duration: duration,
-            mergeGap: mergeGap
+            mergeGap: mergeGap,
+            searchRoots: searchRoots
         )
         return segments.map {
             DERSpeakerSegment(
@@ -330,34 +321,6 @@ struct AMIParser {
         }
     }
 
-    /// Generate simplified ground truth for testing
-    static func generateSimplifiedGroundTruth(
-        duration: Float, speakerCount: Int
-    )
-        -> [TimedSpeakerSegment]
-    {
-        let segmentDuration = duration / Float(speakerCount * 2)
-        var segments: [TimedSpeakerSegment] = []
-        let dummyEmbedding: [Float] = Array(repeating: 0.1, count: 512)
-
-        for i in 0..<(speakerCount * 2) {
-            let speakerId = "Speaker \((i % speakerCount) + 1)"
-            let startTime = Float(i) * segmentDuration
-            let endTime = min(startTime + segmentDuration, duration)
-
-            segments.append(
-                TimedSpeakerSegment(
-                    speakerId: speakerId,
-                    embedding: dummyEmbedding,
-                    startTimeSeconds: startTime,
-                    endTimeSeconds: endTime,
-                    qualityScore: 1.0
-                ))
-        }
-
-        return segments
-    }
-
     /// Generate consistent placeholder embeddings for each speaker
     static func generatePlaceholderEmbedding(for participantId: String) -> [Float] {
         // Generate a consistent embedding based on participant ID
@@ -388,8 +351,11 @@ struct AMIParser {
         ]
     }
 
-    private static func findAnnotationRoot(requiringSubdirectory subdirectory: String) -> URL? {
-        for path in possibleAnnotationRoots() {
+    private static func findAnnotationRoot(
+        requiringSubdirectory subdirectory: String,
+        searchRoots: [URL]? = nil
+    ) -> URL? {
+        for path in searchRoots ?? possibleAnnotationRoots() {
             let requiredDir = path.appendingPathComponent(subdirectory)
             let meetingsFile = path.appendingPathComponent("corpusResources/meetings.xml")
             let hasRequiredDir = FileManager.default.fileExists(atPath: requiredDir.path)
diff --git a/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift b/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift
index a5e3f19e2..bc6c6b1b4 100644
--- a/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift
+++ b/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift
@@ -191,14 +191,35 @@ struct DatasetDownloader {
             return
         }
 
-        // Download and extract AMI manual annotations v1.6.2
+        // Download and extract AMI manual annotations v1.6.2.
+        // The Edinburgh server is occasionally flaky, so retry with linear backoff
+        // (2s, then 4s) — a single transient failure here once poisoned a CI
+        // benchmark run with placeholder ground truth (issue #752).
         let zipURL =
             "https://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/ami_public_manual_1.6.2.zip"
         let zipFile = annotationsDir.appendingPathComponent("ami_public_manual_1.6.2.zip")
-        let zipSuccess = await downloadAnnotationFile(from: zipURL, to: zipFile)
+
+        var zipSuccess = false
+        var attemptsMade = 0
+        let maxAttempts = 3
+        for attempt in 1...maxAttempts {
+            attemptsMade = attempt
+            zipSuccess = await downloadAnnotationFile(from: zipURL, to: zipFile)
+            if zipSuccess { break }
+            logger.warning("Annotation download attempt \(attempt)/\(maxAttempts) failed")
+            guard attempt < maxAttempts else { break }
+            do {
+                try await Task.sleep(nanoseconds: UInt64(attempt) * 2_000_000_000)
+            } catch {
+                // Task.sleep throws only when the task is cancelled; retrying with
+                // no delay would just hammer the server, so give up immediately.
+                logger.warning("Annotation download cancelled; skipping remaining retries")
+                break
+            }
+        }
 
         if !zipSuccess {
-            logger.error("Failed to download AMI annotations")
+            logger.error("Failed to download AMI annotations after \(attemptsMade) attempt(s)")
             return
         }
 
diff --git a/Tests/FluidAudioTests/CLI/AMIParserTests.swift b/Tests/FluidAudioTests/CLI/AMIParserTests.swift
index eb00102df..4aa034846 100644
--- a/Tests/FluidAudioTests/CLI/AMIParserTests.swift
+++ b/Tests/FluidAudioTests/CLI/AMIParserTests.swift
@@ -69,6 +69,53 @@ final class AMIParserTests: XCTestCase {
         XCTAssertEqual(segments[2].end, 1.02, accuracy: 0.0001)
     }
 
+    /// With a nonexistent search root, `loadAMIGroundTruth` must throw
+    /// `annotationsNotFound` naming the "segments" subdirectory.
+    func testLoadAMIGroundTruthThrowsWhenAnnotationsMissing() throws {
+        let scratch = FileManager.default.temporaryDirectory
+        let missingRoot = scratch.appendingPathComponent(UUID().uuidString, isDirectory: true)
+
+        XCTAssertThrowsError(
+            try AMIParser.loadAMIGroundTruth(
+                for: "ES2004a", duration: 30, searchRoots: [missingRoot])
+        ) { error in
+            if case AMIParserError.annotationsNotFound(let subdirectory) = error {
+                XCTAssertEqual(subdirectory, "segments")
+            } else {
+                XCTFail("Expected annotationsNotFound, got \(error)")
+            }
+        }
+    }
+
+    /// With a nonexistent search root, `loadWordAlignedGroundTruth` must throw
+    /// `annotationsNotFound` naming the "words" subdirectory.
+    func testLoadWordAlignedGroundTruthThrowsWhenAnnotationsMissing() throws {
+        let scratch = FileManager.default.temporaryDirectory
+        let missingRoot = scratch.appendingPathComponent(UUID().uuidString, isDirectory: true)
+
+        XCTAssertThrowsError(
+            try AMIParser.loadWordAlignedGroundTruth(
+                for: "ES2004a", duration: 30, searchRoots: [missingRoot])
+        ) { error in
+            if case AMIParserError.annotationsNotFound(let subdirectory) = error {
+                XCTAssertEqual(subdirectory, "words")
+            } else {
+                XCTFail("Expected annotationsNotFound, got \(error)")
+            }
+        }
+    }
+
+    /// A fixture directory passed via `searchRoots` is resolved and parsed,
+    /// yielding the two word-aligned segments asserted below.
+    func testSearchBasedLoadersResolveFixtureRoot() throws {
+        let fixtureRoot = try makeAMIFixture()
+        let segments = try AMIParser.loadWordAlignedGroundTruth(
+            for: "ES2004a", duration: 30, searchRoots: [fixtureRoot])
+
+        // NOTE(review): the expected count depends on what makeAMIFixture writes — confirm.
+        XCTAssertEqual(segments.count, 2)
+    }
+
     private func makeAMIFixture() throws -> URL {
         let baseURL = FileManager.default.temporaryDirectory
             .appendingPathComponent(UUID().uuidString, isDirectory: true)