FluidInference · Alex-Wengg · Jul 4, 2026 · Jul 4, 2026 · Jul 4, 2026
diff --git a/.github/workflows/diarizer-benchmark.yml b/.github/workflows/diarizer-benchmark.yml
@@ -44,6 +44,12 @@ jobs:
           path: ~/FluidAudioDatasets/ami_official
           key: ${{ runner.os }}-ami-dataset-${{ hashFiles('Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift') }}
 
+      - name: Cache AMI annotations
+        uses: actions/cache@v4
+        with:
+          path: Datasets/ami_public_1.6.2
+          key: ${{ runner.os }}-ami-annotations-1.6.2
+
       - name: Build package
         run: swift build -c release
 
@@ -123,7 +129,9 @@ jobs:
           fi
 
       - name: Comment PR with Benchmark Results
-        if: always()
+        # Only comment when metrics were actually extracted — a failed run must
+        # show up as a red check, not a comment with garbage numbers (issue #752)
+        if: steps.extract.outcome == 'success'
         uses: actions/github-script@v7
         with:
           script: |

diff --git a/.github/workflows/offline-pipeline.yml b/.github/workflows/offline-pipeline.yml
@@ -24,6 +24,12 @@ jobs:
         with:
           swift-version: "6.1"
 
+      - name: Cache AMI annotations
+        uses: actions/cache@v4
+        with:
+          path: Datasets/ami_public_1.6.2
+          key: ${{ runner.os }}-ami-annotations-1.6.2
+
       - name: Build package
         run: swift build -c release
 

diff --git a/.github/workflows/sortformer-benchmark.yml b/.github/workflows/sortformer-benchmark.yml
@@ -44,6 +44,12 @@ jobs:
           path: ~/FluidAudioDatasets/ami_official
           key: ${{ runner.os }}-ami-dataset
 
+      - name: Cache AMI annotations
+        uses: actions/cache@v4
+        with:
+          path: Datasets/ami_public_1.6.2
+          key: ${{ runner.os }}-ami-annotations-1.6.2
+
       - name: Build package
         run: swift build -c release
 

diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
diff --git a/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift b/Sources/FluidAudioCLI/Commands/DiarizationBenchmark.swift
@@ -605,6 +605,13 @@ enum StreamDiarizationBenchmark {
             }
         }
 
+        // Fail loudly if no meeting produced a result (e.g. missing ground truth
+        // annotations) instead of exiting cleanly with empty metrics (issue #752).
+        guard !allResults.isEmpty else {
+            logger.error("❌ Benchmark produced no results — see errors above")
+            exit(1)
+        }
+
         // Print final summary
         printFinalSummary(results: allResults)
 
@@ -765,8 +772,9 @@ enum StreamDiarizationBenchmark {
             let totalElapsed = Date().timeIntervalSince(startTime)
             let finalRTFx = totalDuration / totalElapsed
 
-            // Load ground truth
-            let groundTruth = await AMIParser.loadAMIGroundTruth(
+            // Load ground truth (throws if annotations are missing — never scores
+            // against placeholder data, see issue #752)
+            let groundTruth = try AMIParser.loadAMIGroundTruth(
                 for: meetingName,
                 duration: Float(totalDuration)
             )
@@ -870,8 +878,9 @@ enum StreamDiarizationBenchmark {
                 logger.info("  RTFx: \(String(format: "%.1f", finalRTFx))x")
             }
 
-            // Load ground truth
-            let groundTruth = await AMIParser.loadAMIGroundTruth(
+            // Load ground truth (throws if annotations are missing — never scores
+            // against placeholder data, see issue #752)
+            let groundTruth = try AMIParser.loadAMIGroundTruth(
                 for: meetingName,
                 duration: Float(totalDuration)
             )

diff --git a/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift b/Sources/FluidAudioCLI/Commands/LSEENDBenchmark.swift
@@ -487,7 +487,7 @@ enum LSEENDBenchmark {
 
             if dataset == .ami {
                 print("   [REF] Using AMI word-aligned annotations")
-                referenceSegments = await AMIParser.loadWordAlignedDERReference(
+                referenceSegments = try AMIParser.loadWordAlignedDERReference(
                     for: meetingName,
                     duration: duration
                 )

diff --git a/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift b/Sources/FluidAudioCLI/Commands/SortformerBenchmark.swift
@@ -608,7 +608,7 @@ enum SortformerBenchmark {
             // Fall back to AMI word-aligned annotations if no RTTM available (AMI only)
             if groundTruth.isEmpty && dataset == .ami {
                 print("   [RTTM] No RTTM file, falling back to AMI word-aligned annotations")
-                groundTruth = await AMIParser.loadWordAlignedGroundTruth(
+                groundTruth = try AMIParser.loadWordAlignedGroundTruth(
                     for: meetingName,
                     duration: duration
                 )

diff --git a/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift b/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift
@@ -2,6 +2,25 @@
 import FluidAudio
 import Foundation
 
+/// Failure raised when AMI reference annotations cannot be located on disk.
+/// Thrown instead of being papered over: a transient annotation download failure
+/// once produced a bogus 80.8% DER report against placeholder ground truth (issue #752).
+enum AMIParserError: Error, LocalizedError {
+    case annotationsNotFound(subdirectory: String)
+
+    var errorDescription: String? {
+        switch self {
+        case let .annotationsNotFound(subdirectory):
+            let summary = "AMI annotations not found in any expected location. "
+            let layout =
+                "Expected structure: [path]/\(subdirectory)/ AND [path]/corpusResources/meetings.xml. "
+            let remedy =
+                "Run with --auto-download or download manually from https://groups.inf.ed.ac.uk/ami/download/"
+            return summary + layout + remedy
+        }
+    }
+}
+
 /// AMI annotation parser and ground truth handling
 struct AMIParser {
     private static let logger = AppLogger(category: "AMIParser")
@@ -40,41 +59,31 @@ struct AMIParser {
         return 4  // AMI meetings typically have 4 speakers
     }
 
-    /// Load AMI ground truth annotations for a specific meeting
+    /// Load AMI ground truth annotations for a specific meeting.
+    ///
+    /// Searches `searchRoots` (the default locations when nil) and throws if annotations
+    /// are missing or fail to load — a placeholder reference is never substituted.
     static func loadAMIGroundTruth(
-        for meetingId: String, duration: Float
-    ) async
-        -> [TimedSpeakerSegment]
-    {
-        guard let validAmiDir = findAnnotationRoot(requiringSubdirectory: "segments") else {
-            logger.warning("   AMI annotations not found in any expected location")
-            logger.warning(
-                "      📁 Expected structure: [path]/segments/ AND [path]/corpusResources/meetings.xml"
-            )
-            logger.warning(
-                "      🔧 To download annotations: visit https://groups.inf.ed.ac.uk/ami/download/"
-            )
-            logger.warning(
-                "      📋 Using simplified placeholder ground truth (causes poor DER performance)"
-            )
-            return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4)
+        for meetingId: String,
+        duration: Float,
+        searchRoots: [URL]? = nil
+    ) throws -> [TimedSpeakerSegment] {
+        guard
+            let validAmiDir = findAnnotationRoot(
+                requiringSubdirectory: "segments", searchRoots: searchRoots)
+        else {
+            throw AMIParserError.annotationsNotFound(subdirectory: "segments")
         }
 
         logger.info("   📖 Loading AMI annotations for meeting: \(meetingId)")
 
-        do {
-            let allSegments = try loadAMIGroundTruth(
-                for: meetingId,
-                in: validAmiDir,
-                duration: duration
-            )
-            logger.info("      Total segments loaded: \(allSegments.count)")
-            return allSegments
-        } catch {
-            logger.warning("      Failed to parse AMI annotations: \(error)")
-            logger.warning("      Using simplified placeholder instead")
-            return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4)
-        }
+        let allSegments = try loadAMIGroundTruth(
+            for: meetingId,
+            in: validAmiDir,
+            duration: duration
+        )
+        logger.info("      Total segments loaded: \(allSegments.count)")
+        return allSegments
     }
 
     /// Internal hook for tests and benchmark helpers that need deterministic parsing
@@ -163,35 +172,22 @@ struct AMIParser {
     static func loadFrameAlignedDERReference(
         for meetingId: String,
         duration: Float,
-        frameStep: Double = defaultReferenceFrameStepSeconds
-    ) async -> [DERSpeakerSegment] {
-        guard let validAmiDir = findAnnotationRoot(requiringSubdirectory: "segments") else {
-            logger.warning("   AMI annotations not found in any expected location")
-            logger.warning(
-                "      📁 Expected structure: [path]/segments/ AND [path]/corpusResources/meetings.xml"
-            )
-            logger.warning("      📋 Falling back to simplified placeholder ground truth")
-            return frameAlignedDERReference(
-                from: generateSimplifiedGroundTruth(duration: duration, speakerCount: 4),
-                frameStep: frameStep
-            )
+        frameStep: Double = defaultReferenceFrameStepSeconds,
+        searchRoots: [URL]? = nil  // nil: search the default annotation roots
+    ) throws -> [DERSpeakerSegment] {
+        guard
+            let validAmiDir = findAnnotationRoot(
+                requiringSubdirectory: "segments", searchRoots: searchRoots)
+        else {  // fail loudly: no placeholder reference is substituted
+            throw AMIParserError.annotationsNotFound(subdirectory: "segments")
         }
 
-        do {
-            return try loadFrameAlignedDERReference(
-                for: meetingId,
-                in: validAmiDir,
-                duration: duration,
-                frameStep: frameStep
-            )
-        } catch {
-            logger.warning("      Failed to parse AMI annotations: \(error)")
-            logger.warning("      Falling back to simplified placeholder ground truth")
-            return frameAlignedDERReference(
-                from: generateSimplifiedGroundTruth(duration: duration, speakerCount: 4),
-                frameStep: frameStep
-            )
-        }
+        return try loadFrameAlignedDERReference(
+            for: meetingId,
+            in: validAmiDir,
+            duration: duration,
+            frameStep: frameStep
+        )
     }
 
     static func loadFrameAlignedDERReference(
@@ -216,29 +212,22 @@ struct AMIParser {
     static func loadWordAlignedGroundTruth(
         for meetingId: String,
         duration: Float,
-        mergeGap: Double = defaultMergeGapSeconds
-    ) async -> [TimedSpeakerSegment] {
-        guard let validAmiDir = findAnnotationRoot(requiringSubdirectory: "words") else {
-            logger.warning("   AMI word annotations not found in any expected location")
-            logger.warning(
-                "      📁 Expected structure: [path]/words/ AND [path]/corpusResources/meetings.xml"
-            )
-            logger.warning("      📋 Falling back to simplified placeholder ground truth")
-            return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4)
+        mergeGap: Double = defaultMergeGapSeconds,
+        searchRoots: [URL]? = nil  // nil: search the default annotation roots
+    ) throws -> [TimedSpeakerSegment] {
+        guard
+            let validAmiDir = findAnnotationRoot(
+                requiringSubdirectory: "words", searchRoots: searchRoots)
+        else {  // fail loudly: no placeholder reference is substituted
+            throw AMIParserError.annotationsNotFound(subdirectory: "words")
         }
 
-        do {
-            return try loadWordAlignedGroundTruth(
-                for: meetingId,
-                in: validAmiDir,
-                duration: duration,
-                mergeGap: mergeGap
-            )
-        } catch {
-            logger.warning("      Failed to parse AMI word annotations: \(error)")
-            logger.warning("      Falling back to simplified placeholder ground truth")
-            return generateSimplifiedGroundTruth(duration: duration, speakerCount: 4)
-        }
+        return try loadWordAlignedGroundTruth(
+            for: meetingId,
+            in: validAmiDir,
+            duration: duration,
+            mergeGap: mergeGap
+        )
     }
 
     /// Internal hook for tests and benchmark helpers that need deterministic parsing
@@ -293,12 +282,14 @@ struct AMIParser {
     static func loadWordAlignedDERReference(
         for meetingId: String,
         duration: Float,
-        mergeGap: Double = defaultMergeGapSeconds
-    ) async -> [DERSpeakerSegment] {
-        let segments = await loadWordAlignedGroundTruth(
+        mergeGap: Double = defaultMergeGapSeconds,
+        searchRoots: [URL]? = nil
+    ) throws -> [DERSpeakerSegment] {
+        let segments = try loadWordAlignedGroundTruth(
             for: meetingId,
             duration: duration,
-            mergeGap: mergeGap
+            mergeGap: mergeGap,
+            searchRoots: searchRoots
         )
         return segments.map {
             DERSpeakerSegment(
@@ -330,34 +321,6 @@ struct AMIParser {
         }
     }
 
-    /// Generate simplified ground truth for testing
-    static func generateSimplifiedGroundTruth(
-        duration: Float, speakerCount: Int
-    )
-        -> [TimedSpeakerSegment]
-    {
-        let segmentDuration = duration / Float(speakerCount * 2)
-        var segments: [TimedSpeakerSegment] = []
-        let dummyEmbedding: [Float] = Array(repeating: 0.1, count: 512)
-
-        for i in 0..<(speakerCount * 2) {
-            let speakerId = "Speaker \((i % speakerCount) + 1)"
-            let startTime = Float(i) * segmentDuration
-            let endTime = min(startTime + segmentDuration, duration)
-
-            segments.append(
-                TimedSpeakerSegment(
-                    speakerId: speakerId,
-                    embedding: dummyEmbedding,
-                    startTimeSeconds: startTime,
-                    endTimeSeconds: endTime,
-                    qualityScore: 1.0
-                ))
-        }
-
-        return segments
-    }
-
     /// Generate consistent placeholder embeddings for each speaker
     static func generatePlaceholderEmbedding(for participantId: String) -> [Float] {
         // Generate a consistent embedding based on participant ID
@@ -388,8 +351,11 @@ struct AMIParser {
         ]
     }
 
-    private static func findAnnotationRoot(requiringSubdirectory subdirectory: String) -> URL? {
-        for path in possibleAnnotationRoots() {
+    private static func findAnnotationRoot(
+        requiringSubdirectory subdirectory: String,
+        searchRoots: [URL]? = nil
+    ) -> URL? {
+        for path in searchRoots ?? possibleAnnotationRoots() {
             let requiredDir = path.appendingPathComponent(subdirectory)
             let meetingsFile = path.appendingPathComponent("corpusResources/meetings.xml")
             let hasRequiredDir = FileManager.default.fileExists(atPath: requiredDir.path)

diff --git a/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift b/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift
@@ -191,14 +191,27 @@ struct DatasetDownloader {
             return
         }
 
-        // Download and extract AMI manual annotations v1.6.2
+        // Download and extract AMI manual annotations v1.6.2.
+        // The Edinburgh server is occasionally flaky, so retry with backoff —
+        // a single transient failure here once poisoned a CI benchmark run
+        // with placeholder ground truth (issue #752).
         let zipURL =
             "https://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/ami_public_manual_1.6.2.zip"
         let zipFile = annotationsDir.appendingPathComponent("ami_public_manual_1.6.2.zip")
-        let zipSuccess = await downloadAnnotationFile(from: zipURL, to: zipFile)
+
+        var zipSuccess = false
+        let maxAttempts = 3
+        for attempt in 1...maxAttempts {
+            zipSuccess = await downloadAnnotationFile(from: zipURL, to: zipFile)
+            if zipSuccess { break }
+            logger.warning("Annotation download attempt \(attempt)/\(maxAttempts) failed")
+            if attempt < maxAttempts {
+                try? await Task.sleep(nanoseconds: UInt64(attempt) * 2_000_000_000)
+            }
+        }
 
         if !zipSuccess {
-            logger.error("Failed to download AMI annotations")
+            logger.error("Failed to download AMI annotations after \(maxAttempts) attempts")
             return
         }