speechmatics · sam-s10s · Feb 11, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 25, 2026
diff --git a/.gitignore b/.gitignore
@@ -172,3 +172,7 @@ tmp/
 # Examples
 output/
 **/output.wav
+
+# macOS
+.DS_Store
+**/.DS_Store
diff --git a/sdk/rt/speechmatics/rt/_async_client.py b/sdk/rt/speechmatics/rt/_async_client.py
@@ -12,7 +12,6 @@
 from ._exceptions import TimeoutError
 from ._exceptions import TranscriptionError
 from ._logging import get_logger
-from ._models import AudioEncoding
 from ._models import AudioEventsConfig
 from ._models import AudioFormat
 from ._models import ClientMessageType
@@ -100,7 +99,9 @@ def __init__(
         self.on(ServerMessageType.WARNING, self._on_warning)
         self.on(ServerMessageType.AUDIO_ADDED, self._on_audio_added)
 
-        self._audio_format = AudioFormat(encoding=AudioEncoding.PCM_S16LE, sample_rate=44100, chunk_size=4096)
+        # Audio format is set when start_session is called with an explicit format.
+        # Deliberately None until then to avoid silently using incorrect defaults.
+        self._audio_format: Optional[AudioFormat] = None
 
         self._logger.debug("AsyncClient initialized (request_id=%s)", self._session.request_id)
 
@@ -138,10 +139,10 @@ async def start_session(
                 ...     await client.start_session()
                 ...     await client.send_audio(frame)
         """
-        if audio_format is not None:
-            self._audio_format = audio_format
 
-        await self._start_recognition_session(
+        # _start_recognition_session resolves defaults (e.g. AudioFormat() if None),
+        # so we capture the resolved format to keep _audio_format in sync.
+        _, self._audio_format = await self._start_recognition_session(
             transcription_config=transcription_config,
             audio_format=audio_format,
             translation_config=translation_config,
@@ -213,8 +214,13 @@ def audio_seconds_sent(self) -> float:
         """Number of audio seconds sent to the server.
 
         Raises:
-            ValueError: If the audio format does not have an encoding set.
+            ValueError: If called before start_session has set the audio format,
+                or if the audio format does not have an encoding set.
         """
+        # _audio_format is only set once start_session receives an explicit AudioFormat.
+        # Failing here prevents silently computing with wrong defaults (e.g. 44100Hz).
+        if self._audio_format is None:
+            raise ValueError("audio_seconds_sent is not available before start_session is called with an audio format")
         return self._audio_bytes_sent / (self._audio_format.sample_rate * self._audio_format.bytes_per_sample)
 
     async def transcribe(

diff --git a/sdk/voice/pyproject.toml b/sdk/voice/pyproject.toml
@@ -11,7 +11,7 @@ authors = [{ name = "Speechmatics", email = "support@speechmatics.com" }]
 license = "MIT"
 requires-python = ">=3.9"
 dependencies = [
-    "speechmatics-rt>=0.5.3",
+    "speechmatics-rt>=1.0.0",
     "pydantic>=2.10.6,<3",
     "numpy>=1.26.4,<3"
 ]
@@ -42,7 +42,7 @@ keywords = [
 smart = [
     "certifi>=2025.10.5",
     "onnxruntime>=1.20.1,<2",
-    "transformers>=4.57.0,<5",
+    "transformers>=4.57.0,<6",
 ]
 dev = [
     "black",

diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py
@@ -176,6 +176,10 @@ def __init__(
             preset_config = VoiceAgentConfigPreset.load(preset)
             config = VoiceAgentConfigPreset._merge_configs(preset_config, config)
 
+        # Validate the final config (deferred to allow overlay/preset merging first)
+        if config is not None:
+            config.validate_config()
+
         # Process the config
         self._config, self._transcription_config, self._audio_format = self._prepare_config(config)
 
@@ -310,20 +314,18 @@ def __init__(
         self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize)
         self._eot_calculation_task: Optional[asyncio.Task] = None
 
-        # Uses fixed EndOfUtterance message from STT
-        self._uses_fixed_eou: bool = (
-            self._eou_mode == EndOfUtteranceMode.FIXED
-            and not self._silero_detector
-            and not self._config.end_of_turn_config.use_forced_eou
-        )
-
-        # Uses ForceEndOfUtterance message
-        self._uses_forced_eou: bool = not self._uses_fixed_eou
+        # Forced end of utterance handling
+        #   FEOU is not used in FIXED mode, unless VAD has been enabled. It can / should
+        #   also be disabled during testing when not connected to an endpoint, as the
+        #   waiting for FEOU response will block the test.
+        self._use_forced_eou: bool = self._eou_mode is not EndOfUtteranceMode.FIXED or self._uses_silero_vad
         self._forced_eou_active: bool = False
         self._last_forced_eou_latency: float = 0.0
 
-        # Emit EOT prediction (uses _uses_forced_eou)
-        self._uses_eot_prediction: bool = self._eou_mode not in [
+        # Emit EOT prediction
+        #   EOT predictions are only relevant when not using the FIXED or EXTERNAL modes,
+        #   as these use different triggers to finalize the turn.
+        self._emit_eot_predictions: bool = self._eou_mode not in [
             EndOfUtteranceMode.FIXED,
             EndOfUtteranceMode.EXTERNAL,
         ]
@@ -360,8 +362,8 @@ def __init__(
             AudioEncoding.PCM_S16LE: 2,
         }.get(self._audio_format.encoding, 1)
 
-        # Default audio buffer
-        if not self._config.audio_buffer_length and (self._uses_smart_turn or self._uses_silero_vad):
+        # Default audio buffer (used when Silero VAD is enabled and with Smart Turn)
+        if not self._config.audio_buffer_length and self._uses_silero_vad:
             self._config.audio_buffer_length = 15.0
 
         # Audio buffer
@@ -447,9 +449,7 @@ def _prepare_config(
             )
 
         # Fixed end of Utterance
-        if bool(
-            config.end_of_utterance_mode == EndOfUtteranceMode.FIXED and not config.end_of_turn_config.use_forced_eou
-        ):
+        if config.end_of_utterance_mode == EndOfUtteranceMode.FIXED:
             transcription_config.conversation_config = ConversationConfig(
                 end_of_utterance_silence_trigger=config.end_of_utterance_silence_trigger,
             )
@@ -472,12 +472,15 @@ def _prepare_config(
     # LIFECYCLE METHODS
     # ============================================================================
 
-    async def connect(self) -> None:
+    async def connect(self, ws_headers: Optional[dict] = None) -> None:
         """Connect to the Speechmatics API.
 
         Establishes WebSocket connection and starts the transcription session.
         This must be called before sending audio.
 
+        Args:
+            ws_headers: Optional headers to pass to the WebSocket connection.
+
         Raises:
             Exception: If connection fails.
 
@@ -521,6 +524,7 @@ async def connect(self) -> None:
             await self.start_session(
                 transcription_config=self._transcription_config,
                 audio_format=self._audio_format,
+                ws_headers=ws_headers,
             )
             self._is_connected = True
             self._start_metrics_task()
@@ -659,8 +663,14 @@ async def send_audio(self, payload: bytes) -> None:
             return
 
         # Process with Silero VAD
-        if self._silero_detector:
-            asyncio.create_task(self._silero_detector.process_audio(payload))
+        if self._uses_silero_vad and self._silero_detector is not None:
+            asyncio.create_task(
+                self._silero_detector.process_audio(
+                    payload,
+                    sample_rate=self._audio_sample_rate,
+                    sample_width=self._audio_sample_width,
+                )
+            )
 
         # Add to audio buffer (use put_bytes to handle variable chunk sizes)
         if self._config.audio_buffer_length > 0:
@@ -717,14 +727,14 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None:
     # PUBLIC UTTERANCE / TURN MANAGEMENT
     # ============================================================================
 
-    def finalize(self, end_of_turn: bool = False) -> None:
+    def finalize(self, pad: float | None = None) -> None:
         """Finalize segments.
 
         This function will emit segments in the buffer without any further checks
         on the contents of the segments.
 
         Args:
-            end_of_turn: Whether to emit an end of turn message.
+            pad: the number of seconds to pad the timestamp for the FEOU (optional)
         """
 
         # Clear smart turn cutoff
@@ -738,8 +748,8 @@ async def emit() -> None:
             """Wait for EndOfUtterance if needed, then emit segments."""
 
             # Forced end of utterance message (only when no speaker is detected)
-            if self._config.end_of_turn_config.use_forced_eou:
-                await self._await_forced_eou()
+            if self._use_forced_eou:
+                await self._await_forced_eou(pad=pad)
 
             # Check if the turn has changed
             if self._turn_handler.handler_id != _turn_id:
@@ -749,7 +759,7 @@ async def emit() -> None:
             self._stt_message_queue.put_nowait(lambda: self._emit_segments(finalize=True, is_eou=True))
 
         # Call async task (only if not already waiting for forced EOU)
-        if not (self._config.end_of_turn_config.use_forced_eou and self._forced_eou_active):
+        if not self._forced_eou_active:
             asyncio.create_task(emit())
 
     # ============================================================================
@@ -788,8 +798,8 @@ def _evt_on_final_transcript(message: dict[str, Any]) -> None:
                 return
             self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=True))
 
-        # End of Utterance (FIXED mode only)
-        if self._uses_fixed_eou:
+        # End of Utterance - only when not using ForceEndOfUtterance messages
+        if not self._use_forced_eou:
 
             @self.on(ServerMessageType.END_OF_UTTERANCE)  # type: ignore[misc]
             def _evt_on_end_of_utterance(message: dict[str, Any]) -> None:
@@ -1121,7 +1131,7 @@ async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool =
                     self._last_fragment_end_time = max(self._last_fragment_end_time, fragment.end_time)
 
             # Evaluate for VAD (only done on partials)
-            await self._vad_evaluation(fragments, is_final=is_final)
+            await self._speaker_start_stop_evaluation(fragments, is_final=is_final)
 
             # Fragments to retain
             retained_fragments = [
@@ -1205,18 +1215,8 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio
         if change_filter and not changes.any(*change_filter):
             return
 
-        # Skip re-evaluation if transcripts are older than smart turn cutoff
-        if self._smart_turn_pending_cutoff is not None and self._current_view:
-            latest_end_time = max(
-                (f.end_time for f in self._current_view.fragments if f.end_time is not None), default=0.0
-            )
-
-            # If all fragments end before or at the cutoff, skip re-evaluation
-            if latest_end_time <= self._smart_turn_pending_cutoff:
-                return
-
         # Turn prediction
-        if self._uses_eot_prediction and self._uses_forced_eou and not self._forced_eou_active:
+        if self._emit_eot_predictions and not self._forced_eou_active and self._use_forced_eou:
 
             async def fn() -> None:
                 ttl = await self._calculate_finalize_delay()
@@ -1518,14 +1518,20 @@ async def _calculate_finalize_delay(
         annotation = annotation or AnnotationResult()
 
         # VAD enabled
-        if self._silero_detector:
+        if self._uses_silero_vad:
             annotation.add(AnnotationFlags.VAD_ACTIVE)
         else:
             annotation.add(AnnotationFlags.VAD_INACTIVE)
 
         # Smart Turn enabled
         if self._smart_turn_detector:
             annotation.add(AnnotationFlags.SMART_TURN_ACTIVE)
+            # If Smart Turn hasn't returned a result yet but is enabled, add NO_SIGNAL annotation.
+            # This covers the case where the TTL fires before VAD triggers Smart Turn inference.
+            if not annotation.has(AnnotationFlags.SMART_TURN_TRUE) and not annotation.has(
+                AnnotationFlags.SMART_TURN_FALSE
+            ):
+                annotation.add(AnnotationFlags.SMART_TURN_NO_SIGNAL)
         else:
             annotation.add(AnnotationFlags.SMART_TURN_INACTIVE)
 
@@ -1551,8 +1557,7 @@ async def _calculate_finalize_delay(
         delay = round(self._config.end_of_utterance_silence_trigger * multiplier, 3)
 
         # Trim off the most recent forced EOU delay if we're in forced EOU mode
-        if self._uses_forced_eou:
-            delay -= self._last_forced_eou_latency
+        delay -= self._last_forced_eou_latency
 
         # Clamp to max delay and adjust for TTFB
         clamped_delay = min(delay, self._config.end_of_utterance_max_delay)
@@ -1586,7 +1591,10 @@ async def _eot_prediction(
         # Wait for Smart Turn result
         if self._smart_turn_detector and end_time is not None:
             result = await self._smart_turn_prediction(end_time, self._config.language, speaker=speaker)
-            if result.prediction:
+            if result.error:
+                # No valid prediction — SMART_TURN_NO_SIGNAL will be applied by _calculate_finalize_delay
+                pass
+            elif result.prediction:
                 annotation.add(AnnotationFlags.SMART_TURN_TRUE)
             else:
                 annotation.add(AnnotationFlags.SMART_TURN_FALSE)
@@ -1667,7 +1675,7 @@ async def _smart_turn_prediction(
         # Return the prediction
         return prediction
 
-    async def _await_forced_eou(self, timeout: float = 1.0) -> None:
+    async def _await_forced_eou(self, timeout: float = 1.0, pad: float | None = None) -> None:
         """Await the forced end of utterance."""
 
         # Received EOU
@@ -1676,17 +1684,28 @@ async def _await_forced_eou(self, timeout: float = 1.0) -> None:
         # Add listener
         self.once(AgentServerMessageType.END_OF_UTTERANCE, lambda message: eou_received.set())
 
-        # Trigger EOU message
-        self._emit_diagnostic_message("ForceEndOfUtterance sent - waiting for EndOfUtterance")
-
         # Wait for EOU
         try:
             # Track the start time
             start_time = time.time()
             self._forced_eou_active = True
 
+            # Timings
+            audio_sent = self.audio_seconds_sent
+            # padding = self._config.end_of_turn_config.forced_eou_padding
+            padding = pad if pad else 0.0
+
+            # Establish amount of time to wait for EOU
+            timestamp: float = max(audio_sent + padding, 0.0)
+
+            # Info
+            info = {"audio_sent": audio_sent, "padding": padding, "timestamp": timestamp}
+
             # Send the force EOU and wait for the response
-            await self.force_end_of_utterance()
+            await self.force_end_of_utterance(timestamp=timestamp)
+            self._emit_diagnostic_message(f"ForceEndOfUtterance sent - waiting for EndOfUtterance ({info})")
+
+            # Wait for the response
             await asyncio.wait_for(eou_received.wait(), timeout=timeout)
 
             # Record the latency
@@ -1702,7 +1721,7 @@ async def _await_forced_eou(self, timeout: float = 1.0) -> None:
     # VAD (VOICE ACTIVITY DETECTION) / SPEAKER DETECTION
     # ============================================================================
 
-    async def _vad_evaluation(self, fragments: list[SpeechFragment], is_final: bool) -> None:
+    async def _speaker_start_stop_evaluation(self, fragments: list[SpeechFragment], is_final: bool) -> None:
         """Emit a VAD event.
 
         This will emit `SPEAKER_STARTED` and `SPEAKER_ENDED` events to the client and is
@@ -1850,18 +1869,20 @@ def _handle_silero_vad_result(self, result: SileroVADResult) -> None:
             annotation.add(AnnotationFlags.VAD_STARTED)
 
         # If speech has ended, we need to predict the end of turn
-        if result.speech_ended and self._uses_eot_prediction:
+        if self._emit_eot_predictions and result.speech_ended:
             """VAD-based end of turn prediction."""
 
             # Set cutoff to prevent late transcripts from cancelling finalization
             self._smart_turn_pending_cutoff = event_time
 
+            # Async callback
             async def fn() -> None:
                 ttl = await self._eot_prediction(
                     end_time=event_time, speaker=self._current_speaker, annotation=annotation
                 )
                 self._turn_handler.update_timer(ttl)
 
+            # Call the eot calculation asynchronously
             self._run_background_eot_calculation(fn, "silero_vad")
 
     async def _handle_speaker_started(self, speaker: Optional[str], event_time: float) -> None:
@@ -1878,8 +1899,7 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa
             await self._emit_start_of_turn(event_time)
 
         # Update the turn handler
-        if self._uses_forced_eou:
-            self._turn_handler.reset()
+        self._turn_handler.reset()
 
         # Emit the event
         self._emit_message(
@@ -1902,7 +1922,7 @@ async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: floa
         self._last_speak_end_latency = self._total_time - event_time
 
         # Turn prediction
-        if self._uses_eot_prediction and not self._forced_eou_active:
+        if self._emit_eot_predictions and not self._forced_eou_active:
 
             async def fn() -> None:
                 ttl = await self._eot_prediction(event_time, speaker)