diff --git a/sdk/voice/pyproject.toml b/sdk/voice/pyproject.toml index 9006bd1..3edf131 100644 --- a/sdk/voice/pyproject.toml +++ b/sdk/voice/pyproject.toml @@ -11,7 +11,7 @@ authors = [{ name = "Speechmatics", email = "support@speechmatics.com" }] license = "MIT" requires-python = ">=3.9" dependencies = [ - "speechmatics-rt>=0.5.3", + "speechmatics-rt>=1.0.0", "pydantic>=2.10.6,<3", "numpy>=1.26.4,<3" ] @@ -42,7 +42,7 @@ keywords = [ smart = [ "certifi>=2025.10.5", "onnxruntime>=1.20.1,<2", - "transformers>=4.57.0,<5", + "transformers>=4.57.0,<6", ] dev = [ "black", diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py index c0988dd..09acda0 100644 --- a/sdk/voice/speechmatics/voice/_client.py +++ b/sdk/voice/speechmatics/voice/_client.py @@ -717,14 +717,14 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None: # PUBLIC UTTERANCE / TURN MANAGEMENT # ============================================================================ - def finalize(self, end_of_turn: bool = False) -> None: + def finalize(self, pad: float | None = None) -> None: """Finalize segments. This function will emit segments in the buffer without any further checks on the contents of the segments. Args: - end_of_turn: Whether to emit an end of turn message. 
+ pad: Padding in seconds for the ForceEndOfUtterance timestamp (None = env var, then config). """ # Clear smart turn cutoff @@ -738,8 +738,8 @@ async def emit() -> None: """Wait for EndOfUtterance if needed, then emit segments.""" # Forced end of utterance message (only when no speaker is detected) - if self._config.end_of_turn_config.use_forced_eou: - await self._await_forced_eou() + if self._uses_forced_eou: + await self._await_forced_eou(pad=pad) # Check if the turn has changed if self._turn_handler.handler_id != _turn_id: @@ -1667,7 +1667,7 @@ async def _smart_turn_prediction( # Return the prediction return prediction - async def _await_forced_eou(self, timeout: float = 1.0) -> None: + async def _await_forced_eou(self, timeout: float = 1.0, pad: float | None = None) -> None: """Await the forced end of utterance.""" # Received EOU @@ -1676,17 +1676,36 @@ async def _await_forced_eou(self, timeout: float = 1.0) -> None: # Add listener self.once(AgentServerMessageType.END_OF_UTTERANCE, lambda message: eou_received.set()) - # Trigger EOU message - self._emit_diagnostic_message("ForceEndOfUtterance sent - waiting for EndOfUtterance") - # Wait for EOU try: # Track the start time start_time = time.time() self._forced_eou_active = True + # Timings + audio_sent = self.audio_seconds_sent + + # Padding precedence: explicit `pad` arg > SPEECHMATICS_FEOU_PAD env var > config + if pad is not None: + padding = pad + else: + env_pad = os.environ.get("SPEECHMATICS_FEOU_PAD") + if env_pad is not None: + padding = float(env_pad) + else: + padding = self._config.end_of_turn_config.forced_eou_padding + + # Timestamp sent with the forced EOU: audio sent so far plus padding, floored at zero + timestamp: float = max(audio_sent + padding, 0.0) + + # Info + info = {"audio_sent": audio_sent, "padding": padding, "timestamp": timestamp} + # Send the force EOU and wait for the response - await self.force_end_of_utterance() + await self.force_end_of_utterance(timestamp=timestamp) + self._emit_diagnostic_message(f"ForceEndOfUtterance sent - waiting for 
EndOfUtterance ({info})") + + # Wait for the response await asyncio.wait_for(eou_received.wait(), timeout=timeout) # Record the latency diff --git a/sdk/voice/speechmatics/voice/_models.py b/sdk/voice/speechmatics/voice/_models.py index b4a432c..9c6ecc6 100644 --- a/sdk/voice/speechmatics/voice/_models.py +++ b/sdk/voice/speechmatics/voice/_models.py @@ -372,7 +372,7 @@ class SpeechSegmentConfig(BaseModel): Parameters: add_trailing_eos: Add trailing end of sentence to segments. When enabled, segments are - emitted with missing trailing end of sentence added. Defaults to False. + emitted with missing trailing end of sentence added. Defaults to True. emit_sentences: Emit segments when a sentence has ended. A finalized segment is emitted as soon as a finalized end of sentence is detected. If a speaker continues to speak during @@ -384,7 +384,7 @@ class SpeechSegmentConfig(BaseModel): Defaults to None. """ - add_trailing_eos: bool = False + add_trailing_eos: bool = True emit_sentences: bool = True pause_mark: Optional[str] = None @@ -411,6 +411,7 @@ class EndOfTurnConfig(BaseModel): min_end_of_turn_delay: Minimum end of turn delay. penalties: List of end of turn penalty items. use_forced_eou: Whether to use forced end of utterance detection. + forced_eou_padding: Padding in seconds added to the ForceEndOfUtterance timestamp. Defaults to 0.2. """ base_multiplier: float = 1.0 @@ -439,6 +440,7 @@ class EndOfTurnConfig(BaseModel): ] ) use_forced_eou: bool = False + forced_eou_padding: float = 0.2 class VoiceActivityConfig(BaseModel):