diff --git a/sdk/batch/speechmatics/batch/_models.py b/sdk/batch/speechmatics/batch/_models.py index c375d34..1d14b32 100644 --- a/sdk/batch/speechmatics/batch/_models.py +++ b/sdk/batch/speechmatics/batch/_models.py @@ -44,6 +44,8 @@ class OperatingPoint(str, Enum): ENHANCED = "enhanced" STANDARD = "standard" + # Not yet available for general use. Support for omni-v1 models is coming soon. + OMNI = "omni-v1" class NotificationContents(str, Enum): @@ -101,10 +103,20 @@ class TranscriptionConfig: defaults to None. audio_filtering_config: Configuration for limiting the transcription of quiet audio. Defaults to None. + language_hints: Configuration for the list of languages that are most likely to appear in your audio. + This improves accuracy by biasing recognition toward the specified languages. + Use ``language_hints_strict`` to control whether other languages can also be detected. + Applicable only for omni-v1 models. Support for omni-v1 models is coming soon. + language_hints_strict: Configuration that controls how strictly language hints are applied. + When ``True``, the transcript will only contain languages specified in ``language_hints``. + When ``False``, recognition is biased toward the specified languages while still allowing other + languages to be detected if present. + Applicable only for omni-v1 models. Support for omni-v1 models is coming soon. 
""" language: str = "en" operating_point: OperatingPoint = OperatingPoint.ENHANCED + model: Optional[OperatingPoint] = None output_locale: Optional[str] = None diarization: Optional[str] = None additional_vocab: Optional[list[dict[str, Any]]] = None @@ -118,9 +130,15 @@ class TranscriptionConfig: max_delay_mode: Optional[str] = None transcript_filtering_config: Optional[TranscriptFilteringConfig] = None audio_filtering_config: Optional[AudioFilteringConfig] = None + language_hints: Optional[list[str]] = None + language_hints_strict: Optional[bool] = None def to_dict(self) -> dict[str, Any]: result: dict[str, Any] = {k: v for k, v in asdict(self).items() if v is not None} + if self.model: + # model is an alias for operating_point for omni-v1 models; they cannot coexist in the request. + result["operating_point"] = self.model + result.pop("model") if self.transcript_filtering_config is not None: result["transcript_filtering_config"] = self.transcript_filtering_config.to_dict() if self.audio_filtering_config is not None: @@ -710,6 +728,9 @@ class Transcript: audio_event_summary: Optional audio event statistics. 
""" + _LANG_PACK_WORD_DELIMITER_KEY = "word_delimiter" + _LANG_PACK_PER_LANG_DELIMITERS_KEY = "per_language_word_delimiters" + format: str job: JobInfo metadata: RecognitionMetadata @@ -738,14 +759,23 @@ def transcript_text(self) -> str: return "" # Get language pack info for word delimiter - word_delimiter = " " # Default - if self.metadata and self.metadata.language_pack_info and "word_delimiter" in self.metadata.language_pack_info: - word_delimiter = self.metadata.language_pack_info["word_delimiter"] + default_word_delimiter = " " # Default + # Applicable only for the next gen models + per_lang_word_delimiters: dict = {} + if self.metadata and self.metadata.language_pack_info: + if self._LANG_PACK_WORD_DELIMITER_KEY in self.metadata.language_pack_info: + default_word_delimiter = self.metadata.language_pack_info[self._LANG_PACK_WORD_DELIMITER_KEY] + + if self._LANG_PACK_PER_LANG_DELIMITERS_KEY in self.metadata.language_pack_info: + per_lang_word_delimiters = self.metadata.language_pack_info[self._LANG_PACK_PER_LANG_DELIMITERS_KEY] # Group results by speaker and process transcript_parts = [] current_speaker = None - current_group: list[str] = [] + # Each entry is (word, delimiter), where delimiter is looked up from per_language_word_delimiters + # using the word's language code, falling back to the default word delimiter. 
+ # For example, [("hello", " "), ("world", " ")] + current_group: list[tuple[str, str]] = [] for result in self.results: if not result.alternatives: @@ -754,12 +784,15 @@ def transcript_text(self) -> str: alternative = result.alternatives[0] content = alternative.content speaker = alternative.speaker + word_delimiter = default_word_delimiter + if alternative.language and alternative.language in per_lang_word_delimiters: + word_delimiter = per_lang_word_delimiters[alternative.language] # Handle speaker changes if speaker != current_speaker: # Process accumulated group for previous speaker if current_group: - text = self._join_content_items(current_group, word_delimiter) + text = self._join_content_items(current_group) if current_speaker: transcript_parts.append(f"SPEAKER {current_speaker}: {text}") # type: ignore[unreachable] else: @@ -768,13 +801,13 @@ def transcript_text(self) -> str: current_speaker = speaker - # Add content to current group + # Add content to current group with its word delimiter if content: - current_group.append(content) + current_group.append((content, word_delimiter)) # Process final group if current_group: - text = self._join_content_items(current_group, word_delimiter) + text = self._join_content_items(current_group) if current_speaker: transcript_parts.append(f"SPEAKER {current_speaker}: {text}") else: @@ -782,13 +815,12 @@ def transcript_text(self) -> str: return "\n".join(transcript_parts) - def _join_content_items(self, content_items: list[str], word_delimiter: str) -> str: + def _join_content_items(self, content_items: list[tuple[str, str]]) -> str: """ Join content items with appropriate spacing and punctuation handling. Args: - content_items: List of content strings to join. - word_delimiter: Delimiter to use between words. + content_items: List of (content, word_delimiter) pairs to join. Returns: Properly formatted text string. 
@@ -798,7 +830,7 @@ def _join_content_items(self, content_items: list[str], word_delimiter: str) -> result: list[str] = [] - for i, content in enumerate(content_items): + for i, (content, word_delimiter) in enumerate(content_items): if not content: continue diff --git a/tests/batch/test_models.py b/tests/batch/test_models.py index d262685..743550b 100644 --- a/tests/batch/test_models.py +++ b/tests/batch/test_models.py @@ -1,4 +1,4 @@ -from speechmatics.batch._models import JobConfig, TranscriptFilteringConfig, TranscriptionConfig +from speechmatics.batch._models import JobConfig, OperatingPoint, TranscriptFilteringConfig, TranscriptionConfig class TestTranscriptFilteringConfigToDict: @@ -127,3 +127,81 @@ def test_absent_output_config_is_none(self): data = {"type": "transcription"} job_config = JobConfig.from_dict(data) assert job_config.output_config is None + + +class TestModelToDict: + def test_model_serializes_as_operating_point(self): + config = TranscriptionConfig(model=OperatingPoint.OMNI) + result = config.to_dict() + assert result["operating_point"] == OperatingPoint.OMNI + assert "model" not in result + + def test_model_absent_leaves_operating_point_unchanged(self): + config = TranscriptionConfig(operating_point=OperatingPoint.ENHANCED) + result = config.to_dict() + assert result["operating_point"] == OperatingPoint.ENHANCED + assert "model" not in result + + +class TestLanguageHintsToDict: + def test_language_hints_serializes_correctly(self): + config = TranscriptionConfig(language_hints=["en", "fr"]) + result = config.to_dict() + assert result["language_hints"] == ["en", "fr"] + assert "language_hints_strict" not in result + + def test_language_hints_strict_true_serializes_correctly(self): + config = TranscriptionConfig(language_hints=["en"], language_hints_strict=True) + result = config.to_dict() + assert result["language_hints"] == ["en"] + assert result["language_hints_strict"] is True + + def 
test_language_hints_strict_false_serializes_correctly(self): + config = TranscriptionConfig(language_hints=["en"], language_hints_strict=False) + result = config.to_dict() + assert result["language_hints"] == ["en"] + assert "language_hints_strict" in result + assert result["language_hints_strict"] is False + + def test_language_hints_absent_when_none(self): + config = TranscriptionConfig() + result = config.to_dict() + assert "language_hints" not in result + assert "language_hints_strict" not in result + + +class TestLanguageHintsFromDict: + def test_language_hints_deserializes_correctly(self): + data = { + "type": "transcription", + "transcription_config": { + "language": "en", + "language_hints": ["en", "fr"], + }, + } + job_config = JobConfig.from_dict(data) + assert job_config.transcription_config is not None + assert job_config.transcription_config.language_hints == ["en", "fr"] + + def test_language_hints_strict_deserializes_correctly(self): + data = { + "type": "transcription", + "transcription_config": { + "language": "en", + "language_hints": ["en"], + "language_hints_strict": True, + }, + } + job_config = JobConfig.from_dict(data) + assert job_config.transcription_config is not None + assert job_config.transcription_config.language_hints_strict is True + + def test_absent_fields_are_none(self): + data = { + "type": "transcription", + "transcription_config": {"language": "en"}, + } + job_config = JobConfig.from_dict(data) + assert job_config.transcription_config + assert job_config.transcription_config.language_hints is None + assert job_config.transcription_config.language_hints_strict is None