Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 44 additions & 12 deletions sdk/batch/speechmatics/batch/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ class OperatingPoint(str, Enum):

ENHANCED = "enhanced"
STANDARD = "standard"
# Not yet available for general use. Support for omni-v1 models is coming soon.
OMNI = "omni-v1"


class NotificationContents(str, Enum):
Expand Down Expand Up @@ -101,10 +103,20 @@ class TranscriptionConfig:
defaults to None.
audio_filtering_config: Configuration for limiting the transcription of quiet audio.
Defaults to None.
language_hints: Configuration for the list of languages that are most likely to appear in your audio.
This improves accuracy by biasing recognition toward the specified languages.
Use ``language_hints_strict`` to control whether other languages can also be detected.
Applicable only for omni-v1 models. Support for omni-v1 models is coming soon.
language_hints_strict: Configuration that controls how strictly language hints are applied.
When ``True``, the transcript will only contain languages specified in ``language_hints``.
When ``False``, recognition is biased toward the specified languages while still allowing other
languages to be detected if present.
Applicable only for omni-v1 models. Support for omni-v1 models is coming soon.
"""

language: str = "en"
operating_point: OperatingPoint = OperatingPoint.ENHANCED
model: Optional[OperatingPoint] = None
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels strange that users will call the OperatingPoint enum for a field called model.

Copy link
Copy Markdown
Author

@rakeshv247 rakeshv247 May 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Users can set either model or operating_point — both are accepted. When model is set, it takes precedence and is sent as operating_point in the request; only operating_point ever goes over the wire. The model field is purely an alias added for ergonomic familiarity and it mirrors how LLM APIs (OpenAI, Anthropic, etc.) expose model selection, making the omni-v1 use case feel natural to users coming from that ecosystem.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh no, I understand that. This is just a nit from me. It feels odd calling

model = OperatingPoint.OMNI

If someone were to use the model field.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately 😞, we have to close this PR for now (if you have followed the thread).

output_locale: Optional[str] = None
diarization: Optional[str] = None
additional_vocab: Optional[list[dict[str, Any]]] = None
Expand All @@ -118,9 +130,15 @@ class TranscriptionConfig:
max_delay_mode: Optional[str] = None
transcript_filtering_config: Optional[TranscriptFilteringConfig] = None
audio_filtering_config: Optional[AudioFilteringConfig] = None
language_hints: Optional[list[str]] = None
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm probably ignorant to whether this has been discussed before, but would it be worth putting language_hints and language_hints_strict into a single config class like our other configs?

That way users could call

lang_config = LangConfig(
    hints = ["en", "jp"],
    strict = True

config = TranscriptionConfig(
    model = OperatingPoint.OMNI,
    lang_config = lang_config
)

Thoughts?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, that's a valid point! Forgot these! Nice shout

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't do that because language_hints and language_hints_strict are flat scalar fields in the transcription_config, just like their peers max_delay + max_delay_mode in the transcription_config. The other fields that are defined as dataclasses are real JSON objects rather than scalar types, for example: TranscriptFilteringConfig and AudioFilteringConfig. Does this make sense?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, and understand that point. Just wondered if it might be good from a user perspective to group everything that way. Happy with the explanation though :)

language_hints_strict: Optional[bool] = None

def to_dict(self) -> dict[str, Any]:
result: dict[str, Any] = {k: v for k, v in asdict(self).items() if v is not None}
if self.model:
# model is an alias for operating_point for omni-v1 models; they cannot coexist in the request.
result["operating_point"] = self.model
result.pop("model")
if self.transcript_filtering_config is not None:
result["transcript_filtering_config"] = self.transcript_filtering_config.to_dict()
if self.audio_filtering_config is not None:
Expand Down Expand Up @@ -710,6 +728,9 @@ class Transcript:
audio_event_summary: Optional audio event statistics.
"""

_LANG_PACK_WORD_DELIMITER_KEY = "word_delimiter"
_LANG_PACK_PER_LANG_DELIMITERS_KEY = "per_language_word_delimiters"

format: str
job: JobInfo
metadata: RecognitionMetadata
Expand Down Expand Up @@ -738,14 +759,23 @@ def transcript_text(self) -> str:
return ""

# Get language pack info for word delimiter
word_delimiter = " " # Default
if self.metadata and self.metadata.language_pack_info and "word_delimiter" in self.metadata.language_pack_info:
word_delimiter = self.metadata.language_pack_info["word_delimiter"]
default_word_delimiter = " " # Default
# Applicable only for the next gen models
per_lang_word_delimiters: dict = {}
if self.metadata and self.metadata.language_pack_info:
if self._LANG_PACK_WORD_DELIMITER_KEY in self.metadata.language_pack_info:
default_word_delimiter = self.metadata.language_pack_info[self._LANG_PACK_WORD_DELIMITER_KEY]

if self._LANG_PACK_PER_LANG_DELIMITERS_KEY in self.metadata.language_pack_info:
per_lang_word_delimiters = self.metadata.language_pack_info[self._LANG_PACK_PER_LANG_DELIMITERS_KEY]

# Group results by speaker and process
transcript_parts = []
current_speaker = None
current_group: list[str] = []
# Each entry is (word, delimiter), where delimiter is looked up from per_language_word_delimiters
# using the word's language code, falling back to the default word delimiter.
# For example, [("hello", " "), ("world", " ")]
current_group: list[tuple[str, str]] = []
Comment thread
rakeshv247 marked this conversation as resolved.

for result in self.results:
if not result.alternatives:
Expand All @@ -754,12 +784,15 @@ def transcript_text(self) -> str:
alternative = result.alternatives[0]
content = alternative.content
speaker = alternative.speaker
word_delimiter = default_word_delimiter
if alternative.language and alternative.language in per_lang_word_delimiters:
word_delimiter = per_lang_word_delimiters[alternative.language]

# Handle speaker changes
if speaker != current_speaker:
# Process accumulated group for previous speaker
if current_group:
text = self._join_content_items(current_group, word_delimiter)
text = self._join_content_items(current_group)
if current_speaker:
transcript_parts.append(f"SPEAKER {current_speaker}: {text}") # type: ignore[unreachable]
else:
Expand All @@ -768,27 +801,26 @@ def transcript_text(self) -> str:

current_speaker = speaker

# Add content to current group
# Add content to current group with its word delimiter
if content:
current_group.append(content)
current_group.append((content, word_delimiter))

# Process final group
if current_group:
text = self._join_content_items(current_group, word_delimiter)
text = self._join_content_items(current_group)
if current_speaker:
transcript_parts.append(f"SPEAKER {current_speaker}: {text}")
else:
transcript_parts.append(text)

return "\n".join(transcript_parts)

def _join_content_items(self, content_items: list[str], word_delimiter: str) -> str:
def _join_content_items(self, content_items: list[tuple[str, str]]) -> str:
"""
Join content items with appropriate spacing and punctuation handling.

Args:
content_items: List of content strings to join.
word_delimiter: Delimiter to use between words.
content_items: List of (content, word_delimiter) pairs to join.

Returns:
Properly formatted text string.
Expand All @@ -798,7 +830,7 @@ def _join_content_items(self, content_items: list[str], word_delimiter: str) ->

result: list[str] = []

for i, content in enumerate(content_items):
for i, (content, word_delimiter) in enumerate(content_items):
if not content:
continue

Expand Down
80 changes: 79 additions & 1 deletion tests/batch/test_models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from speechmatics.batch._models import JobConfig, TranscriptFilteringConfig, TranscriptionConfig
from speechmatics.batch._models import JobConfig, OperatingPoint, TranscriptFilteringConfig, TranscriptionConfig


class TestTranscriptFilteringConfigToDict:
Expand Down Expand Up @@ -127,3 +127,81 @@ def test_absent_output_config_is_none(self):
data = {"type": "transcription"}
job_config = JobConfig.from_dict(data)
assert job_config.output_config is None


class TestModelToDict:
    """Serialization of the ``model`` alias in ``TranscriptionConfig.to_dict``."""

    def test_model_serializes_as_operating_point(self):
        # ``model`` is an alias: it must be sent as ``operating_point`` and
        # never appear under its own key in the request payload.
        result = TranscriptionConfig(model=OperatingPoint.OMNI).to_dict()
        assert "model" not in result
        assert result["operating_point"] == OperatingPoint.OMNI

    def test_model_absent_leaves_operating_point_unchanged(self):
        # With no ``model`` set, an explicit ``operating_point`` passes through untouched.
        result = TranscriptionConfig(operating_point=OperatingPoint.ENHANCED).to_dict()
        assert "model" not in result
        assert result["operating_point"] == OperatingPoint.ENHANCED


class TestLanguageHintsToDict:
    """Serialization of ``language_hints`` / ``language_hints_strict``."""

    def test_language_hints_serializes_correctly(self):
        # Hints alone serialize; the strict flag stays absent when unset.
        result = TranscriptionConfig(language_hints=["en", "fr"]).to_dict()
        assert result["language_hints"] == ["en", "fr"]
        assert "language_hints_strict" not in result

    def test_language_hints_strict_true_serializes_correctly(self):
        # A True strict flag is carried through alongside the hints.
        config = TranscriptionConfig(language_hints=["en"], language_hints_strict=True)
        result = config.to_dict()
        assert result["language_hints"] == ["en"]
        assert result["language_hints_strict"] is True

    def test_language_hints_strict_false_serializes_correctly(self):
        # False is a meaningful value and must survive the None-filtering
        # in ``to_dict`` rather than being dropped.
        config = TranscriptionConfig(language_hints=["en"], language_hints_strict=False)
        result = config.to_dict()
        assert result["language_hints"] == ["en"]
        assert "language_hints_strict" in result
        assert result["language_hints_strict"] is False

    def test_language_hints_absent_when_none(self):
        # Unset hint fields never appear in the serialized config.
        result = TranscriptionConfig().to_dict()
        assert "language_hints" not in result
        assert "language_hints_strict" not in result


class TestLanguageHintsFromDict:
    """Deserialization of ``language_hints`` / ``language_hints_strict`` via ``JobConfig.from_dict``."""

    def test_language_hints_deserializes_correctly(self):
        # ``language_hints`` round-trips from a raw job-config payload.
        payload = {
            "type": "transcription",
            "transcription_config": {
                "language": "en",
                "language_hints": ["en", "fr"],
            },
        }
        job_config = JobConfig.from_dict(payload)
        assert job_config.transcription_config is not None
        assert job_config.transcription_config.language_hints == ["en", "fr"]

    def test_language_hints_strict_deserializes_correctly(self):
        # ``language_hints_strict`` round-trips from a raw job-config payload.
        payload = {
            "type": "transcription",
            "transcription_config": {
                "language": "en",
                "language_hints": ["en"],
                "language_hints_strict": True,
            },
        }
        job_config = JobConfig.from_dict(payload)
        assert job_config.transcription_config is not None
        assert job_config.transcription_config.language_hints_strict is True

    def test_absent_fields_are_none(self):
        # Both hint fields default to None when missing from the payload.
        payload = {
            "type": "transcription",
            "transcription_config": {"language": "en"},
        }
        job_config = JobConfig.from_dict(payload)
        assert job_config.transcription_config
        assert job_config.transcription_config.language_hints is None
        assert job_config.transcription_config.language_hints_strict is None