@@ -57,6 +57,7 @@ def create(
57
57
* ,
58
58
file : FileTypes ,
59
59
model : Union [str , AudioModel ],
60
+ chunking_strategy : Optional [transcription_create_params .ChunkingStrategy ] | NotGiven = NOT_GIVEN ,
60
61
include : List [TranscriptionInclude ] | NotGiven = NOT_GIVEN ,
61
62
response_format : Union [Literal ["json" ], NotGiven ] = NOT_GIVEN ,
62
63
language : str | NotGiven = NOT_GIVEN ,
@@ -118,6 +119,7 @@ def create(
118
119
file : FileTypes ,
119
120
model : Union [str , AudioModel ],
120
121
stream : Literal [True ],
122
+ chunking_strategy : Optional [transcription_create_params .ChunkingStrategy ] | NotGiven = NOT_GIVEN ,
121
123
include : List [TranscriptionInclude ] | NotGiven = NOT_GIVEN ,
122
124
language : str | NotGiven = NOT_GIVEN ,
123
125
prompt : str | NotGiven = NOT_GIVEN ,
@@ -152,6 +154,11 @@ def create(
152
154
153
155
Note: Streaming is not supported for the `whisper-1` model and will be ignored.
154
156
157
+ chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
158
+ first normalizes loudness and then uses voice activity detection (VAD) to choose
159
+ boundaries. A `server_vad` object can be provided to tweak VAD
160
+ parameters manually. If unset, the audio is transcribed as a single block.
161
+
155
162
include: Additional information to include in the transcription response. `logprobs` will
156
163
return the log probabilities of the tokens in the response to understand the
157
164
model's confidence in the transcription. `logprobs` only works with
@@ -200,6 +207,7 @@ def create(
200
207
file : FileTypes ,
201
208
model : Union [str , AudioModel ],
202
209
stream : bool ,
210
+ chunking_strategy : Optional [transcription_create_params .ChunkingStrategy ] | NotGiven = NOT_GIVEN ,
203
211
include : List [TranscriptionInclude ] | NotGiven = NOT_GIVEN ,
204
212
language : str | NotGiven = NOT_GIVEN ,
205
213
prompt : str | NotGiven = NOT_GIVEN ,
@@ -234,6 +242,11 @@ def create(
234
242
235
243
Note: Streaming is not supported for the `whisper-1` model and will be ignored.
236
244
245
+ chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
246
+ first normalizes loudness and then uses voice activity detection (VAD) to choose
247
+ boundaries. A `server_vad` object can be provided to tweak VAD
248
+ parameters manually. If unset, the audio is transcribed as a single block.
249
+
237
250
include: Additional information to include in the transcription response. `logprobs` will
238
251
return the log probabilities of the tokens in the response to understand the
239
252
model's confidence in the transcription. `logprobs` only works with
@@ -281,6 +294,7 @@ def create(
281
294
* ,
282
295
file : FileTypes ,
283
296
model : Union [str , AudioModel ],
297
+ chunking_strategy : Optional [transcription_create_params .ChunkingStrategy ] | NotGiven = NOT_GIVEN ,
284
298
include : List [TranscriptionInclude ] | NotGiven = NOT_GIVEN ,
285
299
language : str | NotGiven = NOT_GIVEN ,
286
300
prompt : str | NotGiven = NOT_GIVEN ,
@@ -299,6 +313,7 @@ def create(
299
313
{
300
314
"file" : file ,
301
315
"model" : model ,
316
+ "chunking_strategy" : chunking_strategy ,
302
317
"include" : include ,
303
318
"language" : language ,
304
319
"prompt" : prompt ,
@@ -357,6 +372,8 @@ async def create(
357
372
* ,
358
373
file : FileTypes ,
359
374
model : Union [str , AudioModel ],
375
+ chunking_strategy : Optional [transcription_create_params .ChunkingStrategy ] | NotGiven = NOT_GIVEN ,
376
+ include : List [TranscriptionInclude ] | NotGiven = NOT_GIVEN ,
360
377
response_format : Union [Literal ["json" ], NotGiven ] = NOT_GIVEN ,
361
378
language : str | NotGiven = NOT_GIVEN ,
362
379
prompt : str | NotGiven = NOT_GIVEN ,
@@ -369,7 +386,68 @@ async def create(
369
386
extra_query : Query | None = None ,
370
387
extra_body : Body | None = None ,
371
388
timeout : float | httpx .Timeout | None | NotGiven = NOT_GIVEN ,
372
- ) -> Transcription : ...
389
+ ) -> TranscriptionCreateResponse :
390
+ """
391
+ Transcribes audio into the input language.
392
+
393
+ Args:
394
+ file:
395
+ The audio file object (not file name) to transcribe, in one of these formats:
396
+ flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
397
+
398
+ model: ID of the model to use. The options are `gpt-4o-transcribe`,
399
+ `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
400
+ Whisper V2 model).
401
+
402
+ chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
403
+ first normalizes loudness and then uses voice activity detection (VAD) to choose
404
+ boundaries. A `server_vad` object can be provided to tweak VAD
405
+ parameters manually. If unset, the audio is transcribed as a single block.
406
+
407
+ include: Additional information to include in the transcription response. `logprobs` will
408
+ return the log probabilities of the tokens in the response to understand the
409
+ model's confidence in the transcription. `logprobs` only works with
410
+ response_format set to `json` and only with the models `gpt-4o-transcribe` and
411
+ `gpt-4o-mini-transcribe`.
412
+
413
+ language: The language of the input audio. Supplying the input language in
414
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
415
+ format will improve accuracy and latency.
416
+
417
+ prompt: An optional text to guide the model's style or continue a previous audio
418
+ segment. The
419
+ [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
420
+ should match the audio language.
421
+
422
+ response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
423
+ `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
424
+ the only supported format is `json`.
425
+
426
+ stream: If set to true, the model response data will be streamed to the client as it is
427
+ generated using
428
+ [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
429
+ See the
430
+ [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
431
+ for more information.
432
+
433
+ Note: Streaming is not supported for the `whisper-1` model and will be ignored.
434
+
435
+ temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
436
+ output more random, while lower values like 0.2 will make it more focused and
437
+ deterministic. If set to 0, the model will use
438
+ [log probability](https://en.wikipedia.org/wiki/Log_probability) to
439
+ automatically increase the temperature until certain thresholds are hit.
440
+
441
+ timestamp_granularities: The timestamp granularities to populate for this transcription.
442
+ `response_format` must be set to `verbose_json` to use timestamp granularities.
443
+ Either or both of these options are supported: `word` or `segment`. Note: There
444
+ is no additional latency for segment timestamps, but generating word timestamps
445
+ incurs additional latency.
446
+
447
+ extra_headers: Send extra headers
448
+
449
+ extra_query: Add additional query parameters to the request
450
+ """
373
451
374
452
@overload
375
453
async def create (
@@ -418,6 +496,7 @@ async def create(
418
496
file : FileTypes ,
419
497
model : Union [str , AudioModel ],
420
498
stream : Literal [True ],
499
+ chunking_strategy : Optional [transcription_create_params .ChunkingStrategy ] | NotGiven = NOT_GIVEN ,
421
500
include : List [TranscriptionInclude ] | NotGiven = NOT_GIVEN ,
422
501
language : str | NotGiven = NOT_GIVEN ,
423
502
prompt : str | NotGiven = NOT_GIVEN ,
@@ -452,6 +531,11 @@ async def create(
452
531
453
532
Note: Streaming is not supported for the `whisper-1` model and will be ignored.
454
533
534
+ chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
535
+ first normalizes loudness and then uses voice activity detection (VAD) to choose
536
+ boundaries. A `server_vad` object can be provided to tweak VAD
537
+ parameters manually. If unset, the audio is transcribed as a single block.
538
+
455
539
include: Additional information to include in the transcription response. `logprobs` will
456
540
return the log probabilities of the tokens in the response to understand the
457
541
model's confidence in the transcription. `logprobs` only works with
@@ -500,6 +584,7 @@ async def create(
500
584
file : FileTypes ,
501
585
model : Union [str , AudioModel ],
502
586
stream : bool ,
587
+ chunking_strategy : Optional [transcription_create_params .ChunkingStrategy ] | NotGiven = NOT_GIVEN ,
503
588
include : List [TranscriptionInclude ] | NotGiven = NOT_GIVEN ,
504
589
language : str | NotGiven = NOT_GIVEN ,
505
590
prompt : str | NotGiven = NOT_GIVEN ,
@@ -534,6 +619,11 @@ async def create(
534
619
535
620
Note: Streaming is not supported for the `whisper-1` model and will be ignored.
536
621
622
+ chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
623
+ first normalizes loudness and then uses voice activity detection (VAD) to choose
624
+ boundaries. A `server_vad` object can be provided to tweak VAD
625
+ parameters manually. If unset, the audio is transcribed as a single block.
626
+
537
627
include: Additional information to include in the transcription response. `logprobs` will
538
628
return the log probabilities of the tokens in the response to understand the
539
629
model's confidence in the transcription. `logprobs` only works with
@@ -581,6 +671,7 @@ async def create(
581
671
* ,
582
672
file : FileTypes ,
583
673
model : Union [str , AudioModel ],
674
+ chunking_strategy : Optional [transcription_create_params .ChunkingStrategy ] | NotGiven = NOT_GIVEN ,
584
675
include : List [TranscriptionInclude ] | NotGiven = NOT_GIVEN ,
585
676
language : str | NotGiven = NOT_GIVEN ,
586
677
prompt : str | NotGiven = NOT_GIVEN ,
@@ -599,6 +690,7 @@ async def create(
599
690
{
600
691
"file" : file ,
601
692
"model" : model ,
693
+ "chunking_strategy" : chunking_strategy ,
602
694
"include" : include ,
603
695
"language" : language ,
604
696
"prompt" : prompt ,
0 commit comments