diff --git a/api-reference/endpoint/model/create-model.mdx b/api-reference/endpoint/model/create-model.mdx
index 6844cd8..bff874c 100644
--- a/api-reference/endpoint/model/create-model.mdx
+++ b/api-reference/endpoint/model/create-model.mdx
@@ -1,11 +1,13 @@
---
openapi: post /model
-title: 'Create Model'
-description: 'Create a new voice model'
+title: "Create Model"
+description: "Create a new voice model"
icon: "circle-plus"
iconType: "solid"
---
-Since this endpoint requires uploading file, it only accepts `multipart/form-data` and `application/msgpack`.
+ Since this endpoint uploads files, use `multipart/form-data` for regular REST
+ requests. Let your HTTP client set the multipart `Content-Type` boundary
+ automatically.
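+
+ For example, with Python's `requests` (a minimal sketch; the endpoint and
+ parameters follow the create-model examples, the file name is a
+ placeholder), passing `files=` is enough for the boundary to be set for you:
+
+ ```python
+ import requests
+
+ # Send only the Authorization header; `requests` generates the multipart
+ # Content-Type (including the boundary) because `files=` is used.
+ with open("sample1.mp3", "rb") as f:
+     response = requests.post(
+         "https://api.fish.audio/model",
+         headers={"Authorization": "Bearer YOUR_API_KEY"},
+         data={"type": "tts", "train_mode": "fast", "title": "My Voice Model"},
+         files=[("voices", f)],
+     )
+ print(response.status_code)
+ ```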
diff --git a/developer-guide/core-features/creating-models.mdx b/developer-guide/core-features/creating-models.mdx
index 13eed18..5f6da13 100644
--- a/developer-guide/core-features/creating-models.mdx
+++ b/developer-guide/core-features/creating-models.mdx
@@ -3,14 +3,15 @@ title: "Creating Voice Models"
description: "Learn how to create custom voice models with Fish Audio"
icon: "wand-magic-sparkles"
---
-import { AudioTranscript } from '/snippets/audio-transcript.jsx';
+
+import { AudioTranscript } from "/snippets/audio-transcript.jsx";
{/* speak-mintlify-hash: 6a43e7312895ae0c33a68fad2e95821fbecd8a5bfe0c250d3ee631871dc8d410 */}
+
-
## Overview
Create custom voice models to generate consistent, high-quality speech. You can create models through our web interface or programmatically via API.
@@ -23,21 +24,15 @@ The easiest way to create a voice model:
Visit [fish.audio](https://fish.audio) and log in
-
- Click on "Models" in your dashboard
-
-
- Select "Create New Model"
-
+ Click on "Models" in your dashboard
+ Select "Create New Model"
Add 1 or more voice samples (at least 10 seconds each)
Choose privacy settings and training options
-
- Click "Create" and wait for processing
-
+ Click "Create" and wait for processing
## Using the API
@@ -57,24 +52,23 @@ Create models with the Python or JavaScript SDK:
Then create a model:
```python
- from fish_audio_sdk import Session
-
- # Initialize session with your API key
- session = Session("your_api_key")
-
- # Create the model
- model = session.create_model(
- title="My Voice Model",
- description="Custom voice for storytelling",
- voices=[
- voice_file1.read(),
- voice_file2.read()
- ],
- cover_image=image_file.read() # Optional
- )
+ from fishaudio import FishAudio
+
+ client = FishAudio(api_key="your_api_key_here")
+
+ with open("sample1.mp3", "rb") as f1, open("sample2.wav", "rb") as f2:
+ voice = client.voices.create(
+ title="My Voice Model",
+ voices=[f1.read(), f2.read()],
+ description="Custom voice for storytelling",
+ visibility="private",
+ enhance_audio_quality=True,
+ )
- print(f"Model created: {model.id}")
+ # The Python SDK maps the REST `_id` field to `voice.id`.
+ print(f"Voice model ID: {voice.id}")
```
+
First, install the SDK:
@@ -84,30 +78,27 @@ Create models with the Python or JavaScript SDK:
```
Then create a model:
-
+
```javascript
import { FishAudioClient } from "fish-audio";
import { createReadStream } from "fs";
const fishAudio = new FishAudioClient({ apiKey: process.env.FISH_API_KEY });
- const title = "My Voice Model";
- const audioFile1 = createReadStream("sample1.mp3");
- // Optionally add more samples:
- // const audioFile2 = createReadStream("sample2.wav");
- const coverImageFile = createReadStream("cover.png"); // optional
-
try {
const response = await fishAudio.voices.ivc.create({
- title,
- voices: [audioFile1],
- cover_image: coverImageFile,
+ title: "My Voice Model",
+ voices: [
+ createReadStream("sample1.mp3"),
+ createReadStream("sample2.wav"),
+ ],
description: "Custom voice for storytelling",
visibility: "private",
+ enhance_audio_quality: true,
});
console.log("Voice created:", {
- id: response._id,
+ _id: response._id,
title: response.title,
state: response.state,
});
@@ -115,6 +106,7 @@ Create models with the Python or JavaScript SDK:
console.error("Create voice request failed:", err);
}
```
+
@@ -122,33 +114,54 @@ Create models with the Python or JavaScript SDK:
Create models directly using the REST API:
+
+ The REST API accepts uploaded audio as `multipart/form-data`. Let your HTTP
+ client set the multipart `Content-Type` boundary for you.
+
+
+
+ ```bash
+ curl --request POST "https://api.fish.audio/model" \
+ --header "Authorization: Bearer $FISH_API_KEY" \
+ --form "type=tts" \
+ --form "train_mode=fast" \
+ --form "title=My Voice Model" \
+ --form "visibility=private" \
+ --form "description=Custom voice model" \
+ --form "voices=@sample1.mp3" \
+ --form "voices=@sample2.wav" \
+ --form "enhance_audio_quality=true"
+ ```
+
```python
import requests
- response = requests.post(
- "https://api.fish.audio/model",
- files=[
- ("voices", open("sample1.mp3", "rb")),
- ("voices", open("sample2.wav", "rb"))
- ],
- data=[
- ("title", "My Voice Model"),
- ("description", "Custom voice model"),
- ("visibility", "private"),
- ("type", "tts"),
- ("train_mode", "fast"),
- ("enhance_audio_quality", "true")
- ],
- headers={
- "Authorization": "Bearer YOUR_API_KEY"
- }
- )
-
+ with open("sample1.mp3", "rb") as f1, open("sample2.wav", "rb") as f2:
+ response = requests.post(
+ "https://api.fish.audio/model",
+ headers={"Authorization": "Bearer YOUR_API_KEY"},
+ data=[
+ ("type", "tts"),
+ ("train_mode", "fast"),
+ ("title", "My Voice Model"),
+ ("description", "Custom voice model"),
+ ("visibility", "private"),
+ ("enhance_audio_quality", "true"),
+ ],
+ files=[
+ ("voices", f1),
+ ("voices", f2),
+ ],
+ )
+
+ response.raise_for_status()
result = response.json()
- print(f"Model ID: {result['id']}")
+ print(f"Model ID: {result['_id']}")
+ print(f"State: {result['state']}")
```
+
```javascript
@@ -164,8 +177,8 @@ Create models directly using the REST API:
const v1 = await readFile("sample1.mp3");
const v2 = await readFile("sample2.wav");
- form.append("voices", new File([v1], "sample1.mp3"));
- form.append("voices", new File([v2], "sample2.wav"));
+ form.append("voices", new Blob([v1]), "sample1.mp3");
+ form.append("voices", new Blob([v2]), "sample2.wav");
const res = await fetch("https://api.fish.audio/model", {
method: "POST",
@@ -173,9 +186,13 @@ Create models directly using the REST API:
body: form,
});
+ if (!res.ok) throw new Error(await res.text());
+
const result = await res.json();
- console.log("Model ID:", result.id);
+ console.log("Model ID:", result._id);
+ console.log("State:", result.state);
```
+
@@ -183,25 +200,32 @@ Create models directly using the REST API:
### Required Parameters
-| Parameter | Description | Type | Options |
-|---|---|---|---|
-| **title** | Name of your model | `string` | Any text |
-| **voices** | Audio samples | `Array` | .mp3, .wav, .m4a, .opus |
-| **type*** | Model type | `enum`| `tts` |
-| **train_mode*** | Model train mode, fast means model instantly available after creation | `enum` | `fast` |
+| Parameter | Description | Type | Options |
+| ---------------- | ------------------------------------------------------------------------------ | ----------------------- | ----------------------- |
+| **title** | Name of your model | `string` | Any text |
+| **voices** | One or more audio samples | `File` or `Array` | .mp3, .wav, .m4a, .opus |
+| **type\*** | Model type | `enum` | `tts` |
+| **train_mode\*** | Model train mode. `fast` means the model is instantly available after creation | `enum` | `fast` |
-*Automatically set by Python and JavaScript SDKs
+\*Automatically set by Python and JavaScript SDKs
### Optional Parameters
-| Parameter | Description | Type | Options |
-|---|---|---|---|
-| **visibility** | Who can use your model | `enum` | `private`, `public`, `unlist`<br />`default: public` |
-| **description** | Model description | `string` | Any text |
-| **cover_image** | Model cover image, required if the model is public | `File` | .jpg, .png |
-| **texts** | Transcripts of audio samples | `Array` | Must match number of audio files |
-| **tags** | Tags for your model | `string[]` | Any text |
-| **enhance_audio_quality** | Remove background noise | `boolean` | `true`, `false`<br />`default: false` |
+| Parameter | Description | Type | Options |
+| ------------------------- | ------------------------------------------------------------------- | --------------------------- | ---------------------------------------------------- |
+| **visibility**            | Who can use your model                                               | `enum`                      | `private`, `public`, `unlist`<br />`default: public` |
+| **description** | Model description | `string` or `null` | Any text |
+| **cover_image** | Model cover image, required if the model is public | `File` | .jpg, .png |
+| **texts** | Transcripts of audio samples. If omitted, ASR transcribes the audio | `string` or `Array` | Must match number of audio files |
+| **tags** | Tags for your model | `string` or `Array` | Any text |
+| **enhance_audio_quality** | Remove background noise and normalize audio                          | `boolean`                   | `true`, `false`<br />`default: true`                 |
+| **generate_sample**       | Generate a default sample text for the model                         | `boolean`                   | `true`, `false`<br />`default: false`                |
+
+
+ The REST API defaults `visibility` to `public`. The SDK examples above set
+ `visibility` to `private`, which is safer for personal voice models and
+ avoids the `cover_image` requirement that applies to public models.
+
For detailed explanations view our [API reference](/api-reference/endpoint/model/create-model).
@@ -210,10 +234,12 @@ For detailed explanations view our [API reference](/api-reference/endpoint/model
### Quality Guidelines
**Minimum Requirements:**
+
- At least 1 audio sample
- 10+ seconds per sample
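+
+A quick local duration check before uploading can catch short samples. This
+sketch uses the `mutagen` library purely for illustration (an assumption, not
+part of the Fish Audio SDK; any audio metadata library works):
+
+```python
+from mutagen import File  # assumption: pip install mutagen
+
+MIN_SECONDS = 10  # per-sample minimum from the guidelines above
+
+for path in ["sample1.mp3", "sample2.wav"]:
+    audio = File(path)  # returns None if the format is unrecognized
+    if audio is None or audio.info.length < MIN_SECONDS:
+        print(f"{path}: missing, unreadable, or shorter than {MIN_SECONDS}s")
+```
+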
**Best Practices:**
+
- Use multiple diverse samples
- 1 consistent speaker throughout
- Include different emotions and tones
@@ -227,21 +253,30 @@ Including text transcripts improves model quality:
```python
- response = requests.post(
- "https://api.fish.audio/model",
- files=[
- ("voices", open("hello.mp3", "rb")),
- ("voices", open("world.wav", "rb"))
- ],
- data=[
- ("title", "Enhanced Model"),
- ("texts", "Hello, this is my first recording."),
- ("texts", "Welcome to the world of AI voices."),
- # ... other parameters
- ],
- headers={"Authorization": "Bearer YOUR_API_KEY"}
- )
+ import requests
+
+ with open("hello.mp3", "rb") as f1, open("world.wav", "rb") as f2:
+ response = requests.post(
+ "https://api.fish.audio/model",
+ headers={"Authorization": "Bearer YOUR_API_KEY"},
+ files=[
+ ("voices", f1),
+ ("voices", f2),
+ ],
+ data=[
+ ("type", "tts"),
+ ("train_mode", "fast"),
+ ("title", "Enhanced Model"),
+ ("texts", "Hello, this is my first recording."),
+ ("texts", "Welcome to the world of AI voices."),
+ ("visibility", "private"),
+ ],
+ )
+
+ response.raise_for_status()
+ print(response.json()["_id"])
```
+
```javascript
@@ -267,35 +302,38 @@ Including text transcripts improves model quality:
console.log("Model ID:", response._id);
```
+
-Text transcripts must match the exact number of audio files. If you provide 3 audio files, you must provide exactly 3 text transcripts.
+ Text transcripts must match the exact number of audio files. If you provide 3
+ audio files, you must provide exactly 3 text transcripts.
## Using Your Model
Once training is complete:
+Use the SDK `voice.id` or the REST response `_id` as the TTS `reference_id`.
+
```python
- # Generate speech with your model
- response = requests.post(
- "https://api.fish.audio/v1/tts",
- json={
- "text": "Hello from my custom voice!",
- "model_id": model_id,
- "format": "mp3"
- },
- headers={"Authorization": "Bearer YOUR_API_KEY"}
+ from fishaudio import FishAudio
+ from fishaudio.utils import save
+
+ client = FishAudio(api_key="your_api_key_here")
+
+ audio = client.tts.convert(
+ text="Hello from my custom voice!",
+ reference_id="your_voice_model_id",
+ format="mp3",
)
- # Save the audio
- with open("output.mp3", "wb") as f:
- f.write(response.content)
+ save(audio, "output.mp3")
```
+
```javascript
@@ -306,7 +344,7 @@ Once training is complete:
const audio = await fishAudio.textToSpeech.convert({
text: "Hello from my custom voice!",
- model_id: "your_model_id_here",
+ reference_id: "your_voice_model_id",
format: "mp3",
});
@@ -314,6 +352,7 @@ Once training is complete:
await writeFile("output.mp3", buffer);
console.log("✓ Audio saved to output.mp3");
```
+
@@ -322,19 +361,32 @@ Once training is complete:
### Common Issues
**Model training fails:**
+
- Check audio quality and format
- Ensure single speaker in all samples
- Verify files are not corrupted
+- Confirm REST requests include `type=tts`, `train_mode=fast`, `title`, and at least one `voices` file
+- If `texts` are provided, make sure the count matches the number of `voices` files
**Poor voice quality:**
+
- Add more diverse audio samples
- Enable audio enhancement
- Use higher quality recording
+**Public model creation fails:**
+
+- Add a `cover_image`, or set `visibility` to `private` or `unlist` (sketch below)
+
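+A hedged sketch of that fix with Python `requests` (file names are
+placeholders): attach the `cover_image` in the same multipart request.
+
+```python
+import requests
+
+# Sketch: a public model also needs a cover_image part (.jpg or .png).
+with open("sample1.mp3", "rb") as voice, open("cover.png", "rb") as cover:
+    response = requests.post(
+        "https://api.fish.audio/model",
+        headers={"Authorization": "Bearer YOUR_API_KEY"},
+        data=[
+            ("type", "tts"),
+            ("train_mode", "fast"),
+            ("title", "My Voice Model"),
+            ("visibility", "public"),
+        ],
+        files=[
+            ("voices", voice),
+            ("cover_image", cover),
+        ],
+    )
+response.raise_for_status()
+```
+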
+**Cannot use the created voice in TTS:**
+
+- Use REST `_id` or SDK `voice.id` as the TTS `reference_id`
+- Check the model `state` with [Get Model](/api-reference/endpoint/model/get-model); it must be `trained` before use (see the sketch below)
+
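+To poll that state, here is a short sketch, assuming the Get Model endpoint is
+served at `GET /model/{id}` (see the API reference for the exact shape):
+
+```python
+import requests
+
+model_id = "your_voice_model_id"  # `_id` from the create response
+
+# Assumption: Get Model lives at GET /model/{id}.
+response = requests.get(
+    f"https://api.fish.audio/model/{model_id}",
+    headers={"Authorization": "Bearer YOUR_API_KEY"},
+)
+response.raise_for_status()
+print(response.json()["state"])  # e.g. "trained" once the model is ready
+```
+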
## Best Practices
1. **Start Simple:** Begin with 2-3 samples in fast mode to test
-2. **Iterate:** Refine with more samples and quality mode
+2. **Iterate:** Refine with cleaner samples, transcripts, and audio enhancement
3. **Document:** Keep track of which samples work best
4. **Test Thoroughly:** Try different texts and emotions
5. **Privacy First:** Keep personal models private
@@ -345,4 +397,4 @@ Need help creating models?
- **API Documentation:** [Full API Reference](/api-reference/introduction)
- **Discord Community:** [Join our Discord](https://discord.gg/fish-audio)
-- **Email Support:** support@fish.audio
\ No newline at end of file
+- **Email Support:** support@fish.audio