From ac74b2c75c48d504ad8dee446b4355cc542174c4 Mon Sep 17 00:00:00 2001 From: Luoxianyi Date: Sun, 28 Jun 2026 05:25:42 +0800 Subject: [PATCH 1/6] feat: add aliyun bailina qwen3-tts-vc voice clone tts provider --- astrbot/core/config/default.py | 67 ++++++ astrbot/core/provider/manager.py | 4 + .../sources/dashscope_voice_clone_tts.py | 200 ++++++++++++++++++ .../en-US/features/config-metadata.json | 20 ++ .../ru-RU/features/config-metadata.json | 20 ++ .../zh-CN/features/config-metadata.json | 20 ++ 6 files changed, 331 insertions(+) create mode 100644 astrbot/core/provider/sources/dashscope_voice_clone_tts.py diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 7fb847dccd..343cc6c403 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -1742,6 +1742,25 @@ "dashscope_tts_voice": "loongstella", "timeout": "20", }, + "阿里云百炼 音色复刻 TTS(API)": { + "hint": "使用阿里云百炼「音色复刻」生成的专属音色(Qwen3-TTS-VC 系列)。" + "请先在百炼控制台或通过声音复刻 API 创建复刻音色,获取 voice_id 后填入下方配置。" + "API Key 获取: https://bailian.console.aliyun.com/?tab=model#/api-key;" + "声音复刻文档: https://help.aliyun.com/zh/model-studio/voice-cloning-user-guide", + "id": "dashscope_voice_clone_tts", + "provider": "dashscope", + "type": "dashscope_voice_clone_tts", + "provider_type": "text_to_speech", + "enable": False, + "api_key": "", + "model": "qwen3-tts-vc-2026-01-22", + "voice_id": "", + "language_type": "", + "workspace_id": "", + "region": "cn-beijing", + "base_url": "", + "timeout": "20", + }, "Azure TTS": { "id": "azure_tts", "type": "azure_tts", @@ -2323,6 +2342,54 @@ "hint": "Azure_TTS 服务的订阅密钥(注意不是令牌)", }, "dashscope_tts_voice": {"description": "音色", "type": "string"}, + "voice_id": { + "description": "复刻音色 ID", + "type": "string", + "hint": "由阿里云百炼「音色复刻」接口返回(output.voice,例如 yourVoice)。" + "请确保 voice_id 与所选模型属于同一次声音复刻创建结果。", + }, + "language_type": { + "description": "合成语种 (可选)", + "type": "string", + "hint": "合成音频的语种。留空表示由模型自动判断 (Auto)。" + "指定单一语种通常能显著提升合成质量。", + "options": [ + "", + "Auto", + "Chinese", + "English", + "German", + "Italian", + "Portuguese", + "Spanish", + "Japanese", + "Korean", + "French", + "Russian", + ], + }, + "workspace_id": { + "description": "workspace ID (可选)", + "type": "string", + "hint": "填写后会自动切换到百炼 workspace 专属域名 " + "(https://{WorkspaceId}.{region}.maas.aliyuncs.com)," + "通常能获得更好的性能和稳定性。可在百炼控制台「workspace 详情」页面查看。", + }, + "region": { + "description": "URL区域 (可选)", + "type": "string", + "hint": "仅在填写了 workspace ID 时生效。北京地域选择 cn-beijing,新加坡地域选择 ap-southeast-1。", + "options": [ + "cn-beijing", + "ap-southeast-1", + ], + }, + "base_url": { + "description": "自定义 DashScope Base URL (可选)", + "type": "string", + "hint": "完整的 HTTP API Base URL,例如 https://dashscope.aliyuncs.com/api/v1。" + "若填写则优先级高于 workspace ID + 地域 的组合。一般无需填写。", + }, "gm_resp_image_modal": { "description": "启用图片模态", "type": "bool", diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py index ae4001fcd6..23922e487c 100644 --- a/astrbot/core/provider/manager.py +++ b/astrbot/core/provider/manager.py @@ -451,6 +451,10 @@ def dynamic_import_provider(self, type: str) -> None: from .sources.dashscope_tts import ( ProviderDashscopeTTSAPI as ProviderDashscopeTTSAPI, ) + case "dashscope_voice_clone_tts": + from .sources.dashscope_voice_clone_tts import ( + ProviderDashscopeVoiceCloneTTSAPI as ProviderDashscopeVoiceCloneTTSAPI, + ) case "azure_tts": from .sources.azure_tts_source import ( AzureTTSProvider as AzureTTSProvider, diff --git a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py new file mode 100644 index 0000000000..9fd08b53c0 --- /dev/null +++ b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py @@ -0,0 +1,200 @@ +"""阿里云百炼 - 音色复刻(Qwen-TTS Voice Clone)TTS 提供商。 + +通过指定声音复刻产生的 voice_id(如 ``yourVoice``)与对应的 Qwen3 TTS-VC +合成模型(如 ``qwen3-tts-vc-2026-01-22``)调用阿里云 DashScope 的多模态生成 +接口完成语音合成。该提供商仅负责"使用"已经在百炼控制台中创建好的复刻音色, +音色的创建/管理流程请直接通过百炼控制台或 API 完成。 +""" + +from __future__ import annotations + +import asyncio +import base64 +import logging +import os +import uuid + +import aiohttp +import dashscope + +try: + from dashscope.aigc.multimodal_conversation import MultiModalConversation +except ImportError: # pragma: no cover - 老版本 dashscope 没有 Qwen TTS 能力 + MultiModalConversation = None + +from astrbot.core.utils.astrbot_path import get_astrbot_temp_path + +from ..entities import ProviderType +from ..provider import TTSProvider +from ..register import register_provider_adapter + + +@register_provider_adapter( + "dashscope_voice_clone_tts", + "阿里云百炼 音色复刻 TTS API (Qwen3-TTS-VC)", + provider_type=ProviderType.TEXT_TO_SPEECH, +) +class ProviderDashscopeVoiceCloneTTSAPI(TTSProvider): + """使用阿里云百炼 Qwen3-TTS-VC 系列模型合成"复刻音色"的 TTS 提供商。""" + + def __init__( + self, + provider_config: dict, + provider_settings: dict, + ) -> None: + super().__init__(provider_config, provider_settings) + self.chosen_api_key: str = provider_config.get("api_key", "") + # 复刻音色 ID,由百炼音色复刻接口返回(output.voice) + self.voice_id: str = provider_config.get( + "voice_id", + "", + ) + # 合成语种,可选;默认让模型自动判断 + self.language_type: str = provider_config.get( + "language_type", + "", + ) + # workspace ID(可选),填写后会切换到百炼 workspace 专属域名以获得更佳性能 + self.workspace_id: str = provider_config.get( + "workspace_id", + "", + ) + # 地域,默认 cn-beijing;可选 ap-southeast-1(新加坡) + self.region: str = ( + provider_config.get( + "region", + "cn-beijing", + ) + or "cn-beijing" + ) + # 自定义 base url(优先级最高),不填时根据 workspace_id / region 推断 + self.base_http_api_url: str = provider_config.get( + "base_url", + "", + ) + + self.set_model( + provider_config.get("model") or "qwen3-tts-vc-2026-01-22", + ) + self.timeout_ms = float(provider_config.get("timeout", 20)) * 1000 + + dashscope.api_key = self.chosen_api_key + resolved_base_url = self._resolve_base_url() + if resolved_base_url: + dashscope.base_http_api_url = resolved_base_url + + # public API# + async def get_audio(self, text: str) -> str: + model = self.get_model() + if not model: + raise RuntimeError("Dashscope Voice Clone TTS model is not configured.") + if not self.voice_id: + raise RuntimeError( + "未配置复刻音色 ID(voice_id)," + "请先在阿里云百炼控制台或 API 创建复刻音色后再填写。", + ) + + temp_dir = get_astrbot_temp_path() + os.makedirs(temp_dir, exist_ok=True) + + # 每次调用前确保 dashscope 全局配置使用本提供商指定的值。 + # 避免多 TTS 共存时被其它提供商覆盖。 + dashscope.api_key = self.chosen_api_key + resolved_base_url = self._resolve_base_url() + if resolved_base_url: + dashscope.base_http_api_url = resolved_base_url + + audio_bytes = await self._synthesize(model, text) + if not audio_bytes: + raise RuntimeError( + "音色复刻语音合成失败,返回内容为空。请检查模型名、voice_id " + "以及对应的 API Key/地域是否匹配。", + ) + + path = os.path.join( + temp_dir, + f"dashscope_voice_clone_tts_{uuid.uuid4()}.wav", + ) + with open(path, "wb") as f: + f.write(audio_bytes) + return path + + # internal helpers# + def _resolve_base_url(self) -> str: + """根据配置推断 DashScope HTTP base url。""" + if self.base_http_api_url: + return self.base_http_api_url.rstrip("/") + if self.workspace_id: + region = self.region or "cn-beijing" + return f"https://{self.workspace_id}.{region}.maas.aliyuncs.com/api/v1" + # 不指定专属域名时返回空字符串,使用 dashscope SDK 内置默认域名 + return "" + + def _call_qwen_tts(self, model: str, text: str): + if MultiModalConversation is None: + raise RuntimeError( + "dashscope SDK 缺少 MultiModalConversation。请升级 dashscope " + "至最新版本以使用 Qwen TTS 系列模型。", + ) + + kwargs = { + "model": model, + "messages": None, + "api_key": self.chosen_api_key, + "voice": self.voice_id, + "text": text, + } + if self.language_type: + kwargs["language_type"] = self.language_type + return MultiModalConversation.call(**kwargs) + + async def _synthesize(self, model: str, text: str) -> bytes | None: + loop = asyncio.get_running_loop() + response = await loop.run_in_executor( + None, + self._call_qwen_tts, + model, + text, + ) + audio_bytes = await self._extract_audio_from_response(response) + if not audio_bytes: + raise RuntimeError( + f"模型 '{model}' 音色复刻语音合成失败。原始返回: {response}", + ) + return audio_bytes + + async def _extract_audio_from_response(self, response) -> bytes | None: + output = getattr(response, "output", None) + audio_obj = getattr(output, "audio", None) if output is not None else None + if not audio_obj: + return None + + data_b64 = getattr(audio_obj, "data", None) + if data_b64: + try: + return base64.b64decode(data_b64) + except (ValueError, TypeError): + logging.exception("Failed to decode base64 audio data.") + return None + + url = getattr(audio_obj, "url", None) + if url: + return await self._download_audio_from_url(url) + return None + + async def _download_audio_from_url(self, url: str) -> bytes | None: + if not url: + return None + timeout = max(self.timeout_ms / 1000, 1) if self.timeout_ms else 20 + try: + async with ( + aiohttp.ClientSession() as session, + session.get( + url, + timeout=aiohttp.ClientTimeout(total=timeout), + ) as response, + ): + return await response.read() + except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as e: + logging.exception(f"Failed to download audio from URL {url}: {e}") + return None diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json index 31d1362b26..47e7ca7f73 100644 --- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json +++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json @@ -1440,6 +1440,26 @@ "dashscope_tts_voice": { "description": "Voice" }, + "voice_id": { + "description": "Voice Clone ID", + "hint": "Returned by the Alibaba Cloud Bailian Voice Cloning API (output.voice, e.g. yourVoice). Make sure the voice_id and the selected model belong to the same voice clone creation result." + }, + "language_type": { + "description": "Synthesis Language (optional)", + "hint": "Language for the synthesized audio. Leave empty for auto-detection (Auto). Specifying a single language usually improves synthesis quality significantly." + }, + "workspace_id": { + "description": "Workspace ID (optional)", + "hint": "When filled, the provider switches to the workspace-specific domain (https://{WorkspaceId}.{region}.maas.aliyuncs.com) for better performance and stability. Find it in the Bailian console under Workspace Details." + }, + "region": { + "description": "Region (optional)", + "hint": "Only effective when Workspace ID is set. Use cn-beijing for Beijing, ap-southeast-1 for Singapore." + }, + "base_url": { + "description": "Custom DashScope Base URL (optional)", + "hint": "Full HTTP API Base URL, e.g. https://dashscope.aliyuncs.com/api/v1. Takes precedence over Workspace ID + Region when set. Usually not needed." + }, "gm_resp_image_modal": { "description": "Enable image modality", "hint": "When enabled, responses can include images. Requires model support or it will error. See the Google Gemini website for supported models. Tip: if you need image generation, disable the `Enable member recognition` setting for better results." diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json index f0a1294d7c..f90a30ec26 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json +++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json @@ -1437,6 +1437,26 @@ "dashscope_tts_voice": { "description": "Голос" }, + "voice_id": { + "description": "ID клонированного голоса", + "hint": "Возвращается API клонирования голоса Alibaba Cloud Bailian (output.voice, напр. yourVoice). Убедитесь, что voice_id и выбранная модель относятся к одному результату клонирования голоса." + }, + "language_type": { + "description": "Язык синтеза (опционально)", + "hint": "Язык синтезируемого аудио. Оставьте пустым для автоопределения (Auto). Указание одного языка обычно значительно улучшает качество синтеза." + }, + "workspace_id": { + "description": "ID рабочего пространства (опционально)", + "hint": "При заполнении провайдер переключается на домен рабочего пространства (https://{WorkspaceId}.{region}.maas.aliyuncs.com) для лучшей производительности. Можно найти в консоли Bailian в разделе сведений о рабочем пространстве." + }, + "region": { + "description": "Регион (опционально)", + "hint": "Действует только при указании ID рабочего пространства. Используйте cn-beijing для Пекина, ap-southeast-1 для Сингапура." + }, + "base_url": { + "description": "Пользовательский DashScope Base URL (опционально)", + "hint": "Полный HTTP API Base URL, напр. https://dashscope.aliyuncs.com/api/v1. Приоритет выше, чем ID рабочего пространства + регион. Обычно не требуется." + }, "gm_resp_image_modal": { "description": "Включить визуальную модальность", "hint": "Если включено, ответы могут содержать изображения. Требует поддержки моделью. Совет: для генерации изображений отключите 'Распознавание участников'." diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json index 6ae988383f..2f4627e121 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json +++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json @@ -1442,6 +1442,26 @@ "dashscope_tts_voice": { "description": "音色" }, + "voice_id": { + "description": "复刻音色 ID", + "hint": "由阿里云百炼「音色复刻」接口返回(output.voice,例如 yourVoice)。请确保 voice_id 与所选模型属于同一次声音复刻创建结果。" + }, + "language_type": { + "description": "合成语种 (可选)", + "hint": "合成音频的语种。留空表示由模型自动判断 (Auto)。指定单一语种通常能显著提升合成质量。" + }, + "workspace_id": { + "description": "workspace ID (可选)", + "hint": "填写后会自动切换到百炼 workspace 专属域名 (https://{WorkspaceId}.{region}.maas.aliyuncs.com),通常能获得更好的性能和稳定性。可在百炼控制台「workspace 详情」页面查看。" + }, + "region": { + "description": "URL区域 (可选)", + "hint": "仅在填写了 workspace ID 时生效。北京地域选择 cn-beijing,新加坡地域选择 ap-southeast-1。" + }, + "base_url": { + "description": "自定义 DashScope Base URL (可选)", + "hint": "完整的 HTTP API Base URL,例如 https://dashscope.aliyuncs.com/api/v1。若填写则优先级高于 workspace ID + 地域 的组合。一般无需填写。" + }, "gm_resp_image_modal": { "description": "启用图片模态", "hint": "启用后,将支持返回图片内容。需要模型支持,否则会报错。具体支持模型请查看 Google Gemini 官方网站。温馨提示,如果您需要生成图片,请关闭 `启用群员识别` 配置获得更好的效果。" From f8467fa15bbe7cacaafef37b4ef4f9040fdc3e5b Mon Sep 17 00:00:00 2001 From: Luoxianyi <162295853+makuralymi@users.noreply.github.com> Date: Sun, 28 Jun 2026 13:56:47 +0800 Subject: [PATCH 2/6] Update astrbot/core/provider/sources/dashscope_voice_clone_tts.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- astrbot/core/provider/sources/dashscope_voice_clone_tts.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py index 9fd08b53c0..7b833982d6 100644 --- a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py +++ b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py @@ -78,10 +78,7 @@ def __init__( ) self.timeout_ms = float(provider_config.get("timeout", 20)) * 1000 - dashscope.api_key = self.chosen_api_key - resolved_base_url = self._resolve_base_url() - if resolved_base_url: - dashscope.base_http_api_url = resolved_base_url + # API Key 和 Base URL 将在每次调用时通过 kwargs 动态传入,避免修改全局配置 # public API# async def get_audio(self, text: str) -> str: From 8428eebe6772814c391b9ab40bd3233ac6a71313 Mon Sep 17 00:00:00 2001 From: Luoxianyi <162295853+makuralymi@users.noreply.github.com> Date: Sun, 28 Jun 2026 13:56:57 +0800 Subject: [PATCH 3/6] Update astrbot/core/provider/sources/dashscope_voice_clone_tts.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- astrbot/core/provider/sources/dashscope_voice_clone_tts.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py index 7b833982d6..1ffa565b1b 100644 --- a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py +++ b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py @@ -141,6 +141,9 @@ def _call_qwen_tts(self, model: str, text: str): "voice": self.voice_id, "text": text, } + resolved_base_url = self._resolve_base_url() + if resolved_base_url: + kwargs["base_http_api_url"] = resolved_base_url if self.language_type: kwargs["language_type"] = self.language_type return MultiModalConversation.call(**kwargs) From e274de920e6b1320d1a9ad463657148e480fffdc Mon Sep 17 00:00:00 2001 From: Luoxianyi <162295853+makuralymi@users.noreply.github.com> Date: Sun, 28 Jun 2026 13:57:05 +0800 Subject: [PATCH 4/6] Update astrbot/core/provider/sources/dashscope_voice_clone_tts.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../core/provider/sources/dashscope_voice_clone_tts.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py index 1ffa565b1b..7fda67777f 100644 --- a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py +++ b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py @@ -156,10 +156,16 @@ async def _synthesize(self, model: str, text: str) -> bytes | None: model, text, ) + if hasattr(response, "status_code") and response.status_code != 200: + raise RuntimeError( + f"DashScope API 调用失败,状态码: {response.status_code}," + f"错误码: {getattr(response, 'code', 'Unknown')}," + f"错误信息: {getattr(response, 'message', 'Unknown')}" + ) audio_bytes = await self._extract_audio_from_response(response) if not audio_bytes: raise RuntimeError( - f"模型 '{model}' 音色复刻语音合成失败。原始返回: {response}", + f"模型 '{model}' 音色复刻语音合成失败。返回内容为空。", ) return audio_bytes From f937060f93b4a2ed6d94a4efbe3b082ca4fae599 Mon Sep 17 00:00:00 2001 From: Luoxianyi <162295853+makuralymi@users.noreply.github.com> Date: Sun, 28 Jun 2026 13:57:14 +0800 Subject: [PATCH 5/6] Update astrbot/core/provider/sources/dashscope_voice_clone_tts.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- astrbot/core/provider/sources/dashscope_voice_clone_tts.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py index 7fda67777f..f3183eae85 100644 --- a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py +++ b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py @@ -200,6 +200,9 @@ async def _download_audio_from_url(self, url: str) -> bytes | None: timeout=aiohttp.ClientTimeout(total=timeout), ) as response, ): + if response.status != 200: + logging.error(f"Failed to download audio from URL {url}, HTTP status: {response.status}") + return None return await response.read() except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as e: logging.exception(f"Failed to download audio from URL {url}: {e}") From d3f0769ae9498f85d3fa30b978783cf4fe90dbb2 Mon Sep 17 00:00:00 2001 From: Luoxianyi <162295853+makuralymi@users.noreply.github.com> Date: Sun, 28 Jun 2026 13:57:24 +0800 Subject: [PATCH 6/6] Update astrbot/core/provider/sources/dashscope_voice_clone_tts.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- astrbot/core/provider/sources/dashscope_voice_clone_tts.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py index f3183eae85..68e31c1cab 100644 --- a/astrbot/core/provider/sources/dashscope_voice_clone_tts.py +++ b/astrbot/core/provider/sources/dashscope_voice_clone_tts.py @@ -96,10 +96,7 @@ async def get_audio(self, text: str) -> str: # 每次调用前确保 dashscope 全局配置使用本提供商指定的值。 # 避免多 TTS 共存时被其它提供商覆盖。 - dashscope.api_key = self.chosen_api_key - resolved_base_url = self._resolve_base_url() - if resolved_base_url: - dashscope.base_http_api_url = resolved_base_url + # 每次调用时通过 kwargs 动态传入 API Key 和 Base URL,无需修改全局配置 audio_bytes = await self._synthesize(model, text) if not audio_bytes: