diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 2d5797f983..d8b16aa1dc 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -1838,6 +1838,7 @@ "embedding_api_base": "", "embedding_model": "", "embedding_dimensions": 1024, + "max_batch_size": 100, "timeout": 20, "proxy": "", }, @@ -1852,6 +1853,7 @@ "embedding_api_base": "", "embedding_model": "gemini-embedding-exp-03-07", "embedding_dimensions": 768, + "max_batch_size": 100, "timeout": 20, "proxy": "", }, @@ -1867,6 +1869,7 @@ "embedding_model": "nvidia/llama-nemotron-embed-1b-v2", "input_type": "passage", "embedding_dimensions": 1024, + "max_batch_size": 100, "timeout": 20, "proxy": "", }, @@ -1880,9 +1883,25 @@ "embedding_api_base": "http://localhost:11434", "embedding_model": "nomic-embed-text", "embedding_dimensions": 768, + "max_batch_size": 100, "timeout": 60, "proxy": "", }, + "阿里云百炼 Embedding": { + "id": "bailian_embedding", + "type": "openai_embedding", + "provider": "bailian", + "provider_type": "embedding", + "hint": "provider_group.provider.bailian_embedding.hint", + "enable": True, + "embedding_api_key": "", + "embedding_api_base": "https://dashscope.aliyuncs.com/compatible-mode/v1", + "embedding_model": "text-embedding-v4", + "embedding_dimensions": 1024, + "max_batch_size": 10, + "timeout": 30, + "proxy": "", + }, "vLLM Rerank": { "id": "vllm_rerank", "type": "vllm_rerank", @@ -2210,6 +2229,11 @@ "hint": "嵌入向量的维度。根据模型不同,可能需要调整,请参考具体模型的文档。此配置项请务必填写正确,否则将导致向量数据库无法正常工作。", "_special": "get_embedding_dim", }, + "max_batch_size": { + "description": "最大批次大小", + "type": "int", + "hint": "每批向嵌入 API 发送的文本数量上限。各服务商限制不同:DashScope/阿里云百炼限制为 10;OpenAI 支持 2048;Ollama/Gemini/NVIDIA 通常可设为 100-2048。超过限制会触发 API 400 错误。", + }, "embedding_model": { "description": "嵌入模型", "type": "string", diff --git a/astrbot/core/knowledge_base/kb_helper.py b/astrbot/core/knowledge_base/kb_helper.py index c29e45876d..d5151b4f74 100644 --- 
a/astrbot/core/knowledge_base/kb_helper.py +++ b/astrbot/core/knowledge_base/kb_helper.py @@ -365,7 +365,10 @@ async def upload_document( contents = [] metadatas = [] for idx, chunk_text in enumerate(chunks_text): - contents.append(chunk_text) + # Replace lone surrogates that break UTF-8 encoding (e.g., broken + # emoji codepoints from PDF text extraction). + sanitized = chunk_text.encode("utf-8", errors="replace").decode("utf-8") + contents.append(sanitized) metadatas.append( { "kb_id": self.kb.kb_id, diff --git a/astrbot/core/provider/provider.py b/astrbot/core/provider/provider.py index 0cc9f1ca1c..8ce7b0542b 100644 --- a/astrbot/core/provider/provider.py +++ b/astrbot/core/provider/provider.py @@ -4,6 +4,7 @@ from collections.abc import AsyncGenerator from typing import Literal, TypeAlias, Union +from astrbot import logger from astrbot.core.agent.message import ContentPart, Message, is_checkpoint_message from astrbot.core.agent.tool import ToolSet from astrbot.core.provider.entities import ( @@ -338,6 +339,21 @@ def get_dim(self) -> int: """获取向量的维度""" ... + @property + def max_batch_size(self) -> int: + """Maximum batch size per single embedding API call. + + Providers may set ``max_batch_size`` in their config to override this + value. For example, DashScope / Alibaba Cloud Bailian requires 10. + + The default of 100 is safe for most providers (OpenAI supports up to + 2048, Ollama and Gemini also handle large batches without issues). + + Returns: + The maximum number of texts per batch. + """ + return int(self.provider_config.get("max_batch_size", 100)) + async def test(self) -> None: await self.get_embedding("astrbot") @@ -362,6 +378,16 @@ async def get_embeddings_batch( 向量列表 """ + # Respect the provider's maximum batch size limit. 
+ if batch_size > self.max_batch_size: + logger.debug( + "Batch size %d exceeds provider limit %d, capping to %d.", + batch_size, + self.max_batch_size, + self.max_batch_size, + ) + batch_size = self.max_batch_size + semaphore = asyncio.Semaphore(tasks_limit) all_embeddings: list[list[float]] = [] failed_batches: list[tuple[int, list[str]]] = [] diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json index dad5a53a25..98b129cd44 100644 --- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json +++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json @@ -1386,6 +1386,9 @@ "openai_embedding": { "hint": "If testing fails, try adding /v1 at the end for some OpenAI API versions." }, + "bailian_embedding": { + "hint": "Alibaba Cloud Bailian Embedding uses an OpenAI-compatible API format. Get your API Key from https://bailian.console.aliyun.com/?tab=model#/api-key. The recommended embedding model is text-embedding-v4 (1024 dimensions), with a batch limit of 10." + }, "gemini_embedding": { "hint": "Gemini Embedding does not require manually adding /v1beta." 
}, diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json index bcfb4e20dc..085b4d4828 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json +++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json @@ -1388,6 +1388,9 @@ "openai_embedding": { "hint": "如果测试不通过,可以尝试添加 /v1 在末尾以兼容部分 OpenAI API 版本。" }, + "bailian_embedding": { + "hint": "阿里云百炼 Embedding 使用兼容 OpenAI 的 API 格式。API Key 从 https://bailian.console.aliyun.com/?tab=model#/api-key 获取。嵌入模型推荐 text-embedding-v4(1024 维),批次上限 10。" + }, "gemini_embedding": { "hint": "Gemini Embedding 无需手动添加 /v1beta。" }, diff --git a/dashboard/src/views/knowledge-base/components/DocumentsTab.vue b/dashboard/src/views/knowledge-base/components/DocumentsTab.vue index 29a49d0da7..a60e14974b 100644 --- a/dashboard/src/views/knowledge-base/components/DocumentsTab.vue +++ b/dashboard/src/views/knowledge-base/components/DocumentsTab.vue @@ -173,7 +173,7 @@