diff --git a/docs/features/openai-compatible-video-generation/plan.md b/docs/features/openai-compatible-video-generation/plan.md new file mode 100644 index 000000000..37b2c5b4f --- /dev/null +++ b/docs/features/openai-compatible-video-generation/plan.md @@ -0,0 +1,42 @@ +# Plan + +## Approach +Treat video generation as a first-class model capability parallel to image generation and TTS: +- Extend shared model/type enums and model-db parsing to include `videoGeneration`. +- Add a shared video compatibility helper that can recover video intent from model metadata, endpoint hints, modalities, or known model ID patterns when upstream data is incomplete. +- Add an OpenAI-compatible video runtime path that sends requests to `/v1/videos`, normalizes provider responses, and emits media output into the assistant stream. +- Reuse the current assistant media block transport by carrying video payloads through the existing message block structure with video MIME detection on the renderer side. + +## Affected Areas +- Shared types/contracts: + - `src/shared/model.ts` + - `src/shared/types/model-db.ts` + - `src/shared/types/presenters/llmprovider.presenter.d.ts` + - `src/shared/types/presenters/legacy.presenters.d.ts` + - `src/shared/videoGenerationSettings.ts` (new) +- Main runtime/provider: + - `src/main/presenter/configPresenter/index.ts` + - `src/main/presenter/configPresenter/modelConfig.ts` + - `src/main/presenter/llmProviderPresenter/index.ts` + - `src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts` + - `src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts` +- Renderer: + - `src/renderer/src/composables/useModelTypeDetection.ts` + - `src/renderer/src/components/chat/messageListItems.ts` + - `src/renderer/src/components/message/MessageItemAssistant.vue` + - `src/renderer/src/components/message/MessageBlockVideo.vue` (new) + - `src/renderer/settings/components/ProviderModelList.vue` +- Model DB: + - `resources/model-db/providers.json` + +## Compatibility 
+- Existing text, image, and TTS paths remain unchanged. +- Existing assistant block persistence remains compatible by reusing the current media payload field rather than changing the storage shape. +- Future video models can plug in through shared detection helpers or explicit `videoGeneration` metadata. + +## Verification Strategy +Run: +- `pnpm run typecheck` +- `pnpm run format` +- `pnpm run i18n` +- `pnpm run lint` diff --git a/docs/features/openai-compatible-video-generation/spec.md b/docs/features/openai-compatible-video-generation/spec.md new file mode 100644 index 000000000..66550c902 --- /dev/null +++ b/docs/features/openai-compatible-video-generation/spec.md @@ -0,0 +1,32 @@ +# OpenAI-Compatible Video Generation + +## User Need +Users need DeepChat to recognize and run video generation models such as `doubao-seedance-2-0-fast-260128` through the same model-driven provider flow used by text and audio generation, without hardcoding one-off provider logic for each future video model. + +## Goal +Enable first-class video generation routing in DeepChat for OpenAI-compatible providers, starting with AIHubMix Seedance models and leaving a compatibility layer for future video models. + +## Acceptance Criteria +1. Shared model/type contracts support `videoGeneration` and preserve compatibility with existing model metadata. +2. DeepChat can recognize `doubao-seedance-2-0-fast-260128` as a video generation model even when upstream metadata is incomplete or still marked as `chat`. +3. Main runtime can route video generation requests through an OpenAI-compatible `/v1/videos` flow. +4. Video generation responses are normalized into a stable internal result shape that future providers/models can reuse. +5. Generated video output reaches the existing assistant message pipeline and renders in the chat UI. +6. 
Validation commands pass: +- `pnpm run typecheck` +- `pnpm run format` +- `pnpm run i18n` +- `pnpm run lint` + +## Constraints +- Keep the provider integration generic for OpenAI-compatible video endpoints. +- Reuse the current assistant media block pipeline where practical instead of introducing a parallel storage format. +- Keep advanced video editing controls and provider-specific parameter UIs out of scope for this change. + +## Non-Goals +- Dedicated video generation settings panels. +- Agent-level video generation tool configuration. +- Non-OpenAI-compatible video provider protocols. + +## Open Questions +- None for current scope. diff --git a/docs/features/openai-compatible-video-generation/tasks.md b/docs/features/openai-compatible-video-generation/tasks.md new file mode 100644 index 000000000..d27f8ab8f --- /dev/null +++ b/docs/features/openai-compatible-video-generation/tasks.md @@ -0,0 +1,25 @@ +# Tasks + +## Shared Types + Detection +- [x] Add `ModelType.VideoGeneration` and extend model-db parsing/schema for `videoGeneration`. +- [x] Add shared video detection/compatibility helpers for endpoint hints, modalities, and known model IDs. +- [x] Update model config inference to classify video models consistently in main and renderer flows. +- [x] Extend session generation settings/contracts and draft state to carry `videoGeneration` options. + +## Runtime + Provider +- [x] Add `generateVideoStandalone` presenter contracts and implementation. +- [x] Add OpenAI-compatible `/v1/videos` request/response normalization in the AI SDK runtime/provider path. +- [x] Persist and sanitize session-level video generation settings through agent runtime and sqlite storage. +- [ ] Mark Seedance built-in model metadata as `videoGeneration` where available. + +## Renderer +- [x] Expose video model detection for UI behavior alignment. +- [x] Add assistant message rendering for generated video media. +- [x] Update model list/type display for video generation models.
+- [x] Expose video generation settings in chat status bar and model config dialog flows. + +## Validation +- [x] Run `pnpm run typecheck`. +- [x] Run `pnpm run format`. +- [x] Run `pnpm run i18n`. +- [x] Run `pnpm run lint`. diff --git a/docs/issues/merge-dev-into-gen-video/plan.md b/docs/issues/merge-dev-into-gen-video/plan.md new file mode 100644 index 000000000..caf4ddd68 --- /dev/null +++ b/docs/issues/merge-dev-into-gen-video/plan.md @@ -0,0 +1,20 @@ +# Plan + +## Scope +将 `origin/dev` 合并到当前 `gen-video` 分支,识别并解决冲突文件,保留双方必要改动,并执行仓库要求的基础校验。 + +## Implementation decisions +- 先 `git fetch origin dev`,再执行 `git merge origin/dev` 以基于最新远端 `dev` 合并。 +- 冲突解决前先阅读每个冲突文件的上下文,按文件现有模式做最小修改。 +- 若冲突涉及文档或配置,同样遵循最小差异原则,不借机整理无关内容。 +- 合并完成后执行仓库要求的 `pnpm run format`、`pnpm run i18n`、`pnpm run lint`。若命令失败,记录失败点并告知用户。 + +## Risks and mitigations +- 风险:冲突文件较多且分散,容易误删一侧逻辑。 + - 缓解:逐文件阅读冲突块上下文后再编辑,并在完成后检查 diff。 +- 风险:格式化或 lint 暴露既有问题,影响本次验证。 + - 缓解:优先区分新引入问题与仓库既有问题,向用户明确说明。 + +## Test strategy +- 使用 `git status` 确认冲突已清除。 +- 使用格式化、i18n、lint 命令验证合并后仓库状态。 diff --git a/docs/issues/merge-dev-into-gen-video/spec.md b/docs/issues/merge-dev-into-gen-video/spec.md new file mode 100644 index 000000000..825557004 --- /dev/null +++ b/docs/issues/merge-dev-into-gen-video/spec.md @@ -0,0 +1,23 @@ +# Merge dev into gen-video + +## User stories +- 作为 `gen-video` 分支开发者,我需要合并最新 `dev` 变更到当前分支,以便继续在最新主线基础上开发。 +- 作为评审者,我需要本次冲突解决范围清晰、仅限必要文件,并保留两侧已完成的有效修改。 + +## Acceptance criteria +- 当前分支成功合并 `origin/dev`,不存在未解决的 merge conflict。 +- 冲突文件采用最小变更原则解决,不引入与本次合并无关的重构。 +- 合并后工作区状态可继续提交,且相关校验命令已执行并记录结果。 + +## Non-goals +- 不在本次任务中实现新的产品功能。 +- 不主动修改与冲突无关的历史代码风格。 +- 不提交 commit,除非用户额外要求。 + +## Constraints +- 仅处理 `dev` 合并到当前 `gen-video` 分支产生的冲突。 +- 遵循仓库现有 SDD、格式化、i18n、lint 规范。 +- 如需保留双方逻辑,优先基于现有实现做兼容合并,而非重写。 + +## Open questions +- 无 diff --git a/docs/issues/merge-dev-into-gen-video/tasks.md b/docs/issues/merge-dev-into-gen-video/tasks.md new file mode 100644 index 000000000..f234a30a5 --- /dev/null +++ 
b/docs/issues/merge-dev-into-gen-video/tasks.md @@ -0,0 +1,8 @@ +# Tasks + +1. 获取最新 `origin/dev` 并确认当前分支状态。 +2. 创建本次合并的 SDD 文档并记录范围、约束、验证方式。 +3. 执行 `git merge origin/dev`,定位所有冲突文件。 +4. 阅读冲突文件上下文,逐个解决冲突并保留必要改动。 +5. 运行 `pnpm run format`、`pnpm run i18n`、`pnpm run lint`。 +6. 汇总结果与后续建议,等待用户决定是否提交。 diff --git a/docs/issues/openai-compatible-video-prompt-duration-fallback/plan.md b/docs/issues/openai-compatible-video-prompt-duration-fallback/plan.md new file mode 100644 index 000000000..2f7ed4812 --- /dev/null +++ b/docs/issues/openai-compatible-video-prompt-duration-fallback/plan.md @@ -0,0 +1,21 @@ +# Plan + +## Approach +Add a small runtime helper that extracts an integer duration from obvious prompt hints only when no structured video duration (`duration` or `seconds`) is configured and the parsed value is supported by the active model, then reuse that helper for both request tracing and the actual `/videos` request body. + +## Implementation +- Add a focused runtime test that exercises the OpenAI-compatible `/videos` flow and asserts `duration: 2` is sent for prompts like `... 2s`. +- Add a conservative prompt-duration extractor for `Ns`, `N sec`, `N seconds`, and `N秒`. +- Enforce model-specific validity before injecting the derived duration (for Seedance, 4–15 seconds inclusive). +- Apply the fallback only when `videoGeneration.duration` and `videoGeneration.seconds` are both unset. + +## Affected Files +- `src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts` +- `test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts` +- `docs/issues/openai-compatible-video-prompt-duration-fallback/tasks.md` + +## Validation +- Focused AI SDK runtime tests for video request bodies.
+- `pnpm run format` +- `pnpm run i18n` +- `pnpm run lint` diff --git a/docs/issues/openai-compatible-video-prompt-duration-fallback/spec.md b/docs/issues/openai-compatible-video-prompt-duration-fallback/spec.md new file mode 100644 index 000000000..04f28d04b --- /dev/null +++ b/docs/issues/openai-compatible-video-prompt-duration-fallback/spec.md @@ -0,0 +1,25 @@ +# OpenAI-Compatible Video Prompt Duration Fallback + +## User Need +When users send prompts such as `生成 马斯克 喝酒的视频 2s` to OpenAI-compatible video models, DeepChat should turn the obvious duration hint into a structured `duration` request parameter instead of sending only the raw prompt body. + +## Goal +Infer an explicit video duration from clear prompt hints like `5s` or `5秒` when the session has no structured video duration configured and the parsed value is valid for the target model. + +## Acceptance Criteria +1. OpenAI-compatible video requests derive `duration` from obvious prompt hints when neither `duration` nor `seconds` is already configured and the parsed value is valid for the current model. +2. Explicit structured video settings still take precedence over any prompt-derived fallback. +3. The emitted request trace matches the actual `/videos` body for this fallback. +4. Focused validation passes for the touched runtime slice. + +## Constraints +- Keep the fallback narrow and conservative; do not attempt broad natural-language parameter parsing. +- Preserve existing request-shape compatibility and polling behavior. + +## Non-Goals +- Adding or changing video settings UI. +- Parsing arbitrary style, ratio, or resolution hints from prompts. +- Changing provider safety or moderation behavior. + +## Open Questions +- None.
diff --git a/docs/issues/openai-compatible-video-prompt-duration-fallback/tasks.md b/docs/issues/openai-compatible-video-prompt-duration-fallback/tasks.md new file mode 100644 index 000000000..bed18d1c6 --- /dev/null +++ b/docs/issues/openai-compatible-video-prompt-duration-fallback/tasks.md @@ -0,0 +1,11 @@ +# Tasks + +## Runtime Fallback +- [x] Add a runtime regression test for prompt-derived video duration. +- [x] Apply a conservative prompt duration fallback before building `/videos` requests. + +## Validation +- [x] Run focused AI SDK runtime tests. +- [x] Run `pnpm run format`. +- [x] Run `pnpm run i18n`. +- [x] Run `pnpm run lint`. diff --git a/src/main/presenter/agentRuntimePresenter/index.ts b/src/main/presenter/agentRuntimePresenter/index.ts index 87eb6c67c..473f8bd01 100644 --- a/src/main/presenter/agentRuntimePresenter/index.ts +++ b/src/main/presenter/agentRuntimePresenter/index.ts @@ -59,6 +59,11 @@ import { } from '@shared/imageGenerationSettings' import { ApiEndpointType, ModelType, isDeepSeekSeriesModelId } from '@shared/model' import { isTtsModelConfig, isTtsModelId } from '@shared/ttsSettings' +import { + isVideoGenerationModelConfig, + normalizeVideoGenerationOptions, + supportsOpenAICompatibleVideoGeneration +} from '@shared/videoGenerationSettings' import { nanoid } from 'nanoid' import type { SQLitePresenter } from '../sqlitePresenter' import { eventBus, SendTarget } from '@/eventbus' @@ -1434,7 +1439,8 @@ export class AgentRuntimePresenter implements IAgentImplementation { private shouldUseDeepChatContextBudget( providerId?: string | null, - modelConfig?: Pick | null + modelConfig?: Pick | null, + modelId?: string | null ): boolean { if (providerId?.trim() === 'acp') { return false @@ -1456,22 +1462,28 @@ export class AgentRuntimePresenter implements IAgentImplementation { return false } + if (isVideoGenerationModelConfig(modelConfig, modelId?.trim() || '')) { + return false + } + return true } private shouldBypassDeepChatContextBudget( 
providerId?: string | null, - modelConfig?: Pick | null + modelConfig?: Pick | null, + modelId?: string | null ): boolean { - return !this.shouldUseDeepChatContextBudget(providerId, modelConfig) + return !this.shouldUseDeepChatContextBudget(providerId, modelConfig, modelId) } private resolveDeepChatContextBudgetLength( providerId: string | null | undefined, contextLength: number, - modelConfig?: Pick | null + modelConfig?: Pick | null, + modelId?: string | null ): number { - return this.shouldBypassDeepChatContextBudget(providerId, modelConfig) + return this.shouldBypassDeepChatContextBudget(providerId, modelConfig, modelId) ? Number.MAX_SAFE_INTEGER : contextLength } @@ -1655,7 +1667,7 @@ export class AgentRuntimePresenter implements IAgentImplementation { throw new Error(`Session ${sessionId} not found`) } const modelConfig = this.configPresenter.getModelConfig(state.modelId, state.providerId) - if (this.shouldBypassDeepChatContextBudget(state.providerId, modelConfig)) { + if (this.shouldBypassDeepChatContextBudget(state.providerId, modelConfig, state.modelId)) { throw new Error('Manual compaction is only available for DeepChat agent sessions.') } if (state.status !== 'idle') { @@ -1676,7 +1688,8 @@ export class AgentRuntimePresenter implements IAgentImplementation { const contextBudgetLength = this.resolveDeepChatContextBudgetLength( state.providerId, generationSettings.contextLength, - modelConfig + modelConfig, + state.modelId ) const maxTokens = capAgentRequestMaxTokens(generationSettings.maxTokens, contextBudgetLength) const activeSkillNames = await this.resolveActiveSkillNamesForToolProfile(sessionId) @@ -1898,7 +1911,8 @@ export class AgentRuntimePresenter implements IAgentImplementation { const contextBudgetLength = this.resolveDeepChatContextBudgetLength( state.providerId, generationSettings.contextLength, - baseModelConfig + baseModelConfig, + state.modelId ) const capabilityProviderId = this.resolveCapabilityProviderId(state.providerId, state.modelId) 
const reasoningPortrait = this.getReasoningPortrait(state.providerId, state.modelId) @@ -1913,6 +1927,7 @@ export class AgentRuntimePresenter implements IAgentImplementation { reasoningVisibility: generationSettings.reasoningVisibility, verbosity: generationSettings.verbosity, imageGeneration: generationSettings.imageGeneration, + videoGeneration: generationSettings.videoGeneration, reasoning: getReasoningEffectiveEnabledForProvider(capabilityProviderId, reasoningPortrait, { reasoning: baseModelConfig.reasoning, reasoningEffort: generationSettings.reasoningEffort ?? baseModelConfig.reasoningEffort @@ -2601,7 +2616,8 @@ export class AgentRuntimePresenter implements IAgentImplementation { const contextBudgetLength = this.resolveDeepChatContextBudgetLength( state.providerId, generationSettings.contextLength, - modelConfig + modelConfig, + state.modelId ) const maxTokens = capAgentRequestMaxTokens(generationSettings.maxTokens, contextBudgetLength) const projectDir = this.resolveProjectDir(sessionId) @@ -3435,6 +3451,22 @@ export class AgentRuntimePresenter implements IAgentImplementation { } } + if ( + supportsOpenAICompatibleVideoGeneration({ + providerId, + providerApiType: this.resolveProviderApiType(providerId), + modelId, + apiEndpoint: modelConfig.apiEndpoint, + endpointType: modelConfig.endpointType, + type: modelConfig.type + }) + ) { + const videoGeneration = normalizeVideoGenerationOptions(modelConfig.videoGeneration) + if (videoGeneration) { + defaults.videoGeneration = videoGeneration + } + } + const supportsReasoning = this.configPresenter.supportsReasoningCapability?.(providerId, modelId) === true if (supportsReasoning) { @@ -3679,6 +3711,35 @@ export class AgentRuntimePresenter implements IAgentImplementation { delete next.imageGeneration } + if ( + supportsOpenAICompatibleVideoGeneration({ + providerId, + providerApiType: this.resolveProviderApiType(providerId), + modelId, + apiEndpoint: modelConfig.apiEndpoint, + endpointType: modelConfig.endpointType, 
+ type: modelConfig.type + }) + ) { + if (Object.prototype.hasOwnProperty.call(patch, 'videoGeneration')) { + const videoGeneration = normalizeVideoGenerationOptions(patch.videoGeneration) + if (videoGeneration) { + next.videoGeneration = videoGeneration + } else { + delete next.videoGeneration + } + } else { + const videoGeneration = normalizeVideoGenerationOptions(next.videoGeneration) + if (videoGeneration) { + next.videoGeneration = videoGeneration + } else { + delete next.videoGeneration + } + } + } else { + delete next.videoGeneration + } + if (fixedTemperatureKimi) { next.temperature = fixedTemperatureKimi.temperature } diff --git a/src/main/presenter/configPresenter/index.ts b/src/main/presenter/configPresenter/index.ts index b06bde6ec..35512639e 100644 --- a/src/main/presenter/configPresenter/index.ts +++ b/src/main/presenter/configPresenter/index.ts @@ -27,6 +27,7 @@ import { resolveProviderCapabilityProviderId, type NewApiEndpointType } from '@shared/model' +import { resolveVideoGenerationCompatType } from '@shared/videoGenerationSettings' import { DEFAULT_MODEL_CAPABILITY_FALLBACKS, resolveDerivedModelMaxTokens, @@ -973,6 +974,15 @@ export class ConfigPresenter implements IConfigPresenter { } private inferProviderDbModelType(model: ProviderModel): ModelType { + const videoGenerationType = resolveVideoGenerationCompatType({ + modelId: model.id, + type: model.type, + modalities: model.modalities + }) + if (videoGenerationType) { + return videoGenerationType + } + if (Array.isArray(model.modalities?.output) && model.modalities.output.includes('image')) { return ModelType.ImageGeneration } @@ -984,6 +994,8 @@ export class ConfigPresenter implements IConfigPresenter { return ModelType.Rerank case 'imageGeneration': return ModelType.ImageGeneration + case 'videoGeneration': + return ModelType.VideoGeneration case 'tts': return ModelType.TTS case 'chat': diff --git a/src/main/presenter/configPresenter/modelConfig.ts 
b/src/main/presenter/configPresenter/modelConfig.ts index c6fe4fe04..fcd2c709c 100644 --- a/src/main/presenter/configPresenter/modelConfig.ts +++ b/src/main/presenter/configPresenter/modelConfig.ts @@ -13,6 +13,7 @@ import { resolveModelFunctionCall } from '@shared/modelConfigDefaults' import { applyMoonshotKimiReasoningTemperaturePolicy } from '@shared/moonshotKimiPolicy' +import { resolveVideoGenerationCompatType } from '@shared/videoGenerationSettings' import ElectronStore from 'electron-store' import { providerDbLoader } from './providerDbLoader' import { @@ -105,6 +106,15 @@ export class ModelConfigHelper { * Priority: 1. modalities.output includes image 2. model.type (from provider.json) 3. default Chat */ private inferModelType(model: ProviderModel): ModelType { + const videoGenerationType = resolveVideoGenerationCompatType({ + modelId: model.id, + type: model.type, + modalities: model.modalities + }) + if (videoGenerationType) { + return videoGenerationType + } + // Priority 1: Output modality indicates image generation if (Array.isArray(model.modalities?.output) && model.modalities.output.includes('image')) { return ModelType.ImageGeneration @@ -121,6 +131,8 @@ export class ModelConfigHelper { return ModelType.Rerank case 'imageGeneration': return ModelType.ImageGeneration + case 'videoGeneration': + return ModelType.VideoGeneration case 'tts': return ModelType.TTS default: @@ -180,9 +192,11 @@ export class ModelConfigHelper { apiEndpoint: modelType === ModelType.ImageGeneration ? ApiEndpointType.Image - : modelType === ModelType.TTS - ? ApiEndpointType.AudioSpeech - : ApiEndpointType.Chat, + : modelType === ModelType.VideoGeneration + ? ApiEndpointType.Video + : modelType === ModelType.TTS + ? 
ApiEndpointType.AudioSpeech + : ApiEndpointType.Chat, thinkingBudget, forceInterleavedThinkingCompat, reasoningEffort, diff --git a/src/main/presenter/configPresenter/providerModelHelper.ts b/src/main/presenter/configPresenter/providerModelHelper.ts index e71a2833d..6ded4e17d 100644 --- a/src/main/presenter/configPresenter/providerModelHelper.ts +++ b/src/main/presenter/configPresenter/providerModelHelper.ts @@ -2,6 +2,7 @@ import { eventBus, SendTarget } from '@/eventbus' import { CONFIG_EVENTS } from '@/events' import { ModelConfig, MODEL_META } from '@shared/presenter' import { ModelType } from '@shared/model' +import { resolveVideoGenerationCompatType } from '@shared/videoGenerationSettings' import ElectronStore from 'electron-store' import path from 'path' import type { StoreLike } from './storeLike' @@ -143,16 +144,30 @@ export class ProviderModelHelper { normalizedModel.reasoning !== undefined ? normalizedModel.reasoning : config.reasoning || false - normalizedModel.type = - normalizedModel.type !== undefined ? normalizedModel.type : config.type || ModelType.Chat normalizedModel.endpointType = config.endpointType ?? normalizedModel.endpointType + normalizedModel.type = + resolveVideoGenerationCompatType({ + modelId: normalizedModel.id, + type: config.type ?? normalizedModel.type, + apiEndpoint: config.apiEndpoint, + endpointType: normalizedModel.endpointType, + supportedEndpointTypes: normalizedModel.supportedEndpointTypes + }) ?? + (normalizedModel.type !== undefined ? 
normalizedModel.type : config.type || ModelType.Chat) return normalizedModel } normalizedModel.vision = normalizedModel.vision || false normalizedModel.functionCall = normalizedModel.functionCall || false normalizedModel.reasoning = normalizedModel.reasoning || false - normalizedModel.type = normalizedModel.type || ModelType.Chat + normalizedModel.type = + resolveVideoGenerationCompatType({ + modelId: normalizedModel.id, + type: normalizedModel.type, + endpointType: normalizedModel.endpointType, + supportedEndpointTypes: normalizedModel.supportedEndpointTypes + }) ?? + (normalizedModel.type || ModelType.Chat) return normalizedModel } diff --git a/src/main/presenter/index.ts b/src/main/presenter/index.ts index dfb631d9d..5c3a65308 100644 --- a/src/main/presenter/index.ts +++ b/src/main/presenter/index.ts @@ -388,6 +388,14 @@ export class Presenter implements IPresenter { modelId, imageOptions, options + ), + generateVideoStandalone: (providerId, prompt, modelId, videoOptions, options) => + this.llmproviderPresenter.generateVideoStandalone( + providerId, + prompt, + modelId, + videoOptions, + options ) }), cacheImage: (data) => this.devicePresenter.cacheImage(data), diff --git a/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts b/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts index 56cff686e..bf5cf959b 100644 --- a/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts +++ b/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts @@ -19,6 +19,13 @@ import { supportsOpenAIImageGenerationSettings, type ImageGenerationOptions } from '@shared/imageGenerationSettings' +import { + isVideoGenerationModelConfig, + normalizeVideoGenerationOptions, + resolveOpenAICompatibleVideoRequestBodyShape, + type VideoGenerationOptions, + type VideoGenerationReference +} from '@shared/videoGenerationSettings' import { isChatAudioTtsModel, isGeminiGenerateContentTtsModel, @@ -48,9 +55,40 @@ type ImageGenerationRequestOptions = { providerOptions?: Record } +type 
VideoGenerationRequestBody = { + model: string + prompt: string + seconds?: string + size?: string + input_reference?: string | { mime_type?: string; data: string } + content?: Array> + ratio?: string + duration?: number + resolution?: string + watermark?: boolean + generate_audio?: boolean + extra_body?: Record +} + +type VideoGenerationTaskResponse = { + id?: string + status?: string + url?: string | null + error?: + | string + | { + message?: string + } + | null +} + const DEFAULT_GEMINI_TTS_VOICE = 'Kore' const DEFAULT_GEMINI_PCM_SAMPLE_RATE = 24000 const DEFAULT_GEMINI_PCM_BITS_PER_SAMPLE = 16 +const VIDEO_GENERATION_POLL_INTERVAL_MS = 3000 +const PROMPT_VIDEO_DURATION_EN_PATTERN = + /(^|[^0-9a-z])(?\d{1,2})\s*(?:s|sec|secs|second|seconds)\b/i +const PROMPT_VIDEO_DURATION_ZH_PATTERN = /(?\d{1,2})\s*秒/u export interface AiSdkRuntimeContext { providerKind: AiSdkProviderKind @@ -71,6 +109,7 @@ export interface AiSdkRuntimeContext { cleanHeaders?: boolean supportsNativeTools?: (modelId: string, modelConfig: ModelConfig) => boolean shouldUseImageGeneration?: (modelId: string, modelConfig: ModelConfig) => boolean + shouldUseVideoGeneration?: (modelId: string, modelConfig: ModelConfig) => boolean shouldUseTts?: (modelId: string, modelConfig: ModelConfig) => boolean } @@ -146,6 +185,63 @@ function normalizePromptValue(value: unknown): string { return '' } +function supportsPromptDerivedVideoDuration(modelId: string, duration: number): boolean { + const normalizedModelId = modelId.trim().toLowerCase() + + if (normalizedModelId.startsWith('doubao-seedance-')) { + return duration >= 4 && duration <= 15 + } + + return true +} + +function resolvePromptVideoDuration(prompt: string, modelId: string): number | undefined { + const normalizedPrompt = prompt.trim() + if (!normalizedPrompt) { + return undefined + } + + const matchedDuration = + normalizedPrompt.match(PROMPT_VIDEO_DURATION_EN_PATTERN)?.groups?.duration || + 
normalizedPrompt.match(PROMPT_VIDEO_DURATION_ZH_PATTERN)?.groups?.duration + + if (!matchedDuration) { + return undefined + } + + const parsed = Number.parseInt(matchedDuration, 10) + if (!Number.isFinite(parsed) || parsed <= 0) { + return undefined + } + + return supportsPromptDerivedVideoDuration(modelId, parsed) ? parsed : undefined +} + +function resolveVideoGenerationRequestOptions( + prompt: string, + modelId: string, + options: VideoGenerationOptions | undefined +): VideoGenerationOptions | undefined { + const normalizedOptions = normalizeVideoGenerationOptions(options) + + if ( + typeof normalizedOptions?.duration === 'number' || + (typeof normalizedOptions?.seconds === 'string' && normalizedOptions.seconds.trim().length > 0) + ) { + return normalizedOptions + } + + const promptDuration = resolvePromptVideoDuration(prompt, modelId) + if (promptDuration === undefined) { + return normalizedOptions + } + + return normalizeVideoGenerationOptions({ + ...normalizedOptions, + duration: promptDuration + }) +} + function extractImagePrompt(messages: ChatMessage[]): string { return messages .map((message) => (message.role === 'user' ? 
normalizePromptValue(message.content) : '')) @@ -153,6 +249,10 @@ function extractImagePrompt(messages: ChatMessage[]): string { .join('\n\n') } +function extractVideoPrompt(messages: ChatMessage[]): string { + return extractImagePrompt(messages) +} + function resolveSupportsNativeTools( context: AiSdkRuntimeContext, modelId: string, @@ -177,6 +277,21 @@ function shouldUseImageGenerationRuntime( return modelConfig.apiEndpoint === ApiEndpointType.Image } +function shouldUseVideoGenerationRuntime( + context: AiSdkRuntimeContext, + modelId: string, + modelConfig: ModelConfig +): boolean { + if (context.shouldUseVideoGeneration) { + return context.shouldUseVideoGeneration(modelId, modelConfig) + } + + return ( + modelConfig.apiEndpoint === ApiEndpointType.Video || + isVideoGenerationModelConfig(modelConfig, modelId) + ) +} + function shouldUseTtsRuntime( context: AiSdkRuntimeContext, modelId: string, @@ -626,6 +741,355 @@ function resolveRuntimeTemperature( } } +function normalizeOpenAICompatibleBaseUrl(baseUrl: string | undefined): string { + const normalized = (baseUrl || 'https://api.openai.com/v1').trim().replace(/\/+$/, '') + if (!normalized) { + return 'https://api.openai.com/v1' + } + + return /\/v1(?:beta\d+)?$/i.test(normalized) ? normalized : `${normalized}/v1` +} + +function normalizeVideoReferenceDataUrl(reference: VideoGenerationReference): string | undefined { + if (reference.url?.trim()) { + return reference.url.trim() + } + + if (!reference.data?.trim()) { + return undefined + } + + const normalizedData = reference.data.trim() + if (normalizedData.startsWith('data:')) { + return normalizedData + } + + const fallbackMimeType = + reference.mimeType?.trim() || + (reference.type === 'image' + ? 'image/png' + : reference.type === 'audio' + ? 
'audio/mpeg' + : 'video/mp4') + + return `data:${fallbackMimeType};base64,${normalizedData}` +} + +function buildVideoGenerationContent( + options: VideoGenerationOptions | undefined +): Array> | undefined { + if (!options) { + return undefined + } + + const content: Record[] = [] + + for (const reference of options.references ?? []) { + const url = normalizeVideoReferenceDataUrl(reference) + if (!url) { + continue + } + + if (reference.type === 'image') { + content.push({ + type: 'image_url', + image_url: { url }, + role: 'reference_image' + }) + continue + } + + if (reference.type === 'audio') { + content.push({ + type: 'audio_url', + audio_url: { url }, + role: 'reference_audio' + }) + continue + } + + content.push({ + type: 'video_url', + video_url: { url }, + role: 'reference_video' + }) + } + + return content.length > 0 ? content : undefined +} + +function buildVideoGenerationExtraBody( + options: VideoGenerationOptions | undefined +): Record | undefined { + if (!options) { + return undefined + } + + const extraBody: Record = {} + + if (typeof options.duration === 'number' && Number.isFinite(options.duration)) { + extraBody.duration = options.duration + } + if (typeof options.ratio === 'string' && options.ratio.trim()) { + extraBody.ratio = options.ratio.trim() + } + if (typeof options.resolution === 'string' && options.resolution.trim()) { + extraBody.resolution = options.resolution.trim() + } + if (typeof options.watermark === 'boolean') { + extraBody.watermark = options.watermark + } + if (typeof options.generateAudio === 'boolean') { + extraBody.generate_audio = options.generateAudio + } + + const content = buildVideoGenerationContent(options) + if (content) { + extraBody.content = content + } + + return Object.keys(extraBody).length > 0 ? 
extraBody : undefined +} + +function resolveFlatTopLevelVideoDuration( + options: VideoGenerationOptions | undefined +): number | undefined { + if (typeof options?.duration === 'number' && Number.isFinite(options.duration)) { + return Math.max(-1, Math.round(options.duration)) + } + + if (typeof options?.seconds !== 'string') { + return undefined + } + + const parsed = Number.parseInt(options.seconds.trim(), 10) + return Number.isFinite(parsed) ? Math.max(-1, parsed) : undefined +} + +function buildVideoGenerationRequestBody( + provider: LLM_PROVIDER, + modelId: string, + prompt: string, + options: VideoGenerationOptions | undefined +): VideoGenerationRequestBody { + const body: VideoGenerationRequestBody = { + model: modelId, + prompt + } + + if (options?.seconds) { + body.seconds = options.seconds + } + if (options?.size) { + body.size = options.size + } + if (options?.inputReference) { + if (typeof options.inputReference === 'string') { + body.input_reference = options.inputReference + } else { + body.input_reference = { + data: options.inputReference.data, + ...(options.inputReference.mimeType ? 
{ mime_type: options.inputReference.mimeType } : {}) + } + } + } + + const requestBodyShape = resolveOpenAICompatibleVideoRequestBodyShape({ + providerId: provider.id, + providerApiType: provider.apiType, + baseUrl: provider.baseUrl, + modelId + }) + + if (requestBodyShape === 'flat-top-level') { + const content = buildVideoGenerationContent(options) + if (content) { + body.content = content + } + if (options?.ratio) { + body.ratio = options.ratio.trim() + } + const duration = resolveFlatTopLevelVideoDuration(options) + if (duration !== undefined) { + body.duration = duration + } + if (options?.resolution) { + body.resolution = options.resolution.trim() + } + if (typeof options?.watermark === 'boolean') { + body.watermark = options.watermark + } + if (typeof options?.generateAudio === 'boolean') { + body.generate_audio = options.generateAudio + } + + return body + } + + const extraBody = buildVideoGenerationExtraBody(options) + if (extraBody) { + body.extra_body = extraBody + } + + return body +} + +function extractVideoTaskError(response: VideoGenerationTaskResponse | null | undefined): string { + const error = response?.error + if (typeof error === 'string' && error.trim()) { + return error.trim() + } + + if ( + error && + typeof error === 'object' && + typeof error.message === 'string' && + error.message.trim() + ) { + return error.message.trim() + } + + return 'Video generation failed' +} + +function resolveVideoTaskStatus(response: VideoGenerationTaskResponse | null | undefined): string { + return typeof response?.status === 'string' ? response.status.trim().toLowerCase() : '' +} + +function delayWithAbort(ms: number, signal: AbortSignal): Promise { + return new Promise((resolve, reject) => { + if (signal.aborted) { + reject(signal.reason instanceof Error ? signal.reason : new Error('Aborted')) + return + } + + const onAbort = () => { + clearTimeout(timeoutId) + signal.removeEventListener('abort', onAbort) + reject(signal.reason instanceof Error ? 
signal.reason : new Error('Aborted')) + } + + const timeoutId = setTimeout(() => { + signal.removeEventListener('abort', onAbort) + resolve() + }, ms) + + signal.addEventListener('abort', onAbort, { once: true }) + }) +} + +async function executeOpenAICompatibleVideoGeneration( + provider: LLM_PROVIDER, + defaultHeaders: Record, + modelId: string, + prompt: string, + modelConfig: ModelConfig, + timeout: number | undefined +): Promise<{ base64: string; mimeType: string }> { + const normalizedOptions = resolveVideoGenerationRequestOptions( + prompt, + modelId, + modelConfig.videoGeneration + ) + const baseUrl = normalizeOpenAICompatibleBaseUrl(provider.baseUrl) + const createUrl = `${baseUrl}/videos` + const body = buildVideoGenerationRequestBody(provider, modelId, prompt, normalizedOptions) + const controller = new AbortController() + const timeoutId = timeout ? setTimeout(() => controller.abort(), timeout) : undefined + const proxyUrl = proxyConfig.getProxyUrl() + const dispatcher = proxyUrl ? 
new ProxyAgent(proxyUrl) : undefined + + const fetchJson = async (url: string, init: RequestInit): Promise => { + const fetchInit: RequestInit & { dispatcher?: ProxyAgent } = { + ...init, + headers: { + ...defaultHeaders, + Authorization: `Bearer ${provider.oauthToken || provider.apiKey || ''}`, + ...(init.headers as Record | undefined) + }, + signal: controller.signal + } + if (dispatcher) fetchInit.dispatcher = dispatcher + + const response = await fetch(url, fetchInit) + if (!response.ok) { + const errorText = await response.text().catch(() => '') + throw new Error(`Video request failed (${response.status}): ${errorText}`) + } + + return (await response.json()) as T + } + + const fetchBinary = async (url: string): Promise<{ buffer: ArrayBuffer; mimeType: string }> => { + const fetchInit: RequestInit & { dispatcher?: ProxyAgent } = { + method: 'GET', + headers: { + ...defaultHeaders, + Authorization: `Bearer ${provider.oauthToken || provider.apiKey || ''}` + }, + signal: controller.signal + } + if (dispatcher) fetchInit.dispatcher = dispatcher + + const response = await fetch(url, fetchInit) + if (!response.ok) { + const errorText = await response.text().catch(() => '') + throw new Error(`Video content download failed (${response.status}): ${errorText}`) + } + + return { + buffer: await response.arrayBuffer(), + mimeType: response.headers.get('content-type')?.split(';')[0]?.trim() || 'video/mp4' + } + } + + try { + let task = await fetchJson(createUrl, { + method: 'POST', + headers: { + 'Content-Type': 'application/json' + }, + body: JSON.stringify(body) + }) + + const taskId = typeof task.id === 'string' ? 
task.id.trim() : '' + if (!taskId) { + throw new Error('Video generation response missing task id') + } + + let status = resolveVideoTaskStatus(task) + while (status !== 'completed') { + if (status === 'failed') { + throw new Error(extractVideoTaskError(task)) + } + + await delayWithAbort(VIDEO_GENERATION_POLL_INTERVAL_MS, controller.signal) + task = await fetchJson( + `${createUrl}/${encodeURIComponent(taskId)}`, + { + method: 'GET' + } + ) + status = resolveVideoTaskStatus(task) + } + + const contentUrl = + typeof task.url === 'string' && task.url.trim().length > 0 + ? task.url.trim() + : `${createUrl}/${encodeURIComponent(taskId)}/content` + const { buffer, mimeType } = await fetchBinary(contentUrl) + + return { + base64: Buffer.from(buffer).toString('base64'), + mimeType + } + } finally { + if (timeoutId !== undefined) { + clearTimeout(timeoutId) + } + } +} + async function buildPromptRuntime( context: AiSdkRuntimeContext, messages: ChatMessage[], @@ -801,6 +1265,49 @@ export async function* runAiSdkCoreStream( return } + if (shouldUseVideoGenerationRuntime(context, modelId, normalizedModelConfig)) { + const prompt = extractVideoPrompt(messages) + const normalizedVideoOptions = resolveVideoGenerationRequestOptions( + prompt, + modelId, + normalizedModelConfig.videoGeneration + ) + const requestBody = buildVideoGenerationRequestBody( + context.provider, + modelId, + prompt, + normalizedVideoOptions + ) + + await context.emitRequestTrace?.(normalizedModelConfig, { + endpoint: `${normalizeOpenAICompatibleBaseUrl(context.provider.baseUrl)}/videos`, + headers: context.buildTraceHeaders?.() ?? 
context.defaultHeaders, + body: requestBody + }) + + const { base64, mimeType } = await executeOpenAICompatibleVideoGeneration( + context.provider, + context.defaultHeaders, + modelId, + prompt, + normalizedModelConfig, + timeout + ) + + yield { + type: 'image_data', + image_data: { + data: `data:${mimeType};base64,${base64}`, + mimeType + } + } + yield { + type: 'stop', + stop_reason: 'complete' + } + return + } + if (shouldUseImageGenerationRuntime(context, modelId, normalizedModelConfig)) { const prompt = extractImagePrompt(messages) diff --git a/src/main/presenter/llmProviderPresenter/index.ts b/src/main/presenter/llmProviderPresenter/index.ts index a1c100a2c..b7bab7c04 100644 --- a/src/main/presenter/llmProviderPresenter/index.ts +++ b/src/main/presenter/llmProviderPresenter/index.ts @@ -9,6 +9,7 @@ import { KeyStatus, LLM_EMBEDDING_ATTRS, StandaloneImageGenerationResult, + StandaloneVideoGenerationResult, ModelScopeMcpSyncOptions, ModelScopeMcpSyncResult, IConfigPresenter, @@ -24,6 +25,10 @@ import { normalizeImageGenerationOptions, type ImageGenerationOptions } from '@shared/imageGenerationSettings' +import { + normalizeVideoGenerationOptions, + type VideoGenerationOptions +} from '@shared/videoGenerationSettings' import { ProviderChange, ProviderBatchUpdate } from '@shared/provider-operations' import { isProviderDbBackedProvider } from '@shared/providerDbCatalog' import { eventBus } from '@/eventbus' @@ -538,6 +543,89 @@ export class LLMProviderPresenter implements ILlmProviderPresenter { } } + async generateVideoStandalone( + providerId: string, + prompt: string, + modelId: string, + videoOptions?: VideoGenerationOptions, + options?: { signal?: AbortSignal } + ): Promise { + const normalizedPrompt = prompt.trim() + if (!normalizedPrompt) { + throw new Error('Video generation prompt is required') + } + + const signal = options?.signal + if (signal?.aborted) { + throw createAbortError() + } + + await this.executeWithRateLimit(providerId, { signal }) + + 
const provider = this.getProviderInstance(providerId) + const modelConfig = this.configPresenter.getModelConfig(modelId, providerId) + const mergedVideoOptions = normalizeVideoGenerationOptions({ + ...modelConfig.videoGeneration, + ...videoOptions + }) + const resolvedModelConfig: ModelConfig = { + ...modelConfig, + type: ModelType.VideoGeneration, + apiEndpoint: ApiEndpointType.Video, + videoGeneration: mergedVideoOptions + } + const stream = provider.coreStream( + [{ role: 'user', content: normalizedPrompt }], + modelId, + resolvedModelConfig, + modelConfig.temperature ?? 0.7, + modelConfig.maxTokens ?? 1024, + [] + ) + const videos: StandaloneVideoGenerationResult['videos'] = [] + const abort = createAbortPromise(signal, () => { + void stream.return?.(undefined as never) + }) + + const collect = async () => { + for await (const event of stream) { + if (signal?.aborted) { + throw createAbortError() + } + + if ( + event.type === 'image_data' && + event.image_data.mimeType.trim().toLowerCase().startsWith('video/') + ) { + videos.push({ + data: event.image_data.data, + mimeType: event.image_data.mimeType + }) + } + if (event.type === 'error') { + throw new Error(event.error_message) + } + } + } + + try { + await (abort.promise ? Promise.race([collect(), abort.promise]) : collect()) + } finally { + abort.cleanup() + } + + if (videos.length === 0) { + throw new Error('Video generation completed without video output') + } + + return { + providerId, + modelId, + ...(mergedVideoOptions ? 
{ options: mergedVideoOptions } : {}), + videos + } + } + // 配置相关方法 setMaxConcurrentStreams(max: number): void { this.config.maxConcurrentStreams = max diff --git a/src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts b/src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts index 0cb7ec041..37efbd9a9 100644 --- a/src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts +++ b/src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts @@ -8,6 +8,7 @@ import { type NewApiEndpointType } from '@shared/model' import { isTtsModelConfig, isTtsModelId } from '@shared/ttsSettings' +import { isVideoGenerationModelConfig } from '@shared/videoGenerationSettings' import { DEFAULT_MODEL_CONTEXT_LENGTH, DEFAULT_MODEL_MAX_TOKENS, @@ -96,6 +97,10 @@ const shouldUseOpenAIImageGenerationRoute = (modelId: string, modelConfig: Model modelConfig.apiEndpoint === ApiEndpointType.Image || modelConfig.type === ModelType.ImageGeneration +const shouldUseOpenAIVideoGenerationRoute = (modelId: string, modelConfig: ModelConfig): boolean => + modelConfig.apiEndpoint === ApiEndpointType.Video || + isVideoGenerationModelConfig(modelConfig, modelId) + const shouldUseOpenAITtsRoute = (modelId: string, modelConfig: ModelConfig): boolean => isTtsModelConfig(modelConfig) || modelConfig.apiEndpoint === ApiEndpointType.AudioSpeech || @@ -351,6 +356,27 @@ export class AiSdkProvider extends BaseLLMProvider { endpointType: 'image-generation' } } + case 'video-generation': + return { + providerKind: 'openai-compatible', + endpointType, + providerPatch: { + apiType: 'openai-completions', + baseUrl: `${host}/v1`, + capabilityProviderId: resolveProviderCapabilityProviderId( + this.provider.id, + { + endpointType + }, + modelId + ) + }, + modelConfigPatch: { + apiEndpoint: ApiEndpointType.Video, + type: ModelType.VideoGeneration, + endpointType: 'video-generation' + } + } case 'openai': default: return { @@ -575,6 +601,17 @@ export class AiSdkProvider extends 
BaseLLMProvider { isOpenAIImageGenerationModel(runtimeModelId) || runtimeModelConfig.apiEndpoint === ApiEndpointType.Image + const shouldUseVideoGeneration = + this.isAzureOpenAI(decision, runtimeProvider) || + decision.providerKind === 'gemini' || + decision.providerKind === 'vertex' || + decision.providerKind === 'anthropic' + ? undefined + : decision.endpointType === 'video-generation' + ? () => true + : (runtimeModelId: string, runtimeModelConfig: ModelConfig) => + shouldUseOpenAIVideoGenerationRoute(runtimeModelId, runtimeModelConfig) + // TTS route: only applicable for OpenAI-compatible providers (not Azure, Gemini, Vertex) const shouldUseTts = this.isAzureOpenAI(decision, runtimeProvider) || @@ -602,6 +639,7 @@ export class AiSdkProvider extends BaseLLMProvider { supportsNativeTools: (_runtimeModelId, runtimeModelConfig) => runtimeModelConfig.functionCall === true, shouldUseImageGeneration, + shouldUseVideoGeneration, shouldUseTts } } @@ -1673,17 +1711,22 @@ export class AiSdkProvider extends BaseLLMProvider { normalizedRawType === 'image' || supportedEndpointTypes.includes('image-generation') ? ModelType.ImageGeneration - : normalizedRawType === 'tts' || - normalizedRawType === 'audio-speech' || - normalizedRawType === 'audiospeech' - ? ModelType.TTS - : normalizedRawType === 'embedding' || - normalizedRawType === 'embeddings' || - normalizedModelId.includes('embedding') - ? ModelType.Embedding - : normalizedRawType === 'rerank' || normalizedModelId.includes('rerank') - ? ModelType.Rerank - : undefined + : normalizedRawType === 'videogeneration' || + normalizedRawType === 'video-generation' || + normalizedRawType === 'video' || + supportedEndpointTypes.includes('video-generation') + ? ModelType.VideoGeneration + : normalizedRawType === 'tts' || + normalizedRawType === 'audio-speech' || + normalizedRawType === 'audiospeech' + ? 
ModelType.TTS + : normalizedRawType === 'embedding' || + normalizedRawType === 'embeddings' || + normalizedModelId.includes('embedding') + ? ModelType.Embedding + : normalizedRawType === 'rerank' || normalizedModelId.includes('rerank') + ? ModelType.Rerank + : undefined const contextLengthCandidate = [ rawModel.context_length, @@ -1708,7 +1751,9 @@ export class AiSdkProvider extends BaseLLMProvider { supportedEndpointTypes.length === 0 ? type === ModelType.ImageGeneration ? 'image-generation' - : undefined + : type === ModelType.VideoGeneration + ? 'video-generation' + : undefined : resolveNewApiEndpointTypeFromRoute( { supportedEndpointTypes, diff --git a/src/main/presenter/sqlitePresenter/tables/deepchatSessions.ts b/src/main/presenter/sqlitePresenter/tables/deepchatSessions.ts index 779dfa71f..7993f9742 100644 --- a/src/main/presenter/sqlitePresenter/tables/deepchatSessions.ts +++ b/src/main/presenter/sqlitePresenter/tables/deepchatSessions.ts @@ -12,6 +12,10 @@ import { normalizeImageGenerationOptions, type ImageGenerationOptions } from '@shared/imageGenerationSettings' +import { + normalizeVideoGenerationOptions, + type VideoGenerationOptions +} from '@shared/videoGenerationSettings' type DeepChatSessionGenerationSettings = Pick< SessionGenerationSettings, @@ -26,6 +30,7 @@ type DeepChatSessionGenerationSettings = Pick< | 'verbosity' | 'forceInterleavedThinkingCompat' | 'imageGeneration' + | 'videoGeneration' > export interface DeepChatSessionRow { @@ -44,6 +49,7 @@ export interface DeepChatSessionRow { verbosity: 'low' | 'medium' | 'high' | null force_interleaved_thinking_compat: number | null image_generation_options_json: string | null + video_generation_options_json: string | null summary_text: string | null summary_cursor_order_seq: number | null summary_updated_at: number | null @@ -109,6 +115,10 @@ export class DeepChatSessionsTable extends BaseTable { columns.push('image_generation_options_json TEXT') } + if (version >= 28) { + 
columns.push('video_generation_options_json TEXT') + } + if (version >= 14) { columns.push( 'summary_text TEXT', @@ -187,6 +197,11 @@ export class DeepChatSessionsTable extends BaseTable { 'ALTER TABLE deepchat_sessions ADD COLUMN image_generation_options_json TEXT;' ) } + if (!this.hasColumn('video_generation_options_json')) { + statements.push( + 'ALTER TABLE deepchat_sessions ADD COLUMN video_generation_options_json TEXT;' + ) + } return statements } @@ -230,11 +245,14 @@ export class DeepChatSessionsTable extends BaseTable { if (version === 27) { return 'ALTER TABLE deepchat_sessions ADD COLUMN image_generation_options_json TEXT;' } + if (version === 28) { + return 'ALTER TABLE deepchat_sessions ADD COLUMN video_generation_options_json TEXT;' + } return null } getLatestVersion(): number { - return 27 + return 28 } private serializeImageGenerationOptions( @@ -257,6 +275,26 @@ export class DeepChatSessionsTable extends BaseTable { } } + private serializeVideoGenerationOptions( + value: VideoGenerationOptions | undefined + ): string | null { + const normalized = normalizeVideoGenerationOptions(value) + return normalized ? JSON.stringify(normalized) : null + } + + private parseVideoGenerationOptions(value: string | null): VideoGenerationOptions | undefined { + if (!value) { + return undefined + } + + try { + const parsed = JSON.parse(value) as VideoGenerationOptions + return normalizeVideoGenerationOptions(parsed) + } catch { + return undefined + } + } + create( id: string, providerId: string, @@ -282,11 +320,12 @@ export class DeepChatSessionsTable extends BaseTable { verbosity, force_interleaved_thinking_compat, image_generation_options_json, + video_generation_options_json, summary_text, summary_cursor_order_seq, summary_updated_at ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` ) .run( id, @@ -308,6 +347,7 @@ export class DeepChatSessionsTable extends BaseTable { ? 
1 : 0, this.serializeImageGenerationOptions(generationSettings?.imageGeneration), + this.serializeVideoGenerationOptions(generationSettings?.videoGeneration), null, 1, null @@ -362,6 +402,10 @@ export class DeepChatSessionsTable extends BaseTable { if (imageGeneration) { settings.imageGeneration = imageGeneration } + const videoGeneration = this.parseVideoGenerationOptions(row.video_generation_options_json) + if (videoGeneration) { + settings.videoGeneration = videoGeneration + } return settings } @@ -430,6 +474,10 @@ export class DeepChatSessionsTable extends BaseTable { updates.push('image_generation_options_json = ?') params.push(this.serializeImageGenerationOptions(settings.imageGeneration)) } + if (Object.prototype.hasOwnProperty.call(settings, 'videoGeneration')) { + updates.push('video_generation_options_json = ?') + params.push(this.serializeVideoGenerationOptions(settings.videoGeneration)) + } if (updates.length === 0) { return diff --git a/src/renderer/settings/components/ProviderModelList.vue b/src/renderer/settings/components/ProviderModelList.vue index d4b45ff76..3aa5f83b2 100644 --- a/src/renderer/settings/components/ProviderModelList.vue +++ b/src/renderer/settings/components/ProviderModelList.vue @@ -376,7 +376,9 @@ const TYPE_ORDER: ModelType[] = [ ModelType.Chat, ModelType.Embedding, ModelType.Rerank, - ModelType.ImageGeneration + ModelType.ImageGeneration, + ModelType.VideoGeneration, + ModelType.TTS ] const CAPABILITY_ICONS: Record = { @@ -391,6 +393,7 @@ const TYPE_ICONS: Record = { [ModelType.Embedding]: 'lucide:database', [ModelType.Rerank]: 'lucide:arrow-up-wide-narrow', [ModelType.ImageGeneration]: 'lucide:image', + [ModelType.VideoGeneration]: 'lucide:clapperboard', [ModelType.TTS]: 'lucide:volume-2' } diff --git a/src/renderer/src/components/ChatConfig.vue b/src/renderer/src/components/ChatConfig.vue index 6b3e151db..044aaf584 100644 --- a/src/renderer/src/components/ChatConfig.vue +++ b/src/renderer/src/components/ChatConfig.vue @@ -40,7 
+40,7 @@ const props = defineProps<{ providerId?: string reasoningEffort?: ReasoningEffort verbosity?: Verbosity - modelType?: 'chat' | 'imageGeneration' | 'embedding' | 'rerank' + modelType?: 'chat' | 'imageGeneration' | 'videoGeneration' | 'tts' | 'embedding' | 'rerank' }>() const systemPrompt = defineModel('systemPrompt') @@ -129,7 +129,7 @@ const { sliderFields, inputFields, selectFields } = useChatConfigFields({ watch( () => props.modelType, (newType) => { - if (newType === 'imageGeneration' && systemPrompt.value) { + if ((newType === 'imageGeneration' || newType === 'videoGeneration') && systemPrompt.value) { systemPrompt.value = '' } } @@ -140,6 +140,8 @@ const modelTypeIcon = computed(() => { const icons = { chat: 'lucide:message-circle', imageGeneration: 'lucide:image', + videoGeneration: 'lucide:clapperboard', + tts: 'lucide:volume-2', embedding: 'lucide:layers', rerank: 'lucide:arrow-up-down' } @@ -157,7 +159,13 @@ const modelTypeIcon = computed(() => {
-
+
diff --git a/src/renderer/src/components/chat/ChatStatusBar.vue b/src/renderer/src/components/chat/ChatStatusBar.vue index 8d7612cc0..7d479c971 100644 --- a/src/renderer/src/components/chat/ChatStatusBar.vue +++ b/src/renderer/src/components/chat/ChatStatusBar.vue @@ -264,7 +264,7 @@
-
+
@@ -402,7 +402,7 @@

-
+
@@ -541,8 +541,15 @@ @update:model-value="onImageGenerationSettingsUpdate" /> + +
@@ -715,7 +722,7 @@

-
+