From 60099cdd7ba454257dfb5209aff8e90343b0c492 Mon Sep 17 00:00:00 2001 From: zhangmo8 Date: Mon, 18 May 2026 16:50:45 +0800 Subject: [PATCH 1/3] feat: implement OpenAI-compatible video generation features and settings --- .../plan.md | 42 ++ .../spec.md | 32 ++ .../tasks.md | 25 + .../plan.md | 21 + .../spec.md | 25 + .../tasks.md | 11 + .../presenter/agentRuntimePresenter/index.ts | 98 +++- src/main/presenter/configPresenter/index.ts | 12 + .../presenter/configPresenter/modelConfig.ts | 20 +- .../configPresenter/providerModelHelper.ts | 21 +- src/main/presenter/index.ts | 8 + .../llmProviderPresenter/aiSdk/runtime.ts | 507 ++++++++++++++++++ .../presenter/llmProviderPresenter/index.ts | 88 +++ .../providers/aiSdkProvider.ts | 69 ++- .../tables/deepchatSessions.ts | 52 +- .../settings/components/ProviderModelList.vue | 5 +- src/renderer/src/components/ChatConfig.vue | 14 +- .../src/components/chat/ChatStatusBar.vue | 89 ++- .../src/components/chat/messageListItems.ts | 1 + .../components/message/MessageBlockVideo.vue | 132 +++++ .../message/MessageItemAssistant.vue | 24 +- .../components/settings/ModelConfigDialog.vue | 112 +++- .../OpenAIVideoGenerationSettingsFields.vue | 175 ++++++ .../src/composables/useModelTypeDetection.ts | 13 +- src/renderer/src/i18n/da-DK/model.json | 3 +- src/renderer/src/i18n/da-DK/settings.json | 40 +- src/renderer/src/i18n/en-US/model.json | 3 +- src/renderer/src/i18n/en-US/settings.json | 40 +- src/renderer/src/i18n/fa-IR/model.json | 3 +- src/renderer/src/i18n/fa-IR/settings.json | 40 +- src/renderer/src/i18n/fr-FR/model.json | 3 +- src/renderer/src/i18n/fr-FR/settings.json | 40 +- src/renderer/src/i18n/he-IL/model.json | 3 +- src/renderer/src/i18n/he-IL/settings.json | 40 +- src/renderer/src/i18n/ja-JP/model.json | 3 +- src/renderer/src/i18n/ja-JP/settings.json | 40 +- src/renderer/src/i18n/ko-KR/model.json | 3 +- src/renderer/src/i18n/ko-KR/settings.json | 40 +- src/renderer/src/i18n/pt-BR/model.json | 3 +- 
src/renderer/src/i18n/pt-BR/settings.json | 40 +- src/renderer/src/i18n/ru-RU/model.json | 3 +- src/renderer/src/i18n/ru-RU/settings.json | 40 +- src/renderer/src/i18n/zh-CN/model.json | 3 +- src/renderer/src/i18n/zh-CN/settings.json | 40 +- src/renderer/src/i18n/zh-HK/model.json | 3 +- src/renderer/src/i18n/zh-HK/settings.json | 40 +- src/renderer/src/i18n/zh-TW/model.json | 3 +- src/renderer/src/i18n/zh-TW/settings.json | 40 +- src/renderer/src/pages/NewThreadPage.vue | 1 + src/renderer/src/stores/modelStore.ts | 26 +- src/renderer/src/stores/ui/draft.ts | 19 + src/shared/contracts/common.ts | 36 +- src/shared/contracts/domainSchemas.ts | 2 + src/shared/model.ts | 17 +- src/shared/types/agent-interface.d.ts | 2 + src/shared/types/model-db.ts | 17 +- src/shared/types/presenters/index.d.ts | 1 + .../types/presenters/legacy.presenters.d.ts | 16 + .../presenters/llmprovider.presenter.d.ts | 16 + src/shared/videoGenerationSettings.ts | 339 ++++++++++++ .../llmProviderPresenter/aiSdkRuntime.test.ts | 208 +++++++ .../aihubmixProvider.test.ts | 33 ++ 62 files changed, 2720 insertions(+), 125 deletions(-) create mode 100644 docs/features/openai-compatible-video-generation/plan.md create mode 100644 docs/features/openai-compatible-video-generation/spec.md create mode 100644 docs/features/openai-compatible-video-generation/tasks.md create mode 100644 docs/issues/openai-compatible-video-prompt-duration-fallback/plan.md create mode 100644 docs/issues/openai-compatible-video-prompt-duration-fallback/spec.md create mode 100644 docs/issues/openai-compatible-video-prompt-duration-fallback/tasks.md create mode 100644 src/renderer/src/components/message/MessageBlockVideo.vue create mode 100644 src/renderer/src/components/settings/OpenAIVideoGenerationSettingsFields.vue create mode 100644 src/shared/videoGenerationSettings.ts diff --git a/docs/features/openai-compatible-video-generation/plan.md b/docs/features/openai-compatible-video-generation/plan.md new file mode 100644 index 
000000000..37b2c5b4f --- /dev/null +++ b/docs/features/openai-compatible-video-generation/plan.md @@ -0,0 +1,42 @@ +# Plan + +## Approach +Treat video generation as a first-class model capability parallel to image generation and TTS: +- Extend shared model/type enums and model-db parsing to include `videoGeneration`. +- Add a shared video compatibility helper that can recover video intent from model metadata, endpoint hints, modalities, or known model ID patterns when upstream data is incomplete. +- Add an OpenAI-compatible video runtime path that sends requests to `/v1/videos`, normalizes provider responses, and emits media output into the assistant stream. +- Reuse the current assistant media block transport by carrying video payloads through the existing message block structure with video MIME detection on the renderer side. + +## Affected Areas +- Shared types/contracts: + - `src/shared/model.ts` + - `src/shared/types/model-db.ts` + - `src/shared/types/presenters/llmprovider.presenter.d.ts` + - `src/shared/types/presenters/legacy.presenters.d.ts` + - `src/shared/videoGenerationSettings.ts` (new) +- Main runtime/provider: + - `src/main/presenter/configPresenter/index.ts` + - `src/main/presenter/configPresenter/modelConfig.ts` + - `src/main/presenter/llmProviderPresenter/index.ts` + - `src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts` + - `src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts` +- Renderer: + - `src/renderer/src/composables/useModelTypeDetection.ts` + - `src/renderer/src/components/chat/messageListItems.ts` + - `src/renderer/src/components/message/MessageItemAssistant.vue` + - `src/renderer/src/components/message/MessageBlockVideo.vue` (new) + - `src/renderer/settings/components/ProviderModelList.vue` +- Model DB: + - `resources/model-db/providers.json` + +## Compatibility +- Existing text, image, and TTS paths remain unchanged. 
+- Existing assistant block persistence remains compatible by reusing the current media payload field rather than changing the storage shape. +- Future video models can plug in through shared detection helpers or explicit `videoGeneration` metadata. + +## Verification Strategy +Run: +- `pnpm run typecheck` +- `pnpm run format` +- `pnpm run i18n` +- `pnpm run lint` diff --git a/docs/features/openai-compatible-video-generation/spec.md b/docs/features/openai-compatible-video-generation/spec.md new file mode 100644 index 000000000..66550c902 --- /dev/null +++ b/docs/features/openai-compatible-video-generation/spec.md @@ -0,0 +1,32 @@ +# OpenAI-Compatible Video Generation + +## User Need +Users need DeepChat to recognize and run video generation models such as `doubao-seedance-2-0-fast-260128` through the same model-driven provider flow used by text and audio generation, without hardcoding one-off provider logic for each future video model. + +## Goal +Enable first-class video generation routing in DeepChat for OpenAI-compatible providers, starting with AIHubMix Seedance models and leaving a compatibility layer for future video models. + +## Acceptance Criteria +1. Shared model/type contracts support `videoGeneration` and preserve compatibility with existing model metadata. +2. DeepChat can recognize `doubao-seedance-2-0-fast-260128` as a video generation model even when upstream metadata is incomplete or still marked as `chat`. +3. Main runtime can route video generation requests through an OpenAI-compatible `/v1/videos` flow. +4. Video generation responses are normalized into a stable internal result shape that future providers/models can reuse. +5. Generated video output reaches the existing assistant message pipeline and renders in the chat UI. +6. Validation commands pass: +- `pnpm run typecheck` +- `pnpm run format` +- `pnpm run i18n` +- `pnpm run lint` + +## Constraints +- Keep the provider integration generic for OpenAI-compatible video endpoints. 
+- Reuse the current assistant media block pipeline where practical instead of introducing a parallel storage format. +- Do not include advanced video editing controls or provider-specific parameter UIs in this change. + +## Non-Goals +- Dedicated video generation settings panels beyond the settings fields exposed in the chat status bar and model config dialog. +- Agent-level video generation tool configuration. +- Non-OpenAI-compatible video provider protocols. + +## Open Questions +- None for current scope. diff --git a/docs/features/openai-compatible-video-generation/tasks.md b/docs/features/openai-compatible-video-generation/tasks.md new file mode 100644 index 000000000..d27f8ab8f --- /dev/null +++ b/docs/features/openai-compatible-video-generation/tasks.md @@ -0,0 +1,25 @@ +# Tasks + +## Shared Types + Detection +- [x] Add `ModelType.VideoGeneration` and extend model-db parsing/schema for `videoGeneration`. +- [x] Add shared video detection/compatibility helpers for endpoint hints, modalities, and known model IDs. +- [x] Update model config inference to classify video models consistently in main and renderer flows. +- [x] Extend session generation settings/contracts and draft state to carry `videoGeneration` options. + +## Runtime + Provider +- [x] Add `generateVideoStandalone` presenter contracts and implementation. +- [x] Add OpenAI-compatible `/v1/videos` request/response normalization in the AI SDK runtime/provider path. +- [x] Persist and sanitize session-level video generation settings through agent runtime and sqlite storage. +- [ ] Mark Seedance built-in model metadata as `videoGeneration` where available. + +## Renderer +- [x] Expose video model detection for UI behavior alignment. +- [x] Add assistant message rendering for generated video media. +- [x] Update model list/type display for video generation models. +- [x] Expose video generation settings in chat status bar and model config dialog flows. + +## Validation +- [x] Run `pnpm run typecheck`. +- [x] Run `pnpm run format`. +- [x] Run `pnpm run i18n`. 
+- [x] Run `pnpm run lint`. diff --git a/docs/issues/openai-compatible-video-prompt-duration-fallback/plan.md b/docs/issues/openai-compatible-video-prompt-duration-fallback/plan.md new file mode 100644 index 000000000..2f7ed4812 --- /dev/null +++ b/docs/issues/openai-compatible-video-prompt-duration-fallback/plan.md @@ -0,0 +1,21 @@ +# Plan + +## Approach +Add a small runtime helper that extracts an integer duration from obvious prompt hints only when no structured video duration (`duration` or `seconds`) is configured and the parsed value is supported by the active model, then reuse that helper for both request tracing and the actual `/videos` request body. + +## Implementation +- Add a focused runtime test that exercises the OpenAI-compatible `/videos` flow and asserts `duration: 2` is sent for prompts like `... 2s`. +- Add a conservative prompt-duration extractor for `Ns`, `N sec`, `N secs`, `N second`, `N seconds`, and `N秒`, where `N` is a one- or two-digit integer. +- Enforce model-specific validity before injecting the derived duration (for Seedance, only `4`–`15` seconds). +- Apply the fallback only when `videoGeneration.duration` and `videoGeneration.seconds` are both unset. + +## Affected Files +- `src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts` +- `test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts` +- `docs/issues/openai-compatible-video-prompt-duration-fallback/tasks.md` + +## Validation +- Focused AI SDK runtime tests for video request bodies. 
+- `pnpm run format` +- `pnpm run i18n` +- `pnpm run lint` diff --git a/docs/issues/openai-compatible-video-prompt-duration-fallback/spec.md b/docs/issues/openai-compatible-video-prompt-duration-fallback/spec.md new file mode 100644 index 000000000..04f28d04b --- /dev/null +++ b/docs/issues/openai-compatible-video-prompt-duration-fallback/spec.md @@ -0,0 +1,25 @@ +# OpenAI-Compatible Video Prompt Duration Fallback + +## User Need +When users send prompts such as `生成 马斯克 喝酒的视频 2s` to OpenAI-compatible video models, DeepChat should send the obvious duration hint as a structured `duration` parameter instead of sending only the raw prompt body. + +## Goal +Infer an explicit video duration from clear prompt hints like `5s` or `5秒` when the session has no structured video duration configured and the parsed value is valid for the target model. + +## Acceptance Criteria +1. OpenAI-compatible video requests derive `duration` from obvious prompt hints when neither `duration` nor `seconds` is already configured and the parsed value is valid for the current model. +2. Explicit structured video settings still take precedence over any prompt-derived fallback. +3. The emitted request trace matches the actual `/videos` body for this fallback. +4. Focused validation passes for the touched runtime slice. + +## Constraints +- Keep the fallback narrow and conservative; do not attempt broad natural-language parameter parsing. +- Preserve existing request-shape compatibility and polling behavior. + +## Non-Goals +- Adding or changing video settings UI. +- Parsing arbitrary style, ratio, or resolution hints from prompts. +- Changing provider safety or moderation behavior. + +## Open Questions +- None. 
diff --git a/docs/issues/openai-compatible-video-prompt-duration-fallback/tasks.md b/docs/issues/openai-compatible-video-prompt-duration-fallback/tasks.md new file mode 100644 index 000000000..bed18d1c6 --- /dev/null +++ b/docs/issues/openai-compatible-video-prompt-duration-fallback/tasks.md @@ -0,0 +1,11 @@ +# Tasks + +## Runtime Fallback +- [x] Add a runtime regression test for prompt-derived video duration. +- [x] Apply a conservative prompt duration fallback before building `/videos` requests. + +## Validation +- [x] Run focused AI SDK runtime tests. +- [x] Run `pnpm run format`. +- [x] Run `pnpm run i18n`. +- [x] Run `pnpm run lint`. diff --git a/src/main/presenter/agentRuntimePresenter/index.ts b/src/main/presenter/agentRuntimePresenter/index.ts index 73dc22488..d5cd9a559 100644 --- a/src/main/presenter/agentRuntimePresenter/index.ts +++ b/src/main/presenter/agentRuntimePresenter/index.ts @@ -57,8 +57,13 @@ import { normalizeImageGenerationOptions, supportsOpenAIImageGenerationSettings } from '@shared/imageGenerationSettings' -import { isDeepSeekSeriesModelId } from '@shared/model' +import { ModelType, isDeepSeekSeriesModelId } from '@shared/model' import { isTtsModelConfig, isTtsModelId } from '@shared/ttsSettings' +import { + isVideoGenerationModelConfig, + normalizeVideoGenerationOptions, + supportsOpenAICompatibleVideoGeneration +} from '@shared/videoGenerationSettings' import { nanoid } from 'nanoid' import type { SQLitePresenter } from '../sqlitePresenter' import { eventBus, SendTarget } from '@/eventbus' @@ -630,6 +635,7 @@ export class AgentRuntimePresenter implements IAgentImplementation { ) const contextBudgetLength = this.resolveDeepChatContextBudgetLength( state.providerId, + state.modelId, generationSettings.contextLength ) const maxTokens = capAgentRequestMaxTokens(generationSettings.maxTokens, contextBudgetLength) @@ -657,7 +663,10 @@ export class AgentRuntimePresenter implements IAgentImplementation { think: false } - const compactionIntent = 
this.shouldBypassDeepChatContextBudget(state.providerId) + const compactionIntent = this.shouldBypassDeepChatContextBudget( + state.providerId, + state.modelId + ) ? null : await this.compactionService.prepareForNextUserTurn({ sessionId, @@ -1429,15 +1438,34 @@ export class AgentRuntimePresenter implements IAgentImplementation { return resolvedProviderId === 'acp' } - private shouldBypassDeepChatContextBudget(providerId?: string | null): boolean { - return providerId?.trim() === 'acp' + private shouldBypassDeepChatContextBudget( + providerId?: string | null, + modelId?: string | null + ): boolean { + const normalizedProviderId = providerId?.trim() + if (normalizedProviderId === 'acp') { + return true + } + + const normalizedModelId = modelId?.trim() + if (!normalizedProviderId || !normalizedModelId) { + return false + } + + const modelConfig = this.configPresenter.getModelConfig(normalizedModelId, normalizedProviderId) + return ( + modelConfig.type === ModelType.ImageGeneration || + modelConfig.type === ModelType.TTS || + isVideoGenerationModelConfig(modelConfig, normalizedModelId) + ) } private resolveDeepChatContextBudgetLength( providerId: string | null | undefined, + modelId: string | null | undefined, contextLength: number ): number { - return this.shouldBypassDeepChatContextBudget(providerId) + return this.shouldBypassDeepChatContextBudget(providerId, modelId) ? 
Number.MAX_SAFE_INTEGER : contextLength } @@ -1620,7 +1648,7 @@ export class AgentRuntimePresenter implements IAgentImplementation { if (!state) { throw new Error(`Session ${sessionId} not found`) } - if (this.shouldBypassDeepChatContextBudget(state.providerId)) { + if (this.shouldBypassDeepChatContextBudget(state.providerId, state.modelId)) { throw new Error('Manual compaction is only available for DeepChat agent sessions.') } if (state.status !== 'idle') { @@ -1640,6 +1668,7 @@ export class AgentRuntimePresenter implements IAgentImplementation { ) const contextBudgetLength = this.resolveDeepChatContextBudgetLength( state.providerId, + state.modelId, generationSettings.contextLength ) const maxTokens = capAgentRequestMaxTokens(generationSettings.maxTokens, contextBudgetLength) @@ -1858,9 +1887,13 @@ export class AgentRuntimePresenter implements IAgentImplementation { const interleavedReasoning = providedInterleavedReasoning ?? this.resolveInterleavedReasoningConfig(state.providerId, state.modelId, generationSettings) - const bypassContextBudget = this.shouldBypassDeepChatContextBudget(state.providerId) + const bypassContextBudget = this.shouldBypassDeepChatContextBudget( + state.providerId, + state.modelId + ) const contextBudgetLength = this.resolveDeepChatContextBudgetLength( state.providerId, + state.modelId, generationSettings.contextLength ) const baseModelConfig = this.configPresenter.getModelConfig(state.modelId, state.providerId) @@ -1877,6 +1910,7 @@ export class AgentRuntimePresenter implements IAgentImplementation { reasoningVisibility: generationSettings.reasoningVisibility, verbosity: generationSettings.verbosity, imageGeneration: generationSettings.imageGeneration, + videoGeneration: generationSettings.videoGeneration, reasoning: getReasoningEffectiveEnabledForProvider(capabilityProviderId, reasoningPortrait, { reasoning: baseModelConfig.reasoning, reasoningEffort: generationSettings.reasoningEffort ?? 
baseModelConfig.reasoningEffort @@ -2555,6 +2589,7 @@ export class AgentRuntimePresenter implements IAgentImplementation { ) const contextBudgetLength = this.resolveDeepChatContextBudgetLength( state.providerId, + state.modelId, generationSettings.contextLength ) const maxTokens = capAgentRequestMaxTokens(generationSettings.maxTokens, contextBudgetLength) @@ -2574,7 +2609,7 @@ export class AgentRuntimePresenter implements IAgentImplementation { activeSkillNames ) this.throwIfAbortRequested(preStreamAbortSignal) - const summaryState = this.shouldBypassDeepChatContextBudget(state.providerId) + const summaryState = this.shouldBypassDeepChatContextBudget(state.providerId, state.modelId) ? this.sessionStore.getSummaryState(sessionId) : await this.resolveCompactionStateForResumeTurn({ sessionId, @@ -2615,7 +2650,7 @@ export class AgentRuntimePresenter implements IAgentImplementation { if ( budgetToolCall?.id && budgetToolCall.name && - !this.shouldBypassDeepChatContextBudget(state.providerId) + !this.shouldBypassDeepChatContextBudget(state.providerId, state.modelId) ) { const resumeBudget = this.fitResumeBudgetForToolCall({ resumeContext, @@ -3393,6 +3428,22 @@ export class AgentRuntimePresenter implements IAgentImplementation { } } + if ( + supportsOpenAICompatibleVideoGeneration({ + providerId, + providerApiType: this.resolveProviderApiType(providerId), + modelId, + apiEndpoint: modelConfig.apiEndpoint, + endpointType: modelConfig.endpointType, + type: modelConfig.type + }) + ) { + const videoGeneration = normalizeVideoGenerationOptions(modelConfig.videoGeneration) + if (videoGeneration) { + defaults.videoGeneration = videoGeneration + } + } + const supportsReasoning = this.configPresenter.supportsReasoningCapability?.(providerId, modelId) === true if (supportsReasoning) { @@ -3637,6 +3688,35 @@ export class AgentRuntimePresenter implements IAgentImplementation { delete next.imageGeneration } + if ( + supportsOpenAICompatibleVideoGeneration({ + providerId, + 
providerApiType: this.resolveProviderApiType(providerId), + modelId, + apiEndpoint: modelConfig.apiEndpoint, + endpointType: modelConfig.endpointType, + type: modelConfig.type + }) + ) { + if (Object.prototype.hasOwnProperty.call(patch, 'videoGeneration')) { + const videoGeneration = normalizeVideoGenerationOptions(patch.videoGeneration) + if (videoGeneration) { + next.videoGeneration = videoGeneration + } else { + delete next.videoGeneration + } + } else { + const videoGeneration = normalizeVideoGenerationOptions(next.videoGeneration) + if (videoGeneration) { + next.videoGeneration = videoGeneration + } else { + delete next.videoGeneration + } + } + } else { + delete next.videoGeneration + } + if (fixedTemperatureKimi) { next.temperature = fixedTemperatureKimi.temperature } diff --git a/src/main/presenter/configPresenter/index.ts b/src/main/presenter/configPresenter/index.ts index b06bde6ec..35512639e 100644 --- a/src/main/presenter/configPresenter/index.ts +++ b/src/main/presenter/configPresenter/index.ts @@ -27,6 +27,7 @@ import { resolveProviderCapabilityProviderId, type NewApiEndpointType } from '@shared/model' +import { resolveVideoGenerationCompatType } from '@shared/videoGenerationSettings' import { DEFAULT_MODEL_CAPABILITY_FALLBACKS, resolveDerivedModelMaxTokens, @@ -973,6 +974,15 @@ export class ConfigPresenter implements IConfigPresenter { } private inferProviderDbModelType(model: ProviderModel): ModelType { + const videoGenerationType = resolveVideoGenerationCompatType({ + modelId: model.id, + type: model.type, + modalities: model.modalities + }) + if (videoGenerationType) { + return videoGenerationType + } + if (Array.isArray(model.modalities?.output) && model.modalities.output.includes('image')) { return ModelType.ImageGeneration } @@ -984,6 +994,8 @@ export class ConfigPresenter implements IConfigPresenter { return ModelType.Rerank case 'imageGeneration': return ModelType.ImageGeneration + case 'videoGeneration': + return ModelType.VideoGeneration 
case 'tts': return ModelType.TTS case 'chat': diff --git a/src/main/presenter/configPresenter/modelConfig.ts b/src/main/presenter/configPresenter/modelConfig.ts index c6fe4fe04..fcd2c709c 100644 --- a/src/main/presenter/configPresenter/modelConfig.ts +++ b/src/main/presenter/configPresenter/modelConfig.ts @@ -13,6 +13,7 @@ import { resolveModelFunctionCall } from '@shared/modelConfigDefaults' import { applyMoonshotKimiReasoningTemperaturePolicy } from '@shared/moonshotKimiPolicy' +import { resolveVideoGenerationCompatType } from '@shared/videoGenerationSettings' import ElectronStore from 'electron-store' import { providerDbLoader } from './providerDbLoader' import { @@ -105,6 +106,15 @@ export class ModelConfigHelper { * Priority: 1. modalities.output includes image 2. model.type (from provider.json) 3. default Chat */ private inferModelType(model: ProviderModel): ModelType { + const videoGenerationType = resolveVideoGenerationCompatType({ + modelId: model.id, + type: model.type, + modalities: model.modalities + }) + if (videoGenerationType) { + return videoGenerationType + } + // Priority 1: Output modality indicates image generation if (Array.isArray(model.modalities?.output) && model.modalities.output.includes('image')) { return ModelType.ImageGeneration @@ -121,6 +131,8 @@ export class ModelConfigHelper { return ModelType.Rerank case 'imageGeneration': return ModelType.ImageGeneration + case 'videoGeneration': + return ModelType.VideoGeneration case 'tts': return ModelType.TTS default: @@ -180,9 +192,11 @@ export class ModelConfigHelper { apiEndpoint: modelType === ModelType.ImageGeneration ? ApiEndpointType.Image - : modelType === ModelType.TTS - ? ApiEndpointType.AudioSpeech - : ApiEndpointType.Chat, + : modelType === ModelType.VideoGeneration + ? ApiEndpointType.Video + : modelType === ModelType.TTS + ? 
ApiEndpointType.AudioSpeech + : ApiEndpointType.Chat, thinkingBudget, forceInterleavedThinkingCompat, reasoningEffort, diff --git a/src/main/presenter/configPresenter/providerModelHelper.ts b/src/main/presenter/configPresenter/providerModelHelper.ts index e71a2833d..6ded4e17d 100644 --- a/src/main/presenter/configPresenter/providerModelHelper.ts +++ b/src/main/presenter/configPresenter/providerModelHelper.ts @@ -2,6 +2,7 @@ import { eventBus, SendTarget } from '@/eventbus' import { CONFIG_EVENTS } from '@/events' import { ModelConfig, MODEL_META } from '@shared/presenter' import { ModelType } from '@shared/model' +import { resolveVideoGenerationCompatType } from '@shared/videoGenerationSettings' import ElectronStore from 'electron-store' import path from 'path' import type { StoreLike } from './storeLike' @@ -143,16 +144,30 @@ export class ProviderModelHelper { normalizedModel.reasoning !== undefined ? normalizedModel.reasoning : config.reasoning || false - normalizedModel.type = - normalizedModel.type !== undefined ? normalizedModel.type : config.type || ModelType.Chat normalizedModel.endpointType = config.endpointType ?? normalizedModel.endpointType + normalizedModel.type = + resolveVideoGenerationCompatType({ + modelId: normalizedModel.id, + type: config.type ?? normalizedModel.type, + apiEndpoint: config.apiEndpoint, + endpointType: normalizedModel.endpointType, + supportedEndpointTypes: normalizedModel.supportedEndpointTypes + }) ?? + (normalizedModel.type !== undefined ? 
normalizedModel.type : config.type || ModelType.Chat) return normalizedModel } normalizedModel.vision = normalizedModel.vision || false normalizedModel.functionCall = normalizedModel.functionCall || false normalizedModel.reasoning = normalizedModel.reasoning || false - normalizedModel.type = normalizedModel.type || ModelType.Chat + normalizedModel.type = + resolveVideoGenerationCompatType({ + modelId: normalizedModel.id, + type: normalizedModel.type, + endpointType: normalizedModel.endpointType, + supportedEndpointTypes: normalizedModel.supportedEndpointTypes + }) ?? + (normalizedModel.type || ModelType.Chat) return normalizedModel } diff --git a/src/main/presenter/index.ts b/src/main/presenter/index.ts index dfb631d9d..5c3a65308 100644 --- a/src/main/presenter/index.ts +++ b/src/main/presenter/index.ts @@ -388,6 +388,14 @@ export class Presenter implements IPresenter { modelId, imageOptions, options + ), + generateVideoStandalone: (providerId, prompt, modelId, videoOptions, options) => + this.llmproviderPresenter.generateVideoStandalone( + providerId, + prompt, + modelId, + videoOptions, + options ) }), cacheImage: (data) => this.devicePresenter.cacheImage(data), diff --git a/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts b/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts index 56cff686e..bf5cf959b 100644 --- a/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts +++ b/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts @@ -19,6 +19,13 @@ import { supportsOpenAIImageGenerationSettings, type ImageGenerationOptions } from '@shared/imageGenerationSettings' +import { + isVideoGenerationModelConfig, + normalizeVideoGenerationOptions, + resolveOpenAICompatibleVideoRequestBodyShape, + type VideoGenerationOptions, + type VideoGenerationReference +} from '@shared/videoGenerationSettings' import { isChatAudioTtsModel, isGeminiGenerateContentTtsModel, @@ -48,9 +55,40 @@ type ImageGenerationRequestOptions = { providerOptions?: Record } +type 
VideoGenerationRequestBody = { + model: string + prompt: string + seconds?: string + size?: string + input_reference?: string | { mime_type?: string; data: string } + content?: Array> + ratio?: string + duration?: number + resolution?: string + watermark?: boolean + generate_audio?: boolean + extra_body?: Record +} + +type VideoGenerationTaskResponse = { + id?: string + status?: string + url?: string | null + error?: + | string + | { + message?: string + } + | null +} + const DEFAULT_GEMINI_TTS_VOICE = 'Kore' const DEFAULT_GEMINI_PCM_SAMPLE_RATE = 24000 const DEFAULT_GEMINI_PCM_BITS_PER_SAMPLE = 16 +const VIDEO_GENERATION_POLL_INTERVAL_MS = 3000 +const PROMPT_VIDEO_DURATION_EN_PATTERN = + /(^|[^0-9a-z])(?\d{1,2})\s*(?:s|sec|secs|second|seconds)\b/i +const PROMPT_VIDEO_DURATION_ZH_PATTERN = /(?\d{1,2})\s*秒/u export interface AiSdkRuntimeContext { providerKind: AiSdkProviderKind @@ -71,6 +109,7 @@ export interface AiSdkRuntimeContext { cleanHeaders?: boolean supportsNativeTools?: (modelId: string, modelConfig: ModelConfig) => boolean shouldUseImageGeneration?: (modelId: string, modelConfig: ModelConfig) => boolean + shouldUseVideoGeneration?: (modelId: string, modelConfig: ModelConfig) => boolean shouldUseTts?: (modelId: string, modelConfig: ModelConfig) => boolean } @@ -146,6 +185,63 @@ function normalizePromptValue(value: unknown): string { return '' } +function supportsPromptDerivedVideoDuration(modelId: string, duration: number): boolean { + const normalizedModelId = modelId.trim().toLowerCase() + + if (normalizedModelId.startsWith('doubao-seedance-')) { + return duration >= 4 && duration <= 15 + } + + return true +} + +function resolvePromptVideoDuration(prompt: string, modelId: string): number | undefined { + const normalizedPrompt = prompt.trim() + if (!normalizedPrompt) { + return undefined + } + + const matchedDuration = + normalizedPrompt.match(PROMPT_VIDEO_DURATION_EN_PATTERN)?.groups?.duration || + 
normalizedPrompt.match(PROMPT_VIDEO_DURATION_ZH_PATTERN)?.groups?.duration + + if (!matchedDuration) { + return undefined + } + + const parsed = Number.parseInt(matchedDuration, 10) + if (!Number.isFinite(parsed) || parsed <= 0) { + return undefined + } + + return supportsPromptDerivedVideoDuration(modelId, parsed) ? parsed : undefined +} + +function resolveVideoGenerationRequestOptions( + prompt: string, + modelId: string, + options: VideoGenerationOptions | undefined +): VideoGenerationOptions | undefined { + const normalizedOptions = normalizeVideoGenerationOptions(options) + + if ( + typeof normalizedOptions?.duration === 'number' || + (typeof normalizedOptions?.seconds === 'string' && normalizedOptions.seconds.trim().length > 0) + ) { + return normalizedOptions + } + + const promptDuration = resolvePromptVideoDuration(prompt, modelId) + if (promptDuration === undefined) { + return normalizedOptions + } + + return normalizeVideoGenerationOptions({ + ...normalizedOptions, + duration: promptDuration + }) +} + function extractImagePrompt(messages: ChatMessage[]): string { return messages .map((message) => (message.role === 'user' ? 
normalizePromptValue(message.content) : '')) @@ -153,6 +249,10 @@ function extractImagePrompt(messages: ChatMessage[]): string { .join('\n\n') } +function extractVideoPrompt(messages: ChatMessage[]): string { + return extractImagePrompt(messages) +} + function resolveSupportsNativeTools( context: AiSdkRuntimeContext, modelId: string, @@ -177,6 +277,21 @@ function shouldUseImageGenerationRuntime( return modelConfig.apiEndpoint === ApiEndpointType.Image } +function shouldUseVideoGenerationRuntime( + context: AiSdkRuntimeContext, + modelId: string, + modelConfig: ModelConfig +): boolean { + if (context.shouldUseVideoGeneration) { + return context.shouldUseVideoGeneration(modelId, modelConfig) + } + + return ( + modelConfig.apiEndpoint === ApiEndpointType.Video || + isVideoGenerationModelConfig(modelConfig, modelId) + ) +} + function shouldUseTtsRuntime( context: AiSdkRuntimeContext, modelId: string, @@ -626,6 +741,355 @@ function resolveRuntimeTemperature( } } +function normalizeOpenAICompatibleBaseUrl(baseUrl: string | undefined): string { + const normalized = (baseUrl || 'https://api.openai.com/v1').trim().replace(/\/+$/, '') + if (!normalized) { + return 'https://api.openai.com/v1' + } + + return /\/v1(?:beta\d+)?$/i.test(normalized) ? normalized : `${normalized}/v1` +} + +function normalizeVideoReferenceDataUrl(reference: VideoGenerationReference): string | undefined { + if (reference.url?.trim()) { + return reference.url.trim() + } + + if (!reference.data?.trim()) { + return undefined + } + + const normalizedData = reference.data.trim() + if (normalizedData.startsWith('data:')) { + return normalizedData + } + + const fallbackMimeType = + reference.mimeType?.trim() || + (reference.type === 'image' + ? 'image/png' + : reference.type === 'audio' + ? 
'audio/mpeg' + : 'video/mp4') + + return `data:${fallbackMimeType};base64,${normalizedData}` +} + +function buildVideoGenerationContent( + options: VideoGenerationOptions | undefined +): Array> | undefined { + if (!options) { + return undefined + } + + const content: Record[] = [] + + for (const reference of options.references ?? []) { + const url = normalizeVideoReferenceDataUrl(reference) + if (!url) { + continue + } + + if (reference.type === 'image') { + content.push({ + type: 'image_url', + image_url: { url }, + role: 'reference_image' + }) + continue + } + + if (reference.type === 'audio') { + content.push({ + type: 'audio_url', + audio_url: { url }, + role: 'reference_audio' + }) + continue + } + + content.push({ + type: 'video_url', + video_url: { url }, + role: 'reference_video' + }) + } + + return content.length > 0 ? content : undefined +} + +function buildVideoGenerationExtraBody( + options: VideoGenerationOptions | undefined +): Record | undefined { + if (!options) { + return undefined + } + + const extraBody: Record = {} + + if (typeof options.duration === 'number' && Number.isFinite(options.duration)) { + extraBody.duration = options.duration + } + if (typeof options.ratio === 'string' && options.ratio.trim()) { + extraBody.ratio = options.ratio.trim() + } + if (typeof options.resolution === 'string' && options.resolution.trim()) { + extraBody.resolution = options.resolution.trim() + } + if (typeof options.watermark === 'boolean') { + extraBody.watermark = options.watermark + } + if (typeof options.generateAudio === 'boolean') { + extraBody.generate_audio = options.generateAudio + } + + const content = buildVideoGenerationContent(options) + if (content) { + extraBody.content = content + } + + return Object.keys(extraBody).length > 0 ? 
extraBody : undefined +} + +function resolveFlatTopLevelVideoDuration( + options: VideoGenerationOptions | undefined +): number | undefined { + if (typeof options?.duration === 'number' && Number.isFinite(options.duration)) { + return Math.max(-1, Math.round(options.duration)) + } + + if (typeof options?.seconds !== 'string') { + return undefined + } + + const parsed = Number.parseInt(options.seconds.trim(), 10) + return Number.isFinite(parsed) ? Math.max(-1, parsed) : undefined +} + +function buildVideoGenerationRequestBody( + provider: LLM_PROVIDER, + modelId: string, + prompt: string, + options: VideoGenerationOptions | undefined +): VideoGenerationRequestBody { + const body: VideoGenerationRequestBody = { + model: modelId, + prompt + } + + if (options?.seconds) { + body.seconds = options.seconds + } + if (options?.size) { + body.size = options.size + } + if (options?.inputReference) { + if (typeof options.inputReference === 'string') { + body.input_reference = options.inputReference + } else { + body.input_reference = { + data: options.inputReference.data, + ...(options.inputReference.mimeType ? 
{ mime_type: options.inputReference.mimeType } : {}) + } + } + } + + const requestBodyShape = resolveOpenAICompatibleVideoRequestBodyShape({ + providerId: provider.id, + providerApiType: provider.apiType, + baseUrl: provider.baseUrl, + modelId + }) + + if (requestBodyShape === 'flat-top-level') { + const content = buildVideoGenerationContent(options) + if (content) { + body.content = content + } + if (options?.ratio) { + body.ratio = options.ratio.trim() + } + const duration = resolveFlatTopLevelVideoDuration(options) + if (duration !== undefined) { + body.duration = duration + } + if (options?.resolution) { + body.resolution = options.resolution.trim() + } + if (typeof options?.watermark === 'boolean') { + body.watermark = options.watermark + } + if (typeof options?.generateAudio === 'boolean') { + body.generate_audio = options.generateAudio + } + + return body + } + + const extraBody = buildVideoGenerationExtraBody(options) + if (extraBody) { + body.extra_body = extraBody + } + + return body +} + +function extractVideoTaskError(response: VideoGenerationTaskResponse | null | undefined): string { + const error = response?.error + if (typeof error === 'string' && error.trim()) { + return error.trim() + } + + if ( + error && + typeof error === 'object' && + typeof error.message === 'string' && + error.message.trim() + ) { + return error.message.trim() + } + + return 'Video generation failed' +} + +function resolveVideoTaskStatus(response: VideoGenerationTaskResponse | null | undefined): string { + return typeof response?.status === 'string' ? response.status.trim().toLowerCase() : '' +} + +function delayWithAbort(ms: number, signal: AbortSignal): Promise { + return new Promise((resolve, reject) => { + if (signal.aborted) { + reject(signal.reason instanceof Error ? signal.reason : new Error('Aborted')) + return + } + + const onAbort = () => { + clearTimeout(timeoutId) + signal.removeEventListener('abort', onAbort) + reject(signal.reason instanceof Error ? 
signal.reason : new Error('Aborted')) + } + + const timeoutId = setTimeout(() => { + signal.removeEventListener('abort', onAbort) + resolve() + }, ms) + + signal.addEventListener('abort', onAbort, { once: true }) + }) +} + +async function executeOpenAICompatibleVideoGeneration( + provider: LLM_PROVIDER, + defaultHeaders: Record, + modelId: string, + prompt: string, + modelConfig: ModelConfig, + timeout: number | undefined +): Promise<{ base64: string; mimeType: string }> { + const normalizedOptions = resolveVideoGenerationRequestOptions( + prompt, + modelId, + modelConfig.videoGeneration + ) + const baseUrl = normalizeOpenAICompatibleBaseUrl(provider.baseUrl) + const createUrl = `${baseUrl}/videos` + const body = buildVideoGenerationRequestBody(provider, modelId, prompt, normalizedOptions) + const controller = new AbortController() + const timeoutId = timeout ? setTimeout(() => controller.abort(), timeout) : undefined + const proxyUrl = proxyConfig.getProxyUrl() + const dispatcher = proxyUrl ? 
new ProxyAgent(proxyUrl) : undefined + + const fetchJson = async (url: string, init: RequestInit): Promise => { + const fetchInit: RequestInit & { dispatcher?: ProxyAgent } = { + ...init, + headers: { + ...defaultHeaders, + Authorization: `Bearer ${provider.oauthToken || provider.apiKey || ''}`, + ...(init.headers as Record | undefined) + }, + signal: controller.signal + } + if (dispatcher) fetchInit.dispatcher = dispatcher + + const response = await fetch(url, fetchInit) + if (!response.ok) { + const errorText = await response.text().catch(() => '') + throw new Error(`Video request failed (${response.status}): ${errorText}`) + } + + return (await response.json()) as T + } + + const fetchBinary = async (url: string): Promise<{ buffer: ArrayBuffer; mimeType: string }> => { + const fetchInit: RequestInit & { dispatcher?: ProxyAgent } = { + method: 'GET', + headers: { + ...defaultHeaders, + Authorization: `Bearer ${provider.oauthToken || provider.apiKey || ''}` + }, + signal: controller.signal + } + if (dispatcher) fetchInit.dispatcher = dispatcher + + const response = await fetch(url, fetchInit) + if (!response.ok) { + const errorText = await response.text().catch(() => '') + throw new Error(`Video content download failed (${response.status}): ${errorText}`) + } + + return { + buffer: await response.arrayBuffer(), + mimeType: response.headers.get('content-type')?.split(';')[0]?.trim() || 'video/mp4' + } + } + + try { + let task = await fetchJson(createUrl, { + method: 'POST', + headers: { + 'Content-Type': 'application/json' + }, + body: JSON.stringify(body) + }) + + const taskId = typeof task.id === 'string' ? 
task.id.trim() : '' + if (!taskId) { + throw new Error('Video generation response missing task id') + } + + let status = resolveVideoTaskStatus(task) + while (status !== 'completed') { + if (status === 'failed') { + throw new Error(extractVideoTaskError(task)) + } + + await delayWithAbort(VIDEO_GENERATION_POLL_INTERVAL_MS, controller.signal) + task = await fetchJson( + `${createUrl}/${encodeURIComponent(taskId)}`, + { + method: 'GET' + } + ) + status = resolveVideoTaskStatus(task) + } + + const contentUrl = + typeof task.url === 'string' && task.url.trim().length > 0 + ? task.url.trim() + : `${createUrl}/${encodeURIComponent(taskId)}/content` + const { buffer, mimeType } = await fetchBinary(contentUrl) + + return { + base64: Buffer.from(buffer).toString('base64'), + mimeType + } + } finally { + if (timeoutId !== undefined) { + clearTimeout(timeoutId) + } + } +} + async function buildPromptRuntime( context: AiSdkRuntimeContext, messages: ChatMessage[], @@ -801,6 +1265,49 @@ export async function* runAiSdkCoreStream( return } + if (shouldUseVideoGenerationRuntime(context, modelId, normalizedModelConfig)) { + const prompt = extractVideoPrompt(messages) + const normalizedVideoOptions = resolveVideoGenerationRequestOptions( + prompt, + modelId, + normalizedModelConfig.videoGeneration + ) + const requestBody = buildVideoGenerationRequestBody( + context.provider, + modelId, + prompt, + normalizedVideoOptions + ) + + await context.emitRequestTrace?.(normalizedModelConfig, { + endpoint: `${normalizeOpenAICompatibleBaseUrl(context.provider.baseUrl)}/videos`, + headers: context.buildTraceHeaders?.() ?? 
context.defaultHeaders, + body: requestBody + }) + + const { base64, mimeType } = await executeOpenAICompatibleVideoGeneration( + context.provider, + context.defaultHeaders, + modelId, + prompt, + normalizedModelConfig, + timeout + ) + + yield { + type: 'image_data', + image_data: { + data: `data:${mimeType};base64,${base64}`, + mimeType + } + } + yield { + type: 'stop', + stop_reason: 'complete' + } + return + } + if (shouldUseImageGenerationRuntime(context, modelId, normalizedModelConfig)) { const prompt = extractImagePrompt(messages) diff --git a/src/main/presenter/llmProviderPresenter/index.ts b/src/main/presenter/llmProviderPresenter/index.ts index a1c100a2c..b7bab7c04 100644 --- a/src/main/presenter/llmProviderPresenter/index.ts +++ b/src/main/presenter/llmProviderPresenter/index.ts @@ -9,6 +9,7 @@ import { KeyStatus, LLM_EMBEDDING_ATTRS, StandaloneImageGenerationResult, + StandaloneVideoGenerationResult, ModelScopeMcpSyncOptions, ModelScopeMcpSyncResult, IConfigPresenter, @@ -24,6 +25,10 @@ import { normalizeImageGenerationOptions, type ImageGenerationOptions } from '@shared/imageGenerationSettings' +import { + normalizeVideoGenerationOptions, + type VideoGenerationOptions +} from '@shared/videoGenerationSettings' import { ProviderChange, ProviderBatchUpdate } from '@shared/provider-operations' import { isProviderDbBackedProvider } from '@shared/providerDbCatalog' import { eventBus } from '@/eventbus' @@ -538,6 +543,89 @@ export class LLMProviderPresenter implements ILlmProviderPresenter { } } + async generateVideoStandalone( + providerId: string, + prompt: string, + modelId: string, + videoOptions?: VideoGenerationOptions, + options?: { signal?: AbortSignal } + ): Promise { + const normalizedPrompt = prompt.trim() + if (!normalizedPrompt) { + throw new Error('Video generation prompt is required') + } + + const signal = options?.signal + if (signal?.aborted) { + throw createAbortError() + } + + await this.executeWithRateLimit(providerId, { signal }) + + 
const provider = this.getProviderInstance(providerId) + const modelConfig = this.configPresenter.getModelConfig(modelId, providerId) + const mergedVideoOptions = normalizeVideoGenerationOptions({ + ...modelConfig.videoGeneration, + ...videoOptions + }) + const resolvedModelConfig: ModelConfig = { + ...modelConfig, + type: ModelType.VideoGeneration, + apiEndpoint: ApiEndpointType.Video, + videoGeneration: mergedVideoOptions + } + const stream = provider.coreStream( + [{ role: 'user', content: normalizedPrompt }], + modelId, + resolvedModelConfig, + modelConfig.temperature ?? 0.7, + modelConfig.maxTokens ?? 1024, + [] + ) + const videos: StandaloneVideoGenerationResult['videos'] = [] + const abort = createAbortPromise(signal, () => { + void stream.return?.(undefined as never) + }) + + const collect = async () => { + for await (const event of stream) { + if (signal?.aborted) { + throw createAbortError() + } + + if ( + event.type === 'image_data' && + event.image_data.mimeType.trim().toLowerCase().startsWith('video/') + ) { + videos.push({ + data: event.image_data.data, + mimeType: event.image_data.mimeType + }) + } + if (event.type === 'error') { + throw new Error(event.error_message) + } + } + } + + try { + await (abort.promise ? Promise.race([collect(), abort.promise]) : collect()) + } finally { + abort.cleanup() + } + + if (videos.length === 0) { + throw new Error('Video generation completed without video output') + } + + return { + providerId, + modelId, + ...(mergedVideoOptions ? 
{ options: mergedVideoOptions } : {}), + videos + } + } + // 配置相关方法 setMaxConcurrentStreams(max: number): void { this.config.maxConcurrentStreams = max diff --git a/src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts b/src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts index 0cb7ec041..37efbd9a9 100644 --- a/src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts +++ b/src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts @@ -8,6 +8,7 @@ import { type NewApiEndpointType } from '@shared/model' import { isTtsModelConfig, isTtsModelId } from '@shared/ttsSettings' +import { isVideoGenerationModelConfig } from '@shared/videoGenerationSettings' import { DEFAULT_MODEL_CONTEXT_LENGTH, DEFAULT_MODEL_MAX_TOKENS, @@ -96,6 +97,10 @@ const shouldUseOpenAIImageGenerationRoute = (modelId: string, modelConfig: Model modelConfig.apiEndpoint === ApiEndpointType.Image || modelConfig.type === ModelType.ImageGeneration +const shouldUseOpenAIVideoGenerationRoute = (modelId: string, modelConfig: ModelConfig): boolean => + modelConfig.apiEndpoint === ApiEndpointType.Video || + isVideoGenerationModelConfig(modelConfig, modelId) + const shouldUseOpenAITtsRoute = (modelId: string, modelConfig: ModelConfig): boolean => isTtsModelConfig(modelConfig) || modelConfig.apiEndpoint === ApiEndpointType.AudioSpeech || @@ -351,6 +356,27 @@ export class AiSdkProvider extends BaseLLMProvider { endpointType: 'image-generation' } } + case 'video-generation': + return { + providerKind: 'openai-compatible', + endpointType, + providerPatch: { + apiType: 'openai-completions', + baseUrl: `${host}/v1`, + capabilityProviderId: resolveProviderCapabilityProviderId( + this.provider.id, + { + endpointType + }, + modelId + ) + }, + modelConfigPatch: { + apiEndpoint: ApiEndpointType.Video, + type: ModelType.VideoGeneration, + endpointType: 'video-generation' + } + } case 'openai': default: return { @@ -575,6 +601,17 @@ export class AiSdkProvider extends 
BaseLLMProvider { isOpenAIImageGenerationModel(runtimeModelId) || runtimeModelConfig.apiEndpoint === ApiEndpointType.Image + const shouldUseVideoGeneration = + this.isAzureOpenAI(decision, runtimeProvider) || + decision.providerKind === 'gemini' || + decision.providerKind === 'vertex' || + decision.providerKind === 'anthropic' + ? undefined + : decision.endpointType === 'video-generation' + ? () => true + : (runtimeModelId: string, runtimeModelConfig: ModelConfig) => + shouldUseOpenAIVideoGenerationRoute(runtimeModelId, runtimeModelConfig) + // TTS route: only applicable for OpenAI-compatible providers (not Azure, Gemini, Vertex) const shouldUseTts = this.isAzureOpenAI(decision, runtimeProvider) || @@ -602,6 +639,7 @@ export class AiSdkProvider extends BaseLLMProvider { supportsNativeTools: (_runtimeModelId, runtimeModelConfig) => runtimeModelConfig.functionCall === true, shouldUseImageGeneration, + shouldUseVideoGeneration, shouldUseTts } } @@ -1673,17 +1711,22 @@ export class AiSdkProvider extends BaseLLMProvider { normalizedRawType === 'image' || supportedEndpointTypes.includes('image-generation') ? ModelType.ImageGeneration - : normalizedRawType === 'tts' || - normalizedRawType === 'audio-speech' || - normalizedRawType === 'audiospeech' - ? ModelType.TTS - : normalizedRawType === 'embedding' || - normalizedRawType === 'embeddings' || - normalizedModelId.includes('embedding') - ? ModelType.Embedding - : normalizedRawType === 'rerank' || normalizedModelId.includes('rerank') - ? ModelType.Rerank - : undefined + : normalizedRawType === 'videogeneration' || + normalizedRawType === 'video-generation' || + normalizedRawType === 'video' || + supportedEndpointTypes.includes('video-generation') + ? ModelType.VideoGeneration + : normalizedRawType === 'tts' || + normalizedRawType === 'audio-speech' || + normalizedRawType === 'audiospeech' + ? 
ModelType.TTS + : normalizedRawType === 'embedding' || + normalizedRawType === 'embeddings' || + normalizedModelId.includes('embedding') + ? ModelType.Embedding + : normalizedRawType === 'rerank' || normalizedModelId.includes('rerank') + ? ModelType.Rerank + : undefined const contextLengthCandidate = [ rawModel.context_length, @@ -1708,7 +1751,9 @@ export class AiSdkProvider extends BaseLLMProvider { supportedEndpointTypes.length === 0 ? type === ModelType.ImageGeneration ? 'image-generation' - : undefined + : type === ModelType.VideoGeneration + ? 'video-generation' + : undefined : resolveNewApiEndpointTypeFromRoute( { supportedEndpointTypes, diff --git a/src/main/presenter/sqlitePresenter/tables/deepchatSessions.ts b/src/main/presenter/sqlitePresenter/tables/deepchatSessions.ts index 779dfa71f..7993f9742 100644 --- a/src/main/presenter/sqlitePresenter/tables/deepchatSessions.ts +++ b/src/main/presenter/sqlitePresenter/tables/deepchatSessions.ts @@ -12,6 +12,10 @@ import { normalizeImageGenerationOptions, type ImageGenerationOptions } from '@shared/imageGenerationSettings' +import { + normalizeVideoGenerationOptions, + type VideoGenerationOptions +} from '@shared/videoGenerationSettings' type DeepChatSessionGenerationSettings = Pick< SessionGenerationSettings, @@ -26,6 +30,7 @@ type DeepChatSessionGenerationSettings = Pick< | 'verbosity' | 'forceInterleavedThinkingCompat' | 'imageGeneration' + | 'videoGeneration' > export interface DeepChatSessionRow { @@ -44,6 +49,7 @@ export interface DeepChatSessionRow { verbosity: 'low' | 'medium' | 'high' | null force_interleaved_thinking_compat: number | null image_generation_options_json: string | null + video_generation_options_json: string | null summary_text: string | null summary_cursor_order_seq: number | null summary_updated_at: number | null @@ -109,6 +115,10 @@ export class DeepChatSessionsTable extends BaseTable { columns.push('image_generation_options_json TEXT') } + if (version >= 28) { + 
columns.push('video_generation_options_json TEXT') + } + if (version >= 14) { columns.push( 'summary_text TEXT', @@ -187,6 +197,11 @@ export class DeepChatSessionsTable extends BaseTable { 'ALTER TABLE deepchat_sessions ADD COLUMN image_generation_options_json TEXT;' ) } + if (!this.hasColumn('video_generation_options_json')) { + statements.push( + 'ALTER TABLE deepchat_sessions ADD COLUMN video_generation_options_json TEXT;' + ) + } return statements } @@ -230,11 +245,14 @@ export class DeepChatSessionsTable extends BaseTable { if (version === 27) { return 'ALTER TABLE deepchat_sessions ADD COLUMN image_generation_options_json TEXT;' } + if (version === 28) { + return 'ALTER TABLE deepchat_sessions ADD COLUMN video_generation_options_json TEXT;' + } return null } getLatestVersion(): number { - return 27 + return 28 } private serializeImageGenerationOptions( @@ -257,6 +275,26 @@ export class DeepChatSessionsTable extends BaseTable { } } + private serializeVideoGenerationOptions( + value: VideoGenerationOptions | undefined + ): string | null { + const normalized = normalizeVideoGenerationOptions(value) + return normalized ? JSON.stringify(normalized) : null + } + + private parseVideoGenerationOptions(value: string | null): VideoGenerationOptions | undefined { + if (!value) { + return undefined + } + + try { + const parsed = JSON.parse(value) as VideoGenerationOptions + return normalizeVideoGenerationOptions(parsed) + } catch { + return undefined + } + } + create( id: string, providerId: string, @@ -282,11 +320,12 @@ export class DeepChatSessionsTable extends BaseTable { verbosity, force_interleaved_thinking_compat, image_generation_options_json, + video_generation_options_json, summary_text, summary_cursor_order_seq, summary_updated_at ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` ) .run( id, @@ -308,6 +347,7 @@ export class DeepChatSessionsTable extends BaseTable { ? 
1 : 0, this.serializeImageGenerationOptions(generationSettings?.imageGeneration), + this.serializeVideoGenerationOptions(generationSettings?.videoGeneration), null, 1, null @@ -362,6 +402,10 @@ export class DeepChatSessionsTable extends BaseTable { if (imageGeneration) { settings.imageGeneration = imageGeneration } + const videoGeneration = this.parseVideoGenerationOptions(row.video_generation_options_json) + if (videoGeneration) { + settings.videoGeneration = videoGeneration + } return settings } @@ -430,6 +474,10 @@ export class DeepChatSessionsTable extends BaseTable { updates.push('image_generation_options_json = ?') params.push(this.serializeImageGenerationOptions(settings.imageGeneration)) } + if (Object.prototype.hasOwnProperty.call(settings, 'videoGeneration')) { + updates.push('video_generation_options_json = ?') + params.push(this.serializeVideoGenerationOptions(settings.videoGeneration)) + } if (updates.length === 0) { return diff --git a/src/renderer/settings/components/ProviderModelList.vue b/src/renderer/settings/components/ProviderModelList.vue index d4b45ff76..3aa5f83b2 100644 --- a/src/renderer/settings/components/ProviderModelList.vue +++ b/src/renderer/settings/components/ProviderModelList.vue @@ -376,7 +376,9 @@ const TYPE_ORDER: ModelType[] = [ ModelType.Chat, ModelType.Embedding, ModelType.Rerank, - ModelType.ImageGeneration + ModelType.ImageGeneration, + ModelType.VideoGeneration, + ModelType.TTS ] const CAPABILITY_ICONS: Record = { @@ -391,6 +393,7 @@ const TYPE_ICONS: Record = { [ModelType.Embedding]: 'lucide:database', [ModelType.Rerank]: 'lucide:arrow-up-wide-narrow', [ModelType.ImageGeneration]: 'lucide:image', + [ModelType.VideoGeneration]: 'lucide:clapperboard', [ModelType.TTS]: 'lucide:volume-2' } diff --git a/src/renderer/src/components/ChatConfig.vue b/src/renderer/src/components/ChatConfig.vue index 6b3e151db..044aaf584 100644 --- a/src/renderer/src/components/ChatConfig.vue +++ b/src/renderer/src/components/ChatConfig.vue @@ -40,7 
+40,7 @@ const props = defineProps<{ providerId?: string reasoningEffort?: ReasoningEffort verbosity?: Verbosity - modelType?: 'chat' | 'imageGeneration' | 'embedding' | 'rerank' + modelType?: 'chat' | 'imageGeneration' | 'videoGeneration' | 'tts' | 'embedding' | 'rerank' }>() const systemPrompt = defineModel('systemPrompt') @@ -129,7 +129,7 @@ const { sliderFields, inputFields, selectFields } = useChatConfigFields({ watch( () => props.modelType, (newType) => { - if (newType === 'imageGeneration' && systemPrompt.value) { + if ((newType === 'imageGeneration' || newType === 'videoGeneration') && systemPrompt.value) { systemPrompt.value = '' } } @@ -140,6 +140,8 @@ const modelTypeIcon = computed(() => { const icons = { chat: 'lucide:message-circle', imageGeneration: 'lucide:image', + videoGeneration: 'lucide:clapperboard', + tts: 'lucide:volume-2', embedding: 'lucide:layers', rerank: 'lucide:arrow-up-down' } @@ -157,7 +159,13 @@ const modelTypeIcon = computed(() => {
-
+
diff --git a/src/renderer/src/components/chat/ChatStatusBar.vue b/src/renderer/src/components/chat/ChatStatusBar.vue index 8d7612cc0..7d479c971 100644 --- a/src/renderer/src/components/chat/ChatStatusBar.vue +++ b/src/renderer/src/components/chat/ChatStatusBar.vue @@ -264,7 +264,7 @@
-
+
@@ -402,7 +402,7 @@

-
+
@@ -541,8 +541,15 @@ @update:model-value="onImageGenerationSettingsUpdate" /> + +
@@ -715,7 +722,7 @@

-
+