From 3efa972f71eaf8b05d1ed7a94506c1f8adab28a5 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 16 Mar 2026 16:09:16 +0100 Subject: [PATCH 1/4] Support kokoro model --- Dockerfile.redhat | 17 +- Dockerfile.ubuntu | 17 +- Makefile | 11 +- WORKSPACE | 4 + common_settings.bzl | 3 +- create_package.sh | 10 ++ demos/common/export_models/export_model.py | 61 +++++-- demos/common/export_models/requirements.txt | 7 +- distro.bzl | 26 +++ setupvars.bat | 2 + setupvars.ps1 | 4 + src/BUILD | 6 + src/audio/audio_utils.cpp | 33 +++- src/audio/audio_utils.hpp | 1 + src/audio/speech_to_text/s2t_servable.cpp | 1 - src/audio/text_to_speech/BUILD | 1 + src/audio/text_to_speech/t2s_calculator.cc | 81 +++++++-- src/audio/text_to_speech/t2s_calculator.proto | 2 + src/audio/text_to_speech/t2s_servable.cpp | 35 +++- src/audio/text_to_speech/t2s_servable.hpp | 10 +- src/capi_frontend/server_settings.hpp | 3 +- src/graph_export/graph_export.cpp | 40 ++++- src/graph_export/t2s_graph_cli_parser.cpp | 21 ++- src/pull_module/optimum_export.cpp | 6 +- src/test/graph_export_test.cpp | 50 +++++- src/test/pull_hf_model_test.cpp | 14 ++ third_party/BUILD | 26 +++ third_party/espeak_ng/BUILD | 26 +++ third_party/espeak_ng/espeak_ng.bzl | 164 ++++++++++++++++++ .../espeak_ng/out_of_source_phsource.patch | 28 +++ windows_build.bat | 10 +- windows_create_package.bat | 9 + 32 files changed, 677 insertions(+), 52 deletions(-) create mode 100644 third_party/espeak_ng/BUILD create mode 100644 third_party/espeak_ng/espeak_ng.bzl create mode 100644 third_party/espeak_ng/out_of_source_phsource.patch diff --git a/Dockerfile.redhat b/Dockerfile.redhat index 23e63cde54..4e55964194 100644 --- a/Dockerfile.redhat +++ b/Dockerfile.redhat @@ -109,6 +109,7 @@ SHELL ["/bin/bash", "-xo", "pipefail", "-c"] ARG JOBS=40 ARG VERBOSE_LOGS=OFF ARG LTO_ENABLE=OFF +ARG ESPEAK=1 # hadolint ignore=DL3041 RUN dnf install -y -d6 \ @@ -129,6 +130,8 @@ RUN dnf install -y -d6 \ python3.12-pip \ libicu-devel && \ dnf clean all +# Note: espeak-ng is no longer installed from dnf; it is built from source +# by Bazel (see third_party/espeak_ng) and bundled in the release. WORKDIR / @@ -234,11 +237,11 @@ RUN git clone https://github.com/$ov_tokenizers_org/openvino_tokenizers.git /ope fi WORKDIR /openvino_genai/ -ARG ov_genai_branch=master -ARG ov_genai_org=openvinotoolkit +ARG ov_genai_branch=kokoro_tts +ARG ov_genai_repo=https://github.com/RyanMetcalfeInt8/openvino.genai.git # hadolint ignore=DL3003 RUN if [ "$ov_use_binary" == "0" ]; then true ; else exit 0 ; fi ; \ - git clone https://github.com/$ov_genai_org/openvino.genai /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \ + git clone $ov_genai_repo /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \ cmake -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DCMAKE_CXX_FLAGS=" ${SDL_OPS} ${LTO_CXX_FLAGS} " -DCMAKE_SHARED_LINKER_FLAGS="${LTO_LD_FLAGS}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_SYSTEM_ICU="True" -DBUILD_TOKENIZERS=OFF -DENABLE_SAMPLES=OFF -DENABLE_TOOLS=OFF -DENABLE_TESTS=OFF -DENABLE_XGRAMMAR=ON -S ./ -B ./build/ && \ cmake --build ./build/ --parallel $JOBS && cp -vP /openvino_genai/build/openvino_genai/lib*.so* /opt/intel/openvino/runtime/lib/intel64/ && \ cp -r /openvino_genai/src/cpp/include/* /opt/intel/openvino/runtime/include/ && \ @@ -393,6 +396,7 @@ LABEL "maintainer"="dariusz.trawinski@intel.com" ARG INSTALL_RPMS_FROM_URL= ARG INSTALL_DRIVER_VERSION="24.52.32224" ARG GPU=0 +ARG ESPEAK=1 ARG debug_bazel_flags= LABEL bazel-build-flags=${debug_bazel_flags} LABEL supported-devices="CPU=1 GPU=${GPU}" @@ -407,6 +411,8 @@ COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh # hadolint ignore=DL3003,DL3041,SC2164,SC1091 RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_downloads=8\nretries=50" >> /etc/dnf/dnf.conf ; else export DNF_TOOL=microdnf ; fi ; \ $DNF_TOOL upgrade --setopt=install_weak_deps=0 --nodocs -y ; \ + # espeak-ng shared library + data are shipped inside /ovms by the Bazel + # build when $ESPEAK == "1"; no system package is required at runtime. if [ "$GPU" == "1" ] ; then \ source /install_gpu_drivers.sh && rm -rf /install_gpu_drivers.sh; \ fi ; \ @@ -422,6 +428,11 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do ENV LD_LIBRARY_PATH=/ovms/lib ENV PATH="$PATH:/ovms/bin" +# When espeak-ng is built into the OVMS release, the libespeak-ng.so loaded +# at runtime needs to locate its data tables. They are shipped under +# /ovms/share/espeak-ng-data; point espeak-ng at that location explicitly so +# we do not depend on any system /usr/share/espeak-ng-data. +ENV ESPEAK_DATA_PATH=/ovms/share/espeak-ng-data COPY --from=pkg /ovms_release /ovms COPY --from=pkg /usr/local/lib/python3.12/site-packages/jinja2 /ovms/python_deps/jinja2 diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index f7e57e380c..5aa99d4d21 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -95,6 +95,7 @@ ENV DEBIAN_FRONTEND=noninteractive SHELL ["/bin/bash", "-xo", "pipefail", "-c"] ARG debug_bazel_flags="--strip=always --config=mp_on_py_on --//:distro=ubuntu" +ARG ESPEAK=1 RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \ apt-get install -y software-properties-common --no-install-recommends; add-apt-repository 'ppa:deadsnakes/ppa' -y && \ apt-get clean && rm -rf /var/lib/apt/lists/* ; fi @@ -124,6 +125,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ vim && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Note: espeak-ng is no longer installed from apt here; it is built from +# source by Bazel (see third_party/espeak_ng) and bundled in the release. # on ubuntu 24.04 python3.12 is used as default python for ovms build and release # TF build needs python3.10 with numpy as it does not support python3.12 RUN python3.10 -m pip install "numpy<2.0.0" --no-cache-dir @@ -220,12 +223,12 @@ RUN if [ "$ov_use_binary" == "0" ]; then true ; else exit 0 ; fi ; \ if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then \ cp build/python/* /opt/intel/openvino/python/openvino_tokenizers/ ; \ fi -ARG ov_genai_branch=master -ARG ov_genai_org=openvinotoolkit +ARG ov_genai_branch=kokoro_tts +ARG ov_genai_repo=https://github.com/RyanMetcalfeInt8/openvino.genai.git WORKDIR /openvino_genai/ # hadolint ignore=DL3003 RUN if [ "$ov_use_binary" == "0" ]; then \ - git clone https://github.com/$ov_genai_org/openvino.genai /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \ + git clone $ov_genai_repo /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \ cmake -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DCMAKE_CXX_FLAGS=" ${SDL_OPS} " -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_SYSTEM_ICU="True" -DBUILD_TOKENIZERS=OFF -DENABLE_SAMPLES=OFF -DENABLE_TOOLS=OFF -DENABLE_TESTS=OFF -DENABLE_XGRAMMAR=ON -S ./ -B ./build/ && \ cmake --build ./build/ --parallel $JOBS && cp /openvino_genai/build/openvino_genai/lib*.so* /opt/intel/openvino/runtime/lib/intel64/ && \ cp -r /openvino_genai/src/cpp/include/* /opt/intel/openvino/runtime/include/ && \ @@ -395,6 +398,7 @@ ARG INSTALL_RPMS_FROM_URL= ARG INSTALL_DRIVER_VERSION="24.26.30049" ARG GPU=0 ARG NPU=0 +ARG ESPEAK=1 ENV DEBIAN_FRONTEND=noninteractive ARG debug_bazel_flags= LABEL bazel-build-flags=${debug_bazel_flags} @@ -413,6 +417,8 @@ COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh # hadolint ignore=DL3003,SC2164 RUN apt-get update ; \ apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \ + # espeak-ng shared library + data are shipped inside /ovms by the Bazel + # build when $ESPEAK == "1"; no system package is required at runtime. if [ "$GPU" == "1" ] ; then \ /tmp/install_gpu_drivers.sh ; \ fi ; \ @@ -451,6 +457,11 @@ RUN if ! [[ $debug_bazel_flags == *"py_off"* ]]; then true ; else exit 0 ; fi ; ENV LD_LIBRARY_PATH=/ovms/lib ENV PATH="$PATH:/ovms/bin" +# When espeak-ng is built into the OVMS release, the libespeak-ng.so loaded +# at runtime needs to locate its data tables. They are shipped under +# /ovms/share/espeak-ng-data; point espeak-ng at that location explicitly so +# we do not depend on any system /usr/share/espeak-ng-data. +ENV ESPEAK_DATA_PATH=/ovms/share/espeak-ng-data RUN echo "The source code of added GPL components is stored in https://storage.openvinotoolkit.org/repositories/openvino/ci_dependencies/container_gpl_sources/" > /ovms/thirdparty-licenses/GPL.txt USER ovms diff --git a/Makefile b/Makefile index 6d7c5d2918..aa5bf1ef65 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,7 @@ BUILD_TESTS ?= 0 RUN_GPU_TESTS ?= GPU ?= 0 NPU ?= 0 +ESPEAK ?= 1 BUILD_NGINX ?= 0 MEDIAPIPE_DISABLE ?= 0 PYTHON_DISABLE ?= 0 @@ -146,8 +147,13 @@ else ifeq ($(findstring redhat,$(BASE_OS)),redhat) else $(error BASE_OS must be either ubuntu or redhat) endif -CAPI_FLAGS = "--strip=$(STRIP)"$(BAZEL_DEBUG_BUILD_FLAGS)" --config=mp_off_py_off"$(OV_TRACING_PARAMS)$(TARGET_DISTRO_PARAMS) -BAZEL_DEBUG_FLAGS="--strip=$(STRIP)"$(BAZEL_DEBUG_BUILD_FLAGS)$(DISABLE_PARAMS)$(FUZZER_BUILD_PARAMS)$(OV_TRACING_PARAMS)$(TARGET_DISTRO_PARAMS)$(REPO_ENV) +ifeq ($(ESPEAK),1) + ESPEAK_PARAMS = " --//:espeak=on" +else + ESPEAK_PARAMS = " --//:espeak=off" +endif +CAPI_FLAGS = "--strip=$(STRIP)"$(BAZEL_DEBUG_BUILD_FLAGS)" --config=mp_off_py_off"$(OV_TRACING_PARAMS)$(TARGET_DISTRO_PARAMS)$(ESPEAK_PARAMS) +BAZEL_DEBUG_FLAGS="--strip=$(STRIP)"$(BAZEL_DEBUG_BUILD_FLAGS)$(DISABLE_PARAMS)$(FUZZER_BUILD_PARAMS)$(OV_TRACING_PARAMS)$(TARGET_DISTRO_PARAMS)$(ESPEAK_PARAMS)$(REPO_ENV) # Option to Override release image. # Release image OS *must have* glibc version >= glibc version on BASE_OS: @@ -237,6 +243,7 @@ BUILD_ARGS = --build-arg http_proxy=$(HTTP_PROXY)\ --build-arg BASE_OS=$(BASE_OS)\ --build-arg INSTALL_RPMS_FROM_URL=$(INSTALL_RPMS_FROM_URL)\ --build-arg INSTALL_DRIVER_VERSION=$(INSTALL_DRIVER_VERSION)\ + --build-arg ESPEAK=$(ESPEAK)\ --build-arg RELEASE_BASE_IMAGE=$(BASE_IMAGE_RELEASE)\ --build-arg JOBS=$(JOBS)\ --build-arg CAPI_FLAGS=$(CAPI_FLAGS)\ diff --git a/WORKSPACE b/WORKSPACE index bcd1dd7b00..be143c6924 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -497,6 +497,10 @@ libgit2_engine() load("@ovms//third_party/drogon:drogon.bzl", "drogon_cpp") drogon_cpp() +### espeak-ng (built from source via Bazel; gated by --//:espeak flag) +load("@ovms//third_party/espeak_ng:espeak_ng.bzl", "espeak_ng") +espeak_ng() + # Azure Storage SDK new_local_repository( name = "azure", diff --git a/common_settings.bzl b/common_settings.bzl index 3faad26bb5..e4b28a7b44 100644 --- a/common_settings.bzl +++ b/common_settings.bzl @@ -20,7 +20,7 @@ load("@bazel_skylib//lib:selects.bzl", "selects") load("@mediapipe//mediapipe/framework:more_selects.bzl", "more_selects") load("@bazel_skylib//rules:common_settings.bzl", "string_flag") -load("//:distro.bzl", "distro_flag") +load("//:distro.bzl", "distro_flag", "espeak_flag") # cc_library rule wrapper that will accept the same arguments but if user will not provide # copts, linkopts, local_defines it will set them to the defaults @@ -52,6 +52,7 @@ def ovms_cc_library(**kwargs): def create_config_settings(): distro_flag() + espeak_flag() native.config_setting( name = "disable_mediapipe", define_values = { diff --git a/create_package.sh b/create_package.sh index 9c2142b1eb..c2efb67d8b 100755 --- a/create_package.sh +++ b/create_package.sh @@ -27,6 +27,16 @@ mkdir -vp /ovms_release/lib/custom_nodes if [ "$ov_use_binary" == "0" ] ; then cp -v /openvino_tokenizers/build/src/libopenvino_tokenizers.so /ovms_release/lib/ ; fi find /ovms/bazel-out/k8-*/bin -iname '*.so*' ! -type d ! -name "libgtest.so" ! -name "*params" ! -name "*.hana.*" ! -name "py_generate_pipeline.cpython*" ! -name "lib_node_*" ! -path "*test_python_binding*" ! -name "*libpython*" -exec cp -v {} /ovms_release/lib/ \; + +# Bundle espeak-ng data files when espeak was enabled in the Bazel build. +# rules_foreign_cc places the cmake install tree under copy_/espeak-ng/ +# inside bazel-out. Both the shared library (picked up by the find above) +# and the espeak-ng-data directory are required at runtime. +ESPEAK_DATA_SRC=$(find /ovms/bazel-out/k8-*/bin/external/espeak_ng -type d -name 'espeak-ng-data' 2>/dev/null | head -n 1 || true) +if [ -n "$ESPEAK_DATA_SRC" ] && [ -d "$ESPEAK_DATA_SRC" ] ; then + mkdir -p /ovms_release/share + cp -rL "$ESPEAK_DATA_SRC" /ovms_release/share/ ; +fi if [ "$FUZZER_BUILD" == "0" ]; then mv /ovms_release/lib/libcustom_node* /ovms_release/lib/custom_nodes/; fi; cd /ovms_release/lib/ ; rm -f libcurl.so* cd /ovms_release/lib/ ; rm -f libazurestorage.so.* ; ln -s libazurestorage.so libazurestorage.so.7 ;ln -s libazurestorage.so libazurestorage.so.7.5 diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 584f9a4d64..255a78c4d2 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -84,9 +84,11 @@ def add_common_arguments(parser): parser_text2speech = subparsers.add_parser('text2speech', help='export model for text2speech endpoint') add_common_arguments(parser_text2speech) parser_text2speech.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams') -parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for text2speech. For example microsoft/speecht5_hifigan', dest='vocoder') -parser_text2speech.add_argument('--speaker_name', type=str, help='Name of the speaker', dest='speaker_name') -parser_text2speech.add_argument('--speaker_path', type=str, help='Path to the speaker.bin file.', dest='speaker_path') +parser_text2speech.add_argument('--model_type', default='speecht5', choices=['speecht5', 'kokoro'], help='Type of the source TTS model. speecht5 uses optimum-cli; kokoro uses a dedicated PyTorch->OpenVINO conversion path.', dest='model_type') +parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for speecht5. For example microsoft/speecht5_hifigan. Ignored for kokoro.', dest='vocoder') +parser_text2speech.add_argument('--speaker_name', type=str, help='Name of the speaker (speecht5 only; for kokoro all voices from the HF repo are exported).', dest='speaker_name') +parser_text2speech.add_argument('--speaker_path', type=str, help='Path to the speaker.bin file (speecht5 only; for kokoro all voices from the HF repo are exported).', dest='speaker_path') +parser_text2speech.add_argument('--language', type=str, default=None, help='Default language code passed to the calculator (e.g. en-us, zh). Used mainly by kokoro.', dest='language') parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint') @@ -109,13 +111,19 @@ def add_common_arguments(parser): models_path: "{{model_path}}", plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }', target_device: "{{target_device|default("CPU", true)}}", - {%- if speaker_name and speaker_path %} + {%- if language %} + language: "{{language}}", + {%- endif %} + {%- if voices %} voices: [ + {%- for v in voices %} { - name: "{{speaker_name}}", - path: "{{speaker_path}}" - } - ]{% endif %} + name: "{{v.name}}", + path: "{{v.path}}" + }{% if not loop.last %},{% endif %} + {%- endfor %} + ] + {%- endif %} } } } @@ -476,13 +484,42 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name, print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt'))) add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path))) +def _list_kokoro_voices(destination_path): + """optimum-cli's Kokoro exporter writes per-voice speaker embeddings to + /voices/.bin. Return the sorted list of voice names.""" + voices_dir = os.path.join(destination_path, "voices") + if not os.path.isdir(voices_dir): + print("Warning: no voices/ directory found under", destination_path) + return [] + return sorted(Path(p).stem for p in Path(voices_dir).glob("*.bin")) + def export_text2speech_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path): destination_path = os.path.join(model_repository_path, model_name) print("Exporting text2speech model to ",destination_path) - if not os.path.isdir(destination_path) or args['overwrite_models']: - optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path) - if os.system(optimum_command): - raise ValueError("Failed to export text2speech model", source_model) + model_type = task_parameters.get('model_type', 'speecht5') + + if model_type == 'kokoro': + # optimum-intel registers Kokoro under library_name=kokoro / task=text-to-audio. + # The kokoro exporter also dumps each speaker embedding to voices/.bin. + if not os.path.isfile(os.path.join(destination_path, 'openvino_model.xml')) or args['overwrite_models']: + optimum_command = "optimum-cli export openvino --model {} --task text-to-audio --weight-format {} {} --trust-remote-code {}".format( + source_model, precision, task_parameters['extra_quantization_params'], destination_path) + print('Running command:', optimum_command) + if os.system(optimum_command): + raise ValueError("Failed to export kokoro model", source_model) + voice_names = _list_kokoro_voices(destination_path) + # Render the graph with every available voice (path is relative to graph.pbtxt). + task_parameters['voices'] = [{'name': n, 'path': f'./voices/{n}.bin'} for n in voice_names] + else: + if not os.path.isdir(destination_path) or args['overwrite_models']: + if not task_parameters.get('vocoder'): + raise ValueError("--vocoder is required when --model_type=speecht5") + optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path) + if os.system(optimum_command): + raise ValueError("Failed to export text2speech model", source_model) + if task_parameters.get('speaker_name') and task_parameters.get('speaker_path'): + task_parameters['voices'] = [{'name': task_parameters['speaker_name'], 'path': task_parameters['speaker_path']}] + gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(t2s_graph_template) graph_content = gtemplate.render(model_path="./", **task_parameters) with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f: diff --git a/demos/common/export_models/requirements.txt b/demos/common/export_models/requirements.txt index 60cbf3d2fc..3828e94e7f 100644 --- a/demos/common/export_models/requirements.txt +++ b/demos/common/export_models/requirements.txt @@ -1,7 +1,7 @@ --extra-index-url "https://download.pytorch.org/whl/cpu" --extra-index-url "https://storage.openvinotoolkit.org/simple/wheels/nightly" --extra-index-url "https://storage.openvinotoolkit.org/simple/wheels/pre-release" -optimum-intel@git+https://github.com/huggingface/optimum-intel.git@d4dd21a3aa89c0671d85b704847ac06a378e761c +optimum-intel@git+https://github.com/huggingface/optimum-intel.git@7a109befdfd4f9b94351135b17505efa28698210 accelerate datasets diffusers # for image generation @@ -14,3 +14,8 @@ sentence_transformers==5.3.0 sentencepiece # Required by: transformers` torchvision requests +# Kokoro TTS export (text2speech --model_type kokoro). +# typer must stay <0.12 because kokoro -> misaki -> spacy imports `typer.main`, +# which was removed in typer 0.12. +kokoro==0.9.4 +typer<0.12 diff --git a/distro.bzl b/distro.bzl index 258fee8c47..c01ad7bb56 100644 --- a/distro.bzl +++ b/distro.bzl @@ -42,3 +42,29 @@ def distro_flag(): name = "not_ubuntu_build", negate = ":ubuntu_build", ) + +# Controls whether espeak-ng is built from source (via Bazel) and bundled +# into the OVMS release. When "off", no espeak-ng artifacts are produced +# and the runtime will not have phonemization fallback available. +def espeak_flag(): + string_flag( + name = "espeak", + values = ["on", "off"], + build_setting_default = "on", + ) + native.config_setting( + name = "espeak_on", + flag_values = { + "espeak": "on", + }, + ) + native.config_setting( + name = "espeak_off", + flag_values = { + "espeak": "off", + }, + ) + more_selects.config_setting_negation( + name = "not_espeak_on", + negate = ":espeak_on", + ) diff --git a/setupvars.bat b/setupvars.bat index 200991839b..7f99e863d7 100644 --- a/setupvars.bat +++ b/setupvars.bat @@ -17,4 +17,6 @@ set "OVMS_DIR=%~dp0" set "PYTHONHOME=%OVMS_DIR%\python" set "PATH=%OVMS_DIR%;%PYTHONHOME%;%PYTHONHOME%\Scripts;%PATH%" +:: Point bundled espeak-ng (when packaged) at its data tables. +if exist "%OVMS_DIR%\espeak-ng-data" set "ESPEAK_DATA_PATH=%OVMS_DIR%\espeak-ng-data" echo "OpenVINO Model Server Environment Initialized" diff --git a/setupvars.ps1 b/setupvars.ps1 index 549bfa1a1b..f0703d00d0 100644 --- a/setupvars.ps1 +++ b/setupvars.ps1 @@ -22,4 +22,8 @@ if (Test-Path "$env:OVMS_DIR\python") { } else { $env:PATH="$env:PATH:$env:OVMS_DIR" } +# Point bundled espeak-ng (when packaged) at its data tables. +if (Test-Path "$env:OVMS_DIR\espeak-ng-data") { + $env:ESPEAK_DATA_PATH="$env:OVMS_DIR\espeak-ng-data" +} echo "OpenVINO Model Server Environment Initialized" diff --git a/src/BUILD b/src/BUILD index 5163149311..8c6d5b4fd7 100644 --- a/src/BUILD +++ b/src/BUILD @@ -2287,6 +2287,12 @@ cc_binary( "//src/python/binding:pyovms.so", ], "//:disable_python": [] + }) + select({ + "//:espeak_on": [ + "//third_party:espeak_ng", + "//third_party:espeak_ng_data", + ], + "//:espeak_off": [], }), # linkstatic = False, # Use for dynamic linking when necessary ) diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 77b38e70df..f56d600f14 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -1,5 +1,5 @@ //***************************************************************************** -// Copyright 2025 Intel Corporation +// Copyright 2026 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include "src/logging.hpp" #include #include +#include #include #include #pragma warning(push) @@ -188,3 +189,33 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); } + +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr) { + enum : unsigned int { + OUTPUT_PREPARATION, + TIMER_END + }; + Timer timer; + timer.start(OUTPUT_PREPARATION); + + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + format.channels = 1; + format.sampleRate = 24000; // Kokoro native sample rate + format.bitsPerSample = 32; + drwav wav; + + auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr); + if (status == DRWAV_FALSE) { + throw std::runtime_error("Failed to initialize WAV writer"); + } + drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, speechSize, waveformPtr); + if (framesWritten != speechSize) { + throw std::runtime_error("Failed to write all frames"); + } + drwav_uninit(&wav); + timer.stop(OUTPUT_PREPARATION); + auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); +} diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp index cbeea8b457..0928d03f3d 100644 --- a/src/audio/audio_utils.hpp +++ b/src/audio/audio_utils.hpp @@ -25,3 +25,4 @@ bool isWavBuffer(const std::string buf); std::vector readWav(const std::string_view& wavData); std::vector readMp3(const std::string_view& mp3Data); void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr); diff --git a/src/audio/speech_to_text/s2t_servable.cpp b/src/audio/speech_to_text/s2t_servable.cpp index 5ce2c451a4..dbe9001d16 100644 --- a/src/audio/speech_to_text/s2t_servable.cpp +++ b/src/audio/speech_to_text/s2t_servable.cpp @@ -35,7 +35,6 @@ namespace ovms { namespace { constexpr size_t ISO_LANG_CODE_MAX = 3; } - SttServable::SttServable(const ::mediapipe::S2tCalculatorOptions& nodeOptions, const std::string& graphPath) { auto fsModelsPath = std::filesystem::path(nodeOptions.models_path()); if (fsModelsPath.is_relative()) { diff --git a/src/audio/text_to_speech/BUILD b/src/audio/text_to_speech/BUILD index 2a494f6e16..a3d10f9893 100644 --- a/src/audio/text_to_speech/BUILD +++ b/src/audio/text_to_speech/BUILD @@ -37,6 +37,7 @@ ovms_cc_library( srcs = ["t2s_calculator.cc", "tts_node_initializer.cpp"], deps = [ + "//third_party:genai", "@mediapipe//mediapipe/framework:calculator_framework", "//src:httppayload", "//src:libovmslogging", diff --git a/src/audio/text_to_speech/t2s_calculator.cc b/src/audio/text_to_speech/t2s_calculator.cc index f8f4912f0d..7c78d764e1 100644 --- a/src/audio/text_to_speech/t2s_calculator.cc +++ b/src/audio/text_to_speech/t2s_calculator.cc @@ -28,6 +28,8 @@ #include "src/client_connection.hpp" #include "src/http_payload.hpp" #include "src/logging.hpp" +#include "openvino/genai/speech_generation/text2speech_pipeline.hpp" +#include "openvino/openvino.hpp" #include #include @@ -63,6 +65,8 @@ static absl::Status checkClientDisconnected(const ovms::HttpPayload& payload, co class T2sCalculator : public CalculatorBase { static const std::string INPUT_TAG_NAME; static const std::string OUTPUT_TAG_NAME; + std::string defaultLanguage = "en-us"; + float defaultSpeed = 1.0f; public: static absl::Status GetContract(CalculatorContract* cc) { @@ -81,6 +85,13 @@ class T2sCalculator : public CalculatorBase { absl::Status Open(CalculatorContext* cc) final { SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "T2sCalculator [Node: {}] Open start", cc->NodeName()); + const auto& options = cc->Options(); + if (options.has_language() && !options.language().empty()) { + defaultLanguage = options.language(); + } + if (options.has_speed()) { + defaultSpeed = options.speed(); + } return absl::OkStatus(); } @@ -113,26 +124,72 @@ class T2sCalculator : public CalculatorBase { if (streamIt != payload.parsedJson->MemberEnd()) { return absl::InvalidArgumentError("streaming is not supported"); } + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "1"); std::optional voiceName; auto voiceIt = payload.parsedJson->FindMember("voice"); - if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { + if (voiceIt != payload.parsedJson->MemberEnd()) { + if (!voiceIt->value.IsString()) { + return absl::InvalidArgumentError("voice field is not a string"); + } voiceName = voiceIt->value.GetString(); - if (pipe->voices.find(voiceName.value()) == pipe->voices.end()) - return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", voiceName.value())); } - + std::string language = defaultLanguage; + auto languageIt = payload.parsedJson->FindMember("language"); + if (languageIt != payload.parsedJson->MemberEnd()) { + if (!languageIt->value.IsString()) { + return absl::InvalidArgumentError("language field is not a string"); + } + language = languageIt->value.GetString(); + } + float speed = defaultSpeed; + auto speedIt = payload.parsedJson->FindMember("speed"); + if (speedIt != payload.parsedJson->MemberEnd()) { + if (!speedIt->value.IsNumber()) { + return absl::InvalidArgumentError("speed field is not a number"); + } + speed = speedIt->value.GetFloat(); + } ov::genai::Text2SpeechDecodedResults generatedSpeech; std::unique_lock lock(pipe->ttsPipelineMutex); auto disconnectStatus = checkClientDisconnected(payload, cc->NodeName(), "before generation"); if (!disconnectStatus.ok()) return disconnectStatus; - + ov::Tensor speakerEmbedding; + std::string selectedVoice; if (voiceName.has_value()) { - generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), pipe->voices[voiceName.value()]); - } else { - generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString()); + selectedVoice = voiceName.value(); + auto speakerIt = pipe->voices.find(selectedVoice); + if (speakerIt != pipe->voices.end()) { + speakerEmbedding = speakerIt->second; + } else if (!pipe->voices.empty()) { + // Voice was explicitly requested but is unknown - fail with the list of available voices. + std::string available; + for (const auto& [name, _] : pipe->voices) { + if (!available.empty()) + available += ", "; + available += name; + } + return absl::InvalidArgumentError(absl::StrCat( + "Unknown voice \"", selectedVoice, "\". Available voices: ", available)); + } + // pipe->voices is empty (e.g. SpeechT5 deployment without speaker embeddings) - + // pass the requested name through to the pipeline with an empty embedding. + } else if (!pipe->voices.empty()) { + // No voice in the request - pick a default from the voices loaded by the servable. + auto preferredIt = pipe->voices.find("af_alloy"); + if (preferredIt != pipe->voices.end()) { + selectedVoice = preferredIt->first; + speakerEmbedding = preferredIt->second; + } else { + const auto& firstVoice = *pipe->voices.begin(); + selectedVoice = firstVoice.first; + speakerEmbedding = firstVoice.second; + } } - auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth(); + ov::AnyMap properties{{"voice", selectedVoice}, {"language", language}, {"speed", speed}}; + generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding, properties); + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "3"); + // auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth(); auto speechSize = generatedSpeech.speeches[0].get_size(); ov::Tensor cpuTensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape()); // copy results to release inference request @@ -143,7 +200,9 @@ class T2sCalculator : public CalculatorBase { return disconnectStatus; void* ppData; size_t pDataSize; - prepareAudioOutput(&ppData, pDataSize, bitsPerSample, speechSize, cpuTensor.data()); + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "4"); + prepareAudioOutputKokoro(&ppData, pDataSize, speechSize, cpuTensor.data()); + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "5"); output = std::make_unique(reinterpret_cast(ppData), pDataSize); drwav_free(ppData, NULL); } else { @@ -151,6 +210,8 @@ class T2sCalculator : public CalculatorBase { } } catch (ov::AssertFailure& e) { return absl::InvalidArgumentError(e.what()); + } catch (std::runtime_error& e) { + return absl::InvalidArgumentError(e.what()); } catch (...) { return absl::InvalidArgumentError("Response generation failed"); } diff --git a/src/audio/text_to_speech/t2s_calculator.proto b/src/audio/text_to_speech/t2s_calculator.proto index efea722c3d..5bfce7811f 100644 --- a/src/audio/text_to_speech/t2s_calculator.proto +++ b/src/audio/text_to_speech/t2s_calculator.proto @@ -40,4 +40,6 @@ message T2sCalculatorOptions { required string path = 2; } repeated SpeakerEmbeddings voices = 4; + optional string language = 5 [default = "en-us"]; + optional float speed = 6 [default = 1.0]; } diff --git a/src/audio/text_to_speech/t2s_servable.cpp b/src/audio/text_to_speech/t2s_servable.cpp index c782c9346d..2c9b1abfe3 100644 --- a/src/audio/text_to_speech/t2s_servable.cpp +++ b/src/audio/text_to_speech/t2s_servable.cpp @@ -19,8 +19,8 @@ #include #include #include +#include -#include "openvino/genai/whisper_pipeline.hpp" #include "openvino/genai/speech_generation/text2speech_pipeline.hpp" #include "src/audio/text_to_speech/t2s_calculator.pb.h" #include "src/status.hpp" @@ -31,7 +31,15 @@ namespace ovms { -static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) { +static size_t getShapeElementsCount(const ov::Shape& shape) { + size_t elementsCount = 1; + for (const auto dim : shape) { + elementsCount *= dim; + } + return elementsCount; +} + +static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path, const ov::Shape& expectedShape) { std::ifstream input(file_path, std::ios::binary); if (input.fail()) { std::stringstream ss; @@ -48,12 +56,16 @@ static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) if (buffer_size % sizeof(float) != 0) { throw std::runtime_error("File size is not a multiple of float size."); } - size_t num_floats = buffer_size / sizeof(float); - if (num_floats != 512) { - throw std::runtime_error("File must contain speaker embedding including 512 32-bit floats."); + const size_t numFloats = buffer_size / sizeof(float); + const size_t expectedElements = getShapeElementsCount(expectedShape); + if (numFloats != expectedElements) { + std::stringstream ss; + ss << "File must contain speaker embedding with " << expectedElements + << " 32-bit floats. Got: " << numFloats; + throw std::runtime_error(ss.str()); } - ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats}); + ov::Tensor floats_tensor(ov::element::f32, expectedShape); input.read(reinterpret_cast(floats_tensor.data()), buffer_size); if (input.fail()) { throw std::runtime_error("Failed to read all data from file."); @@ -76,10 +88,15 @@ TtsServable::TtsServable(const std::string& modelDir, const std::string& targetD throw std::runtime_error("Error during plugin_config option parsing"); } ttsPipeline = std::make_shared(parsedModelsPath.string(), targetDevice, config); + const ov::Shape speakerEmbeddingShape = ttsPipeline->get_speaker_embedding_shape(); for (auto voice : graphVoices) { - if (!std::filesystem::exists(voice.path())) - throw std::runtime_error{"Requested voice speaker embeddings file does not exist: " + voice.path()}; - voices[voice.name()] = read_speaker_embedding(voice.path()); + std::filesystem::path voicePath(voice.path()); + if (voicePath.is_relative()) { + voicePath = std::filesystem::path(graphPath) / voicePath; + } + if (!std::filesystem::exists(voicePath)) + throw std::runtime_error{"Requested voice speaker embeddings file does not exist: " + voicePath.string()}; + voices[voice.name()] = read_speaker_embedding(voicePath.string(), speakerEmbeddingShape); } } } // namespace ovms diff --git a/src/audio/text_to_speech/t2s_servable.hpp b/src/audio/text_to_speech/t2s_servable.hpp index 6d192edcfb..6f5249baa4 100644 --- a/src/audio/text_to_speech/t2s_servable.hpp +++ b/src/audio/text_to_speech/t2s_servable.hpp @@ -16,15 +16,21 @@ #pragma once -#include "openvino/genai/speech_generation/text2speech_pipeline.hpp" #include "src/audio/text_to_speech/t2s_calculator.pb.h" +#include #include +#include #include #include -namespace ovms { +#include "openvino/runtime/tensor.hpp" + +namespace ov::genai { +class Text2SpeechPipeline; +} +namespace ovms { class TtsServable { public: std::shared_ptr ttsPipeline; diff --git a/src/capi_frontend/server_settings.hpp b/src/capi_frontend/server_settings.hpp index 2c0a364bb9..115b27f062 100644 --- a/src/capi_frontend/server_settings.hpp +++ b/src/capi_frontend/server_settings.hpp @@ -133,7 +133,7 @@ struct EmbeddingsGraphSettingsImpl { }; struct TextToSpeechGraphSettingsImpl { - uint32_t unused = 1; // will be added + std::optional language; }; struct SpeechToTextGraphSettingsImpl { @@ -161,6 +161,7 @@ struct ExportSettings { std::string targetDevice = "CPU"; std::optional extraQuantizationParams; std::optional vocoder; + std::string modelType = "speecht5"; // text2speech only: "speecht5" or "kokoro" std::string precision = "int8"; PluginConfigSettingsImpl pluginConfig; }; diff --git a/src/graph_export/graph_export.cpp b/src/graph_export/graph_export.cpp index 64d80bc23c..d121022a1c 100644 --- a/src/graph_export/graph_export.cpp +++ b/src/graph_export/graph_export.cpp @@ -16,10 +16,12 @@ #include "graph_export.hpp" #include +#include #include #include #include #include +#include #pragma warning(push) #pragma warning(disable : 6313) @@ -315,6 +317,7 @@ static Status createTextToSpeechGraphTemplate(const std::string& directoryPath, SPDLOG_ERROR("Graph options not initialized for speech generation."); return StatusCode::INTERNAL_ERROR; } + auto& graphSettings = std::get(hfSettings.graphSettings); auto& ggufFilename = hfSettings.ggufFilename; auto& exportSettings = hfSettings.exportSettings; @@ -323,6 +326,24 @@ static Status createTextToSpeechGraphTemplate(const std::string& directoryPath, std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename); SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt")); GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings); + + // Enumerate kokoro speaker embeddings dumped by optimum-cli to /voices/*.bin. + std::vector voiceNames; + if (exportSettings.modelType == "kokoro") { + std::filesystem::path voicesDir = std::filesystem::path(directoryPath) / "voices"; + std::error_code ec; + if (std::filesystem::is_directory(voicesDir, ec)) { + for (const auto& entry : std::filesystem::directory_iterator(voicesDir, ec)) { + if (entry.is_regular_file() && entry.path().extension() == ".bin") { + voiceNames.push_back(entry.path().stem().string()); + } + } + std::sort(voiceNames.begin(), voiceNames.end()); + } else { + SPDLOG_WARN("Kokoro voices directory not found at {}", voicesDir.string()); + } + } + // clang-format off oss << R"( input_stream: "HTTP_REQUEST_PAYLOAD:input" @@ -342,7 +363,24 @@ node { )"; if (pluginConfigOpt.has_value()) { oss << R"(plugin_config: ')" << pluginConfigOpt.value() << R"(' - )"; + )"; + } + if (graphSettings.language.has_value()) { + oss << R"(language: ")" << graphSettings.language.value() << R"(" + )"; + } + if (!voiceNames.empty()) { + oss << R"(voices: [)"; + for (size_t i = 0; i < voiceNames.size(); ++i) { + oss << R"( + { name: ")" << voiceNames[i] << R"(", path: "./voices/)" << voiceNames[i] << R"(.bin" })"; + if (i + 1 < voiceNames.size()) { + oss << ","; + } + } + oss << R"( + ] + )"; } oss << R"(} } diff --git a/src/graph_export/t2s_graph_cli_parser.cpp b/src/graph_export/t2s_graph_cli_parser.cpp index 69f4479a24..c3e562729a 100644 --- a/src/graph_export/t2s_graph_cli_parser.cpp +++ b/src/graph_export/t2s_graph_cli_parser.cpp @@ -43,7 +43,16 @@ void TextToSpeechGraphCLIParser::createOptions() { ("num_streams", "The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.", cxxopts::value()->default_value("1"), - "NUM_STREAMS"); + "NUM_STREAMS") + ("model_type", + "Type of the source TTS model: speecht5 (default) or kokoro.", + cxxopts::value()->default_value("speecht5"), + "MODEL_TYPE") + ("language", + "Default language code passed to the TTS calculator (e.g. en-us, zh). Used by kokoro.", + cxxopts::value(), + "LANGUAGE"); + // clang-format on } void TextToSpeechGraphCLIParser::printHelp() { @@ -64,7 +73,7 @@ std::vector TextToSpeechGraphCLIParser::parse(const std::vector(options->parse(cStrArray.size(), args)); - return result->unmatched(); + return result->unmatched(); } void TextToSpeechGraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettings, const std::string& modelName) { @@ -82,6 +91,14 @@ void TextToSpeechGraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsIm } } else { hfSettings.exportSettings.pluginConfig.numStreams = result->operator[]("num_streams").as(); + const std::string modelType = result->operator[]("model_type").as(); + if (modelType != "speecht5" && modelType != "kokoro") { + throw std::invalid_argument("--model_type must be one of: speecht5, kokoro"); + } + hfSettings.exportSettings.modelType = modelType; + if (result->count("language")) { + textToSpeechGraphSettings.language = result->operator[]("language").as(); + } } hfSettings.graphSettings = std::move(textToSpeechGraphSettings); } diff --git a/src/pull_module/optimum_export.cpp b/src/pull_module/optimum_export.cpp index f57e2448dc..00b7d68103 100644 --- a/src/pull_module/optimum_export.cpp +++ b/src/pull_module/optimum_export.cpp @@ -67,7 +67,11 @@ std::string OptimumDownloader::getExportCmdTextToSpeech() { std::ostringstream oss; // clang-format off oss << this->OPTIMUM_CLI_EXPORT_COMMAND; - if (this->exportSettings.vocoder.has_value()){ + if (this->exportSettings.modelType == "kokoro") { + // optimum-intel registers Kokoro under library_name="kokoro" with task "text-to-audio". + // The library is auto-detected from the HF repo; --task must be specified explicitly. + oss << "--task text-to-audio "; + } else if (this->exportSettings.vocoder.has_value()) { oss << "--model-kwargs \"{\"vocoder\": \"" << this->exportSettings.vocoder.value() << "\"}\" "; } oss << "--model " << this->sourceModel << " --trust-remote-code "; diff --git a/src/test/graph_export_test.cpp b/src/test/graph_export_test.cpp index 0bbd684646..2b4483f972 100644 --- a/src/test/graph_export_test.cpp +++ b/src/test/graph_export_test.cpp @@ -13,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. //***************************************************************************** +#include +#include #include #include @@ -392,7 +394,7 @@ node { models_path: "/model1/path" target_device: "GPU" plugin_config: '{"NUM_STREAMS":"2"}' - } + } } } )"; @@ -415,6 +417,29 @@ node { } )"; +const std::string expectedTextToSpeechGraphContentsKokoro = R"( +input_stream: "HTTP_REQUEST_PAYLOAD:input" +output_stream: "HTTP_RESPONSE_PAYLOAD:output" +node { + name: "myModel" + calculator: "T2sCalculator" + input_side_packet: "TTS_NODE_RESOURCES:t2s_servable" + input_stream: "HTTP_REQUEST_PAYLOAD:input" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + node_options: { + [type.googleapis.com / mediapipe.T2sCalculatorOptions]: { + models_path: "./" + target_device: "CPU" + language: "en-us" + voices: [ + { name: "af_alloy", path: "./voices/af_alloy.bin" }, + { name: "am_adam", path: "./voices/am_adam.bin" } + ] + } + } +} +)"; + const std::string expectedSpeechToTextGraphContents = R"( input_stream: "HTTP_REQUEST_PAYLOAD:input" output_stream: "HTTP_RESPONSE_PAYLOAD:output" @@ -868,6 +893,29 @@ TEST_F(GraphCreationTest, textToSpeechPositiveDefault) { ASSERT_EQ(expectedTextToSpeechGraphContentsDefault, removeVersionString(graphContents)) << graphContents; } +TEST_F(GraphCreationTest, textToSpeechPositiveKokoro) { + // Pre-create the voices/ directory that optimum-cli would have populated for kokoro. + std::filesystem::path voicesDir = std::filesystem::path(this->directoryPath) / "voices"; + std::filesystem::create_directories(voicesDir); + { std::ofstream f(voicesDir / "af_alloy.bin"); } + { std::ofstream f(voicesDir / "am_adam.bin"); } + + ovms::HFSettingsImpl hfSettings; + hfSettings.task = ovms::TEXT_TO_SPEECH_GRAPH; + hfSettings.exportSettings.modelName = "myModel"; + hfSettings.exportSettings.modelType = "kokoro"; + ovms::TextToSpeechGraphSettingsImpl textToSpeechGraphSettings; + textToSpeechGraphSettings.language = "en-us"; + hfSettings.graphSettings = std::move(textToSpeechGraphSettings); + std::string graphPath = ovms::FileSystem::appendSlash(this->directoryPath) + "graph.pbtxt"; + std::unique_ptr graphExporter = std::make_unique(); + auto status = graphExporter->createServableConfig(this->directoryPath, hfSettings); + ASSERT_EQ(status, ovms::StatusCode::OK); + + std::string graphContents = GetFileContents(graphPath); + ASSERT_EQ(expectedTextToSpeechGraphContentsKokoro, removeVersionString(graphContents)) << graphContents; +} + TEST_F(GraphCreationTest, textToSpeechCreatedPbtxtInvalid) { ovms::HFSettingsImpl hfSettings; hfSettings.task = ovms::TEXT_TO_SPEECH_GRAPH; diff --git a/src/test/pull_hf_model_test.cpp b/src/test/pull_hf_model_test.cpp index e1ab3f9ff3..db1ff26757 100644 --- a/src/test/pull_hf_model_test.cpp +++ b/src/test/pull_hf_model_test.cpp @@ -1392,6 +1392,20 @@ TEST_F(TestOptimumDownloaderSetup, TextToSpeechExportCmd) { ASSERT_EQ(optimumDownloader->getConvertCmd(), expectedCmd2); } +TEST_F(TestOptimumDownloaderSetup, TextToSpeechKokoroExportCmd) { + inHfSettings.task = ovms::TEXT_TO_SPEECH_GRAPH; + inHfSettings.exportSettings.modelType = "kokoro"; + std::unique_ptr optimumDownloader = std::make_unique(inHfSettings); + std::string expectedCmd = "optimum-cli export openvino --task text-to-audio --model model/name --trust-remote-code --weight-format fp64 --someOptimumParam --anotherOptParam value \\path\\to\\Download\\model\\name"; + std::string expectedCmd2 = "convert_tokenizer model/name -o \\path\\to\\Download\\model\\name"; +#ifdef __linux__ + std::replace(expectedCmd.begin(), expectedCmd.end(), '\\', '/'); + std::replace(expectedCmd2.begin(), expectedCmd2.end(), '\\', '/'); +#endif + ASSERT_EQ(optimumDownloader->getExportCmd(), expectedCmd); + ASSERT_EQ(optimumDownloader->getConvertCmd(), expectedCmd2); +} + TEST_F(TestOptimumDownloaderSetup, SpeechToTextExportCmd) { inHfSettings.task = ovms::SPEECH_TO_TEXT_GRAPH; std::unique_ptr optimumDownloader = std::make_unique(inHfSettings); diff --git a/third_party/BUILD b/third_party/BUILD index dcde0bd7e4..bce804d7a1 100644 --- a/third_party/BUILD +++ b/third_party/BUILD @@ -58,4 +58,30 @@ alias( "//conditions:default": "@linux_curl//:curl", }), visibility = ["//visibility:public"], +) + +# espeak-ng built from source via Bazel (rules_foreign_cc cmake). +# Selected on/off via the //:espeak build flag. When disabled this resolves +# to an empty cc_library so dependents can unconditionally list it. +cc_library( + name = "espeak_ng_empty", + visibility = ["//visibility:public"], +) + +alias( + name = "espeak_ng", + actual = select({ + "//:espeak_on": "@espeak_ng//:espeak_ng", + "//:espeak_off": ":espeak_ng_empty", + }), + visibility = ["//visibility:public"], +) + +alias( + name = "espeak_ng_data", + actual = select({ + "//:espeak_on": "@espeak_ng//:espeak_ng_data", + "//:espeak_off": ":espeak_ng_empty", + }), + visibility = ["//visibility:public"], ) \ No newline at end of file diff --git a/third_party/espeak_ng/BUILD b/third_party/espeak_ng/BUILD new file mode 100644 index 0000000000..882deb642f --- /dev/null +++ b/third_party/espeak_ng/BUILD @@ -0,0 +1,26 @@ +# +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Marks third_party/espeak_ng as a Bazel package so espeak_ng.bzl can be +# loaded as //third_party/espeak_ng:espeak_ng.bzl from WORKSPACE. + +package(default_visibility = ["//visibility:public"]) + +exports_files([ + "espeak_ng.bzl", + "out_of_source_phsource.patch", +]) + +exports_files(["espeak_ng.bzl"]) diff --git a/third_party/espeak_ng/espeak_ng.bzl b/third_party/espeak_ng/espeak_ng.bzl new file mode 100644 index 0000000000..9b2439b3c2 --- /dev/null +++ b/third_party/espeak_ng/espeak_ng.bzl @@ -0,0 +1,164 @@ +# +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Fetches and builds espeak-ng from source using rules_foreign_cc (CMake) +# so OVMS no longer relies on a system-installed espeak-ng package. +# +# Works on both Linux (libespeak-ng.so + share/espeak-ng-data) and Windows +# (espeak-ng.dll + espeak-ng-data). The build is gated by the //:espeak +# build flag; when set to "off", the rule is still defined but no targets +# in OVMS depend on it. + +load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository") + +# Pinned to espeak-ng release tag 1.52.0 (commit hash). +_ESPEAK_NG_COMMIT = "212928b394a96e8fd2096616bfd54e17845c48f6" # 1.52.0 +_ESPEAK_NG_REMOTE = "https://github.com/espeak-ng/espeak-ng.git" + +def _is_windows(ctx): + return ctx.os.name.lower().find("windows") != -1 + +def espeak_ng(): + _espeak_ng_repository(name = "_espeak_ng") + new_git_repository( + name = "espeak_ng", + remote = _ESPEAK_NG_REMOTE, + commit = _ESPEAK_NG_COMMIT, + build_file = "@_espeak_ng//:BUILD", + init_submodules = False, + shallow_since = "1709251200 +0000", # roughly 2024-03-01, around 1.52.0 + patches = ["@ovms//third_party/espeak_ng:out_of_source_phsource.patch"], + patch_args = ["-p1"], + ) + +def _impl(repository_ctx): + http_proxy = repository_ctx.os.environ.get("http_proxy", "") + if http_proxy == "": + http_proxy = repository_ctx.os.environ.get("HTTP_PROXY", "") + https_proxy = repository_ctx.os.environ.get("https_proxy", "") + if https_proxy == "": + https_proxy = repository_ctx.os.environ.get("HTTPS_PROXY", "") + + if _is_windows(repository_ctx): + # On Windows espeak-ng builds espeak-ng.dll. + out_shared = "out_shared_libs = [\"espeak-ng.dll\"]," + out_interface = "out_interface_libs = [\"espeak-ng.lib\"]," + out_dll_dir = "out_dll_dir = \"bin\"," + out_lib_dir = "out_lib_dir = \"lib\"" + platform_cache = """ + "CMAKE_POSITION_INDEPENDENT_CODE": "ON", + "CMAKE_CXX_FLAGS": " /guard:cf /GS /DN_PATH_HOME=1024", + "CMAKE_C_FLAGS": " /DN_PATH_HOME=1024", + "WIN32": "True", + """ + jobs_param = "\"-j 8\"" + else: + # On Linux espeak-ng produces libespeak-ng.so.. + out_shared = "out_shared_libs = [\"libespeak-ng.so.1\"]," + out_interface = "" + out_dll_dir = "" + out_lib_dir = "out_lib_dir = \"lib\"" + platform_cache = """ + "CMAKE_POSITION_INDEPENDENT_CODE": "ON", + "CMAKE_C_FLAGS": " -fPIC -Wno-error -DN_PATH_HOME=1024", + """ + jobs_param = "\"-j 8\"" + + # NOTE: braces inside the format string are doubled to escape them. + build_file_content = """ +load("@rules_foreign_cc//foreign_cc:cmake.bzl", "cmake") + +visibility = ["//visibility:public"] + +filegroup( + name = "all_srcs", + srcs = glob(["**"]), + visibility = ["//visibility:public"], +) + +# Build espeak-ng as a minimal text-to-phoneme library; we do not need +# audio output, MBROLA, klatt, libsonic or the CLI executable at runtime. +cmake( + name = "espeak_ng_cmake", + build_args = [ + "--verbose", + "--", + {jobs_param} + ], + cache_entries = {{ + "CMAKE_BUILD_TYPE": "Release", + "BUILD_SHARED_LIBS": "ON", + "USE_ASYNC": "OFF", + "USE_MBROLA": "OFF", + "USE_LIBSONIC": "OFF", + "USE_LIBPCAUDIO": "OFF", + "USE_KLATT": "OFF", + "USE_SPEECHPLAYER": "OFF", + "BUILD_ESPEAK_NG_EXE": "OFF", + "BUILD_SPEAK_EXE": "OFF", + "EXTRA_cmn": "OFF", + "EXTRA_ru": "OFF", + {platform_cache} + }}, + env = {{ + "http_proxy": "{http_proxy}", + "https_proxy": "{https_proxy}", + }}, + lib_source = ":all_srcs", + out_include_dir = "include", + {out_lib_dir}, + {out_shared} + {out_interface} + {out_dll_dir} + out_data_dirs = ["share/espeak-ng-data"], + tags = ["requires-network"], + visibility = ["//visibility:public"], + lib_name = "espeak-ng", +) + +# Headers + shared library suitable for cc_library deps. +cc_library( + name = "espeak_ng", + deps = [":espeak_ng_cmake"], + visibility = ["//visibility:public"], +) + +# Exposes the espeak-ng-data directory as a filegroup so create_package.sh +# (or similar packaging) can locate it under bazel-out. +filegroup( + name = "espeak_ng_data", + srcs = [":espeak_ng_cmake"], + output_group = "gen_dir", + visibility = ["//visibility:public"], +) +""" + repository_ctx.file( + "BUILD", + build_file_content.format( + http_proxy = http_proxy, + https_proxy = https_proxy, + out_shared = out_shared, + out_interface = out_interface, + out_dll_dir = out_dll_dir, + out_lib_dir = out_lib_dir, + platform_cache = platform_cache, + jobs_param = jobs_param, + ), + ) + +_espeak_ng_repository = repository_rule( + implementation = _impl, + local = True, +) diff --git a/third_party/espeak_ng/out_of_source_phsource.patch b/third_party/espeak_ng/out_of_source_phsource.patch new file mode 100644 index 0000000000..3cc7849d8c --- /dev/null +++ b/third_party/espeak_ng/out_of_source_phsource.patch @@ -0,0 +1,28 @@ +Replace configure-time file(COPY) of phsource with a symbolic link. + +Upstream cmake/data.cmake does: + + file(COPY "${PHONEME_SRC_DIR}" DESTINATION "${DATA_DIST_ROOT}") + +CMake's file(COPY) implements a skip-if-newer optimisation that has been +observed to silently omit individual files when an incremental build runs +against a previously partially-populated DATA_DIST_ROOT (which is exactly +what happens with rules_foreign_cc on rebuilds). Replacing the copy with a +symlink eliminates the entire copy step and guarantees the build always +sees the exact source phsource tree. phsource is read-only during the +build, so a symlink is safe. + +This patch is OVMS-local; see third_party/espeak_ng/espeak_ng.bzl. + +--- a/cmake/data.cmake ++++ b/cmake/data.cmake +@@ -58,7 +58,8 @@ + file(MAKE_DIRECTORY "${DICT_TMP_DIR}") + file(COPY "${DATA_SRC_DIR}/lang" DESTINATION "${DATA_DIST_DIR}") + file(COPY "${DATA_SRC_DIR}/voices/!v" DESTINATION "${DATA_DIST_DIR}/voices") +-file(COPY "${PHONEME_SRC_DIR}" DESTINATION "${DATA_DIST_ROOT}") ++file(REMOVE_RECURSE "${DATA_DIST_ROOT}/phsource") ++file(CREATE_LINK "${PHONEME_SRC_DIR}" "${DATA_DIST_ROOT}/phsource" SYMBOLIC) + + set(ESPEAK_RUN_ENV ${CMAKE_COMMAND} -E env "ESPEAK_DATA_PATH=${DATA_DIST_ROOT}") + set(ESPEAK_RUN_CMD ${ESPEAK_RUN_ENV} $ENV{VALGRIND} "$") diff --git a/windows_build.bat b/windows_build.bat index f2331abee4..fb5b5eedd6 100644 --- a/windows_build.bat +++ b/windows_build.bat @@ -49,10 +49,18 @@ IF "%~4"=="--integrity" ( set "buildWithIntegrity=" ) +:: Allow disabling espeak-ng (built from source via Bazel) by setting +:: ESPEAK=0 before invoking this script. Defaults to on. +IF "%ESPEAK%"=="0" ( + set "espeakArg=--//:espeak=off" +) ELSE ( + set "espeakArg=--//:espeak=on" +) + set "bazelStartupCmd=--output_user_root=!BAZEL_SHORT_PATH!" set "openvino_dir=!BAZEL_SHORT_PATH!/openvino/runtime/cmake" -set "buildCommand=bazel %bazelStartupCmd% build %buildWithIntegrity% %bazelBuildArgs% --action_env OpenVINO_DIR=%openvino_dir% --jobs=%NUMBER_OF_PROCESSORS% --verbose_failures %buildTargets% 2>&1 | tee win_build.log" +set "buildCommand=bazel %bazelStartupCmd% build %buildWithIntegrity% %bazelBuildArgs% %espeakArg% --action_env OpenVINO_DIR=%openvino_dir% --jobs=%NUMBER_OF_PROCESSORS% --verbose_failures %buildTargets% 2>&1 | tee win_build.log" set "setOvmsVersionCmd=python windows_set_ovms_version.py" :: Setting PATH environment variable based on default windows node settings: Added ovms_windows specific python settings and c:/opt and removed unused Nvidia and OCL specific tools. diff --git a/windows_create_package.bat b/windows_create_package.bat index 533a2f82ee..e11de08e64 100644 --- a/windows_create_package.bat +++ b/windows_create_package.bat @@ -98,6 +98,15 @@ if exist %cd%\bazel-out\x64_windows-opt\bin\src\core_tokenizers.dll ( if !errorlevel! neq 0 exit /b !errorlevel! ) +:: Bundle espeak-ng DLL + data when it was built from source by Bazel +:: (--//:espeak=on). Picked up from the rules_foreign_cc cmake output tree. +for /f "delims=" %%D in ('dir /b /s /a:d %cd%\bazel-out\x64_windows-opt\bin\external\espeak_ng\espeak-ng.dll 2^>nul') do ( + copy /Y "%%D" dist\windows\ovms +) +for /f "delims=" %%D in ('dir /b /s /a:d %cd%\bazel-out\x64_windows-opt\bin\external\espeak_ng 2^>nul ^| findstr /e "espeak-ng-data"') do ( + xcopy "%%D" dist\windows\ovms\espeak-ng-data /E /I /H /Y +) + copy %cd%\setupvars.* dist\windows\ovms if !errorlevel! neq 0 exit /b !errorlevel! copy %cd%\install_ovms_service.bat dist\windows\ovms From 6d7e5022c3e22d570e26de6b601c24585878996c Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 20 May 2026 14:32:31 +0200 Subject: [PATCH 2/4] Add header --- third_party/espeak_ng/out_of_source_phsource.patch | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/third_party/espeak_ng/out_of_source_phsource.patch b/third_party/espeak_ng/out_of_source_phsource.patch index 3cc7849d8c..8711e1575a 100644 --- a/third_party/espeak_ng/out_of_source_phsource.patch +++ b/third_party/espeak_ng/out_of_source_phsource.patch @@ -1,3 +1,16 @@ +Copyright (c) 2026 Intel Corporation +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + Replace configure-time file(COPY) of phsource with a symbolic link. Upstream cmake/data.cmake does: From a482758290137ce31c36f5cffd7c74fb50d6ada0 Mon Sep 17 00:00:00 2001 From: mkulakow Date: Wed, 20 May 2026 17:18:15 +0200 Subject: [PATCH 3/4] fix --- src/audio/text_to_speech/t2s_servable.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/audio/text_to_speech/t2s_servable.cpp b/src/audio/text_to_speech/t2s_servable.cpp index 2c9b1abfe3..2da7263302 100644 --- a/src/audio/text_to_speech/t2s_servable.cpp +++ b/src/audio/text_to_speech/t2s_servable.cpp @@ -88,7 +88,10 @@ TtsServable::TtsServable(const std::string& modelDir, const std::string& targetD throw std::runtime_error("Error during plugin_config option parsing"); } ttsPipeline = std::make_shared(parsedModelsPath.string(), targetDevice, config); - const ov::Shape speakerEmbeddingShape = ttsPipeline->get_speaker_embedding_shape(); + // SpeechT5 speaker embeddings have a fixed shape of {1, 512}. For Kokoro backend the + // speaker embedding tensor is ignored by the pipeline, so this shape is only used to + // validate raw float32 voice files loaded for SpeechT5 deployments. + const ov::Shape speakerEmbeddingShape{1, 512}; for (auto voice : graphVoices) { std::filesystem::path voicePath(voice.path()); if (voicePath.is_relative()) { From 97f95acc2fc4952a11c84b46880eca1bd1e22322 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 21 May 2026 08:55:20 +0200 Subject: [PATCH 4/4] fix --- create_package.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/create_package.sh b/create_package.sh index c2efb67d8b..61110bb6b4 100755 --- a/create_package.sh +++ b/create_package.sh @@ -37,6 +37,17 @@ if [ -n "$ESPEAK_DATA_SRC" ] && [ -d "$ESPEAK_DATA_SRC" ] ; then mkdir -p /ovms_release/share cp -rL "$ESPEAK_DATA_SRC" /ovms_release/share/ ; fi +# The version matches the espeak-ng tag pinned +# in third_party/espeak_ng/espeak_ng.bzl (1.52.0); +# update both places together when bumping espeak-ng. +ESPEAK_REAL=libespeak-ng.so.1.52.0.1 +if [ -f "/ovms_release/lib/$ESPEAK_REAL" ]; then + cd /ovms_release/lib + rm -f libespeak-ng.so libespeak-ng.so.1 + ln -s "$ESPEAK_REAL" libespeak-ng.so.1 + ln -s "$ESPEAK_REAL" libespeak-ng.so + cd - >/dev/null +fi if [ "$FUZZER_BUILD" == "0" ]; then mv /ovms_release/lib/libcustom_node* /ovms_release/lib/custom_nodes/; fi; cd /ovms_release/lib/ ; rm -f libcurl.so* cd /ovms_release/lib/ ; rm -f libazurestorage.so.* ; ln -s libazurestorage.so libazurestorage.so.7 ;ln -s libazurestorage.so libazurestorage.so.7.5