From 3efa972f71eaf8b05d1ed7a94506c1f8adab28a5 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski <michal.kulakowski@intel.com>
Date: Mon, 16 Mar 2026 16:09:16 +0100
Subject: [PATCH 1/4] Support kokoro model

---
 Dockerfile.redhat                             |  17 +-
 Dockerfile.ubuntu                             |  17 +-
 Makefile                                      |  11 +-
 WORKSPACE                                     |   4 +
 common_settings.bzl                           |   3 +-
 create_package.sh                             |  10 ++
 demos/common/export_models/export_model.py    |  61 +++++--
 demos/common/export_models/requirements.txt   |   7 +-
 distro.bzl                                    |  26 +++
 setupvars.bat                                 |   2 +
 setupvars.ps1                                 |   4 +
 src/BUILD                                     |   6 +
 src/audio/audio_utils.cpp                     |  33 +++-
 src/audio/audio_utils.hpp                     |   1 +
 src/audio/speech_to_text/s2t_servable.cpp     |   1 -
 src/audio/text_to_speech/BUILD                |   1 +
 src/audio/text_to_speech/t2s_calculator.cc    |  81 +++++++--
 src/audio/text_to_speech/t2s_calculator.proto |   2 +
 src/audio/text_to_speech/t2s_servable.cpp     |  35 +++-
 src/audio/text_to_speech/t2s_servable.hpp     |  10 +-
 src/capi_frontend/server_settings.hpp         |   3 +-
 src/graph_export/graph_export.cpp             |  40 ++++-
 src/graph_export/t2s_graph_cli_parser.cpp     |  21 ++-
 src/pull_module/optimum_export.cpp            |   6 +-
 src/test/graph_export_test.cpp                |  50 +++++-
 src/test/pull_hf_model_test.cpp               |  14 ++
 third_party/BUILD                             |  26 +++
 third_party/espeak_ng/BUILD                   |  26 +++
 third_party/espeak_ng/espeak_ng.bzl           | 164 ++++++++++++++++++
 .../espeak_ng/out_of_source_phsource.patch    |  28 +++
 windows_build.bat                             |  10 +-
 windows_create_package.bat                    |   9 +
 32 files changed, 677 insertions(+), 52 deletions(-)
 create mode 100644 third_party/espeak_ng/BUILD
 create mode 100644 third_party/espeak_ng/espeak_ng.bzl
 create mode 100644 third_party/espeak_ng/out_of_source_phsource.patch

diff --git a/Dockerfile.redhat b/Dockerfile.redhat
index 23e63cde54..4e55964194 100644
--- a/Dockerfile.redhat
+++ b/Dockerfile.redhat
@@ -109,6 +109,7 @@ SHELL ["/bin/bash", "-xo", "pipefail", "-c"]
 ARG JOBS=40
 ARG VERBOSE_LOGS=OFF
 ARG LTO_ENABLE=OFF
+ARG ESPEAK=1
 
 # hadolint ignore=DL3041
 RUN dnf install -y -d6 \
@@ -129,6 +130,8 @@ RUN dnf install -y -d6 \
             python3.12-pip \
             libicu-devel && \
             dnf clean all
+# Note: espeak-ng is no longer installed from dnf; it is built from source
+# by Bazel (see third_party/espeak_ng) and bundled in the release.
 
 WORKDIR /
 
@@ -234,11 +237,11 @@ RUN git clone https://github.com/$ov_tokenizers_org/openvino_tokenizers.git /ope
     fi
 
 WORKDIR /openvino_genai/
-ARG ov_genai_branch=master
-ARG ov_genai_org=openvinotoolkit
+ARG ov_genai_branch=kokoro_tts
+ARG ov_genai_repo=https://github.com/RyanMetcalfeInt8/openvino.genai.git
 # hadolint ignore=DL3003
 RUN if [ "$ov_use_binary" == "0" ]; then true ; else exit 0 ; fi ; \
-    git clone https://github.com/$ov_genai_org/openvino.genai /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
+    git clone $ov_genai_repo /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
     cmake -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DCMAKE_CXX_FLAGS=" ${SDL_OPS} ${LTO_CXX_FLAGS} " -DCMAKE_SHARED_LINKER_FLAGS="${LTO_LD_FLAGS}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_SYSTEM_ICU="True" -DBUILD_TOKENIZERS=OFF -DENABLE_SAMPLES=OFF -DENABLE_TOOLS=OFF -DENABLE_TESTS=OFF -DENABLE_XGRAMMAR=ON -S ./ -B ./build/ && \
     cmake --build ./build/ --parallel $JOBS && cp -vP /openvino_genai/build/openvino_genai/lib*.so* /opt/intel/openvino/runtime/lib/intel64/ && \
     cp -r /openvino_genai/src/cpp/include/* /opt/intel/openvino/runtime/include/ && \
@@ -393,6 +396,7 @@ LABEL "maintainer"="dariusz.trawinski@intel.com"
 ARG INSTALL_RPMS_FROM_URL=
 ARG INSTALL_DRIVER_VERSION="24.52.32224"
 ARG GPU=0
+ARG ESPEAK=1
 ARG debug_bazel_flags=
 LABEL bazel-build-flags=${debug_bazel_flags}
 LABEL supported-devices="CPU=1 GPU=${GPU}"
@@ -407,6 +411,8 @@ COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh
 # hadolint ignore=DL3003,DL3041,SC2164,SC1091
 RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_downloads=8\nretries=50" >> /etc/dnf/dnf.conf ; else export DNF_TOOL=microdnf ; fi ; \
     $DNF_TOOL upgrade --setopt=install_weak_deps=0 --nodocs -y ; \
+    # espeak-ng shared library + data are shipped inside /ovms by the Bazel
+    # build when $ESPEAK == "1"; no system package is required at runtime.
     if [ "$GPU" == "1" ] ; then \
         source /install_gpu_drivers.sh && rm -rf /install_gpu_drivers.sh; \
     fi ; \
@@ -422,6 +428,11 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do
 
 ENV LD_LIBRARY_PATH=/ovms/lib
 ENV PATH="$PATH:/ovms/bin"
+# When espeak-ng is built into the OVMS release, the libespeak-ng.so loaded
+# at runtime needs to locate its data tables. They are shipped under
+# /ovms/share/espeak-ng-data; point espeak-ng at that location explicitly so
+# we do not depend on any system /usr/share/espeak-ng-data.
+ENV ESPEAK_DATA_PATH=/ovms/share/espeak-ng-data
 
 COPY --from=pkg /ovms_release /ovms
 COPY --from=pkg /usr/local/lib/python3.12/site-packages/jinja2 /ovms/python_deps/jinja2
diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu
index f7e57e380c..5aa99d4d21 100644
--- a/Dockerfile.ubuntu
+++ b/Dockerfile.ubuntu
@@ -95,6 +95,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 SHELL ["/bin/bash", "-xo", "pipefail", "-c"]
 
 ARG debug_bazel_flags="--strip=always  --config=mp_on_py_on --//:distro=ubuntu"
+ARG ESPEAK=1
 RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \
     apt-get install -y software-properties-common --no-install-recommends; add-apt-repository 'ppa:deadsnakes/ppa' -y && \
     apt-get clean && rm -rf /var/lib/apt/lists/* ; fi
@@ -124,6 +125,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
             vim && \
             apt-get clean && \
             rm -rf /var/lib/apt/lists/*
+# Note: espeak-ng is no longer installed from apt here; it is built from
+# source by Bazel (see third_party/espeak_ng) and bundled in the release.
 # on ubuntu 24.04 python3.12 is used as default python for ovms build and release
 # TF build needs python3.10 with numpy as it does not support python3.12
 RUN python3.10 -m pip install "numpy<2.0.0" --no-cache-dir
@@ -220,12 +223,12 @@ RUN if [ "$ov_use_binary" == "0" ]; then true ; else exit 0 ; fi ; \
     if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then \
     cp build/python/* /opt/intel/openvino/python/openvino_tokenizers/ ; \
     fi
-ARG ov_genai_branch=master
-ARG ov_genai_org=openvinotoolkit
+ARG ov_genai_branch=kokoro_tts
+ARG ov_genai_repo=https://github.com/RyanMetcalfeInt8/openvino.genai.git
 WORKDIR /openvino_genai/
 # hadolint ignore=DL3003
 RUN if [ "$ov_use_binary" == "0" ]; then \
-    git clone https://github.com/$ov_genai_org/openvino.genai /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
+    git clone $ov_genai_repo /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
     cmake -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DCMAKE_CXX_FLAGS=" ${SDL_OPS} " -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_SYSTEM_ICU="True" -DBUILD_TOKENIZERS=OFF -DENABLE_SAMPLES=OFF -DENABLE_TOOLS=OFF -DENABLE_TESTS=OFF -DENABLE_XGRAMMAR=ON -S ./ -B ./build/ && \
     cmake --build ./build/ --parallel $JOBS && cp /openvino_genai/build/openvino_genai/lib*.so* /opt/intel/openvino/runtime/lib/intel64/ && \
     cp -r /openvino_genai/src/cpp/include/* /opt/intel/openvino/runtime/include/ && \
@@ -395,6 +398,7 @@ ARG INSTALL_RPMS_FROM_URL=
 ARG INSTALL_DRIVER_VERSION="24.26.30049"
 ARG GPU=0
 ARG NPU=0
+ARG ESPEAK=1
 ENV DEBIAN_FRONTEND=noninteractive
 ARG debug_bazel_flags=
 LABEL bazel-build-flags=${debug_bazel_flags}
@@ -413,6 +417,8 @@ COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh
 # hadolint ignore=DL3003,SC2164
 RUN apt-get update ; \
     apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \
+    # espeak-ng shared library + data are shipped inside /ovms by the Bazel
+    # build when $ESPEAK == "1"; no system package is required at runtime.
     if [ "$GPU" == "1" ] ; then \
 	/tmp/install_gpu_drivers.sh ; \
     fi ; \
@@ -451,6 +457,11 @@ RUN if ! [[ $debug_bazel_flags == *"py_off"* ]]; then true ; else exit 0 ; fi ;
 
 ENV LD_LIBRARY_PATH=/ovms/lib
 ENV PATH="$PATH:/ovms/bin"
+# When espeak-ng is built into the OVMS release, the libespeak-ng.so loaded
+# at runtime needs to locate its data tables. They are shipped under
+# /ovms/share/espeak-ng-data; point espeak-ng at that location explicitly so
+# we do not depend on any system /usr/share/espeak-ng-data.
+ENV ESPEAK_DATA_PATH=/ovms/share/espeak-ng-data
 
 RUN echo "The source code of added GPL components is stored in https://storage.openvinotoolkit.org/repositories/openvino/ci_dependencies/container_gpl_sources/" > /ovms/thirdparty-licenses/GPL.txt
 USER ovms
diff --git a/Makefile b/Makefile
index 6d7c5d2918..aa5bf1ef65 100644
--- a/Makefile
+++ b/Makefile
@@ -61,6 +61,7 @@ BUILD_TESTS ?= 0
 RUN_GPU_TESTS ?=
 GPU ?= 0
 NPU ?= 0
+ESPEAK ?= 1
 BUILD_NGINX ?= 0
 MEDIAPIPE_DISABLE ?= 0
 PYTHON_DISABLE ?= 0
@@ -146,8 +147,13 @@ else ifeq ($(findstring redhat,$(BASE_OS)),redhat)
 else
   $(error BASE_OS must be either ubuntu or redhat)
 endif
-CAPI_FLAGS = "--strip=$(STRIP)"$(BAZEL_DEBUG_BUILD_FLAGS)"  --config=mp_off_py_off"$(OV_TRACING_PARAMS)$(TARGET_DISTRO_PARAMS)
-BAZEL_DEBUG_FLAGS="--strip=$(STRIP)"$(BAZEL_DEBUG_BUILD_FLAGS)$(DISABLE_PARAMS)$(FUZZER_BUILD_PARAMS)$(OV_TRACING_PARAMS)$(TARGET_DISTRO_PARAMS)$(REPO_ENV)
+ifeq ($(ESPEAK),1)
+  ESPEAK_PARAMS = " --//:espeak=on"
+else
+  ESPEAK_PARAMS = " --//:espeak=off"
+endif
+CAPI_FLAGS = "--strip=$(STRIP)"$(BAZEL_DEBUG_BUILD_FLAGS)"  --config=mp_off_py_off"$(OV_TRACING_PARAMS)$(TARGET_DISTRO_PARAMS)$(ESPEAK_PARAMS)
+BAZEL_DEBUG_FLAGS="--strip=$(STRIP)"$(BAZEL_DEBUG_BUILD_FLAGS)$(DISABLE_PARAMS)$(FUZZER_BUILD_PARAMS)$(OV_TRACING_PARAMS)$(TARGET_DISTRO_PARAMS)$(ESPEAK_PARAMS)$(REPO_ENV)
 
 # Option to Override release image.
 # Release image OS *must have* glibc version >= glibc version on BASE_OS:
@@ -237,6 +243,7 @@ BUILD_ARGS = --build-arg http_proxy=$(HTTP_PROXY)\
 	--build-arg BASE_OS=$(BASE_OS)\
 	--build-arg INSTALL_RPMS_FROM_URL=$(INSTALL_RPMS_FROM_URL)\
 	--build-arg INSTALL_DRIVER_VERSION=$(INSTALL_DRIVER_VERSION)\
+	--build-arg ESPEAK=$(ESPEAK)\
 	--build-arg RELEASE_BASE_IMAGE=$(BASE_IMAGE_RELEASE)\
 	--build-arg JOBS=$(JOBS)\
 	--build-arg CAPI_FLAGS=$(CAPI_FLAGS)\
diff --git a/WORKSPACE b/WORKSPACE
index bcd1dd7b00..be143c6924 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -497,6 +497,10 @@ libgit2_engine()
 load("@ovms//third_party/drogon:drogon.bzl", "drogon_cpp")
 drogon_cpp()
 
+### espeak-ng (built from source via Bazel; gated by --//:espeak flag)
+load("@ovms//third_party/espeak_ng:espeak_ng.bzl", "espeak_ng")
+espeak_ng()
+
 # Azure Storage SDK
 new_local_repository(
     name = "azure",
diff --git a/common_settings.bzl b/common_settings.bzl
index 3faad26bb5..e4b28a7b44 100644
--- a/common_settings.bzl
+++ b/common_settings.bzl
@@ -20,7 +20,7 @@
 load("@bazel_skylib//lib:selects.bzl", "selects")
 load("@mediapipe//mediapipe/framework:more_selects.bzl", "more_selects")
 load("@bazel_skylib//rules:common_settings.bzl", "string_flag")
-load("//:distro.bzl", "distro_flag")
+load("//:distro.bzl", "distro_flag", "espeak_flag")
 
 # cc_library rule wrapper that will accept the same arguments but if user will not provide
 # copts, linkopts, local_defines it will set them to the defaults
@@ -52,6 +52,7 @@ def ovms_cc_library(**kwargs):
 
 def create_config_settings():
     distro_flag()
+    espeak_flag()
     native.config_setting(
         name = "disable_mediapipe",
         define_values = {
diff --git a/create_package.sh b/create_package.sh
index 9c2142b1eb..c2efb67d8b 100755
--- a/create_package.sh
+++ b/create_package.sh
@@ -27,6 +27,16 @@ mkdir -vp /ovms_release/lib/custom_nodes
 if [ "$ov_use_binary" == "0" ] ; then cp -v /openvino_tokenizers/build/src/libopenvino_tokenizers.so /ovms_release/lib/ ; fi
 
 find /ovms/bazel-out/k8-*/bin -iname '*.so*' ! -type d ! -name "libgtest.so" ! -name "*params" ! -name "*.hana.*" ! -name "py_generate_pipeline.cpython*" !  -name "lib_node_*" ! -path "*test_python_binding*" ! -name "*libpython*" -exec cp -v {} /ovms_release/lib/ \;
+
+# Bundle espeak-ng data files when espeak was enabled in the Bazel build.
+# rules_foreign_cc places the cmake install tree under copy_<rule>/espeak-ng/
+# inside bazel-out. Both the shared library (picked up by the find above)
+# and the espeak-ng-data directory are required at runtime.
+ESPEAK_DATA_SRC=$(find /ovms/bazel-out/k8-*/bin/external/espeak_ng -type d -name 'espeak-ng-data' 2>/dev/null | head -n 1 || true)
+if [ -n "$ESPEAK_DATA_SRC" ] && [ -d "$ESPEAK_DATA_SRC" ] ; then
+    mkdir -p /ovms_release/share
+    cp -rL "$ESPEAK_DATA_SRC" /ovms_release/share/ ;
+fi
 if [ "$FUZZER_BUILD" == "0" ]; then mv /ovms_release/lib/libcustom_node* /ovms_release/lib/custom_nodes/; fi;
 cd /ovms_release/lib/ ; rm -f libcurl.so*
 cd /ovms_release/lib/ ; rm -f libazurestorage.so.* ; ln -s libazurestorage.so libazurestorage.so.7 ;ln -s libazurestorage.so libazurestorage.so.7.5
diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py
index 584f9a4d64..255a78c4d2 100644
--- a/demos/common/export_models/export_model.py
+++ b/demos/common/export_models/export_model.py
@@ -84,9 +84,11 @@ def add_common_arguments(parser):
 parser_text2speech = subparsers.add_parser('text2speech', help='export model for text2speech endpoint')
 add_common_arguments(parser_text2speech)
 parser_text2speech.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
-parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for text2speech. For example microsoft/speecht5_hifigan', dest='vocoder')
-parser_text2speech.add_argument('--speaker_name', type=str, help='Name of the speaker', dest='speaker_name')
-parser_text2speech.add_argument('--speaker_path', type=str, help='Path to the speaker.bin file.', dest='speaker_path')
+parser_text2speech.add_argument('--model_type', default='speecht5', choices=['speecht5', 'kokoro'], help='Type of the source TTS model. speecht5 uses optimum-cli; kokoro uses a dedicated PyTorch->OpenVINO conversion path.', dest='model_type')
+parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for speecht5. For example microsoft/speecht5_hifigan. Ignored for kokoro.', dest='vocoder')
+parser_text2speech.add_argument('--speaker_name', type=str, help='Name of the speaker (speecht5 only; for kokoro all voices from the HF repo are exported).', dest='speaker_name')
+parser_text2speech.add_argument('--speaker_path', type=str, help='Path to the speaker.bin file (speecht5 only; for kokoro all voices from the HF repo are exported).', dest='speaker_path')
+parser_text2speech.add_argument('--language', type=str, default=None, help='Default language code passed to the calculator (e.g. en-us, zh). Used mainly by kokoro.', dest='language')
 
 
 parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint')
@@ -109,13 +111,19 @@ def add_common_arguments(parser):
       models_path: "{{model_path}}",
       plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
       target_device: "{{target_device|default("CPU", true)}}",
-      {%- if speaker_name and speaker_path %}
+      {%- if language %}
+      language: "{{language}}",
+      {%- endif %}
+      {%- if voices %}
       voices: [
+        {%- for v in voices %}
         {
-            name: "{{speaker_name}}",
-            path: "{{speaker_path}}"
-        }
-      ]{% endif %}
+            name: "{{v.name}}",
+            path: "{{v.path}}"
+        }{% if not loop.last %},{% endif %}
+        {%- endfor %}
+      ]
+      {%- endif %}
     }
   }
 }
@@ -476,13 +484,42 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
     print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
     add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))
 
+def _list_kokoro_voices(destination_path):
+    """Return the sorted names of the voices exported next to a Kokoro model; optimum-cli's Kokoro
+    exporter writes them to <destination_path>/voices/<name>.bin. A missing directory yields []."""
+    voices_root = Path(destination_path) / "voices"
+    if not voices_root.is_dir():
+        print("Warning: no voices/ directory found under", destination_path)
+        return []
+    return sorted(entry.stem for entry in voices_root.glob("*.bin"))
+
 def export_text2speech_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
     destination_path = os.path.join(model_repository_path, model_name)
     print("Exporting text2speech model to ",destination_path)
-    if not os.path.isdir(destination_path) or args['overwrite_models']:
-        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path)
-        if os.system(optimum_command):
-            raise ValueError("Failed to export text2speech model", source_model)
+    model_type = task_parameters.get('model_type', 'speecht5')
+
+    if model_type == 'kokoro':
+        # optimum-intel registers Kokoro under library_name=kokoro / task=text-to-audio.
+        # The kokoro exporter also dumps each speaker embedding to voices/<name>.bin.
+        if not os.path.isfile(os.path.join(destination_path, 'openvino_model.xml')) or args['overwrite_models']:
+            optimum_command = "optimum-cli export openvino --model {} --task text-to-audio --weight-format {} {} --trust-remote-code {}".format(
+                source_model, precision, task_parameters['extra_quantization_params'], destination_path)
+            print('Running command:', optimum_command)
+            if os.system(optimum_command):
+                raise ValueError("Failed to export kokoro model", source_model)
+        voice_names = _list_kokoro_voices(destination_path)
+        # Render the graph with every available voice (path is relative to graph.pbtxt).
+        task_parameters['voices'] = [{'name': n, 'path': f'./voices/{n}.bin'} for n in voice_names]
+    else:
+        if not os.path.isdir(destination_path) or args['overwrite_models']:
+            if not task_parameters.get('vocoder'):
+                raise ValueError("--vocoder is required when --model_type=speecht5")
+            optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path)
+            if os.system(optimum_command):
+                raise ValueError("Failed to export text2speech model", source_model)
+        if task_parameters.get('speaker_name') and task_parameters.get('speaker_path'):
+            task_parameters['voices'] = [{'name': task_parameters['speaker_name'], 'path': task_parameters['speaker_path']}]
+
     gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(t2s_graph_template)
     graph_content = gtemplate.render(model_path="./", **task_parameters)
     with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
diff --git a/demos/common/export_models/requirements.txt b/demos/common/export_models/requirements.txt
index 60cbf3d2fc..3828e94e7f 100644
--- a/demos/common/export_models/requirements.txt
+++ b/demos/common/export_models/requirements.txt
@@ -1,7 +1,7 @@
 --extra-index-url "https://download.pytorch.org/whl/cpu"
 --extra-index-url "https://storage.openvinotoolkit.org/simple/wheels/nightly"
 --extra-index-url "https://storage.openvinotoolkit.org/simple/wheels/pre-release"
-optimum-intel@git+https://github.com/huggingface/optimum-intel.git@d4dd21a3aa89c0671d85b704847ac06a378e761c
+optimum-intel@git+https://github.com/huggingface/optimum-intel.git@7a109befdfd4f9b94351135b17505efa28698210
 accelerate
 datasets
 diffusers  # for image generation
@@ -14,3 +14,8 @@ sentence_transformers==5.3.0
 sentencepiece  # Required by: transformers`
 torchvision
 requests
+# Kokoro TTS export (text2speech --model_type kokoro).
+# typer must stay <0.12 because kokoro -> misaki -> spacy imports `typer.main`,
+# which was removed in typer 0.12.
+kokoro==0.9.4
+typer<0.12
diff --git a/distro.bzl b/distro.bzl
index 258fee8c47..c01ad7bb56 100644
--- a/distro.bzl
+++ b/distro.bzl
@@ -42,3 +42,29 @@ def distro_flag():
         name = "not_ubuntu_build",
         negate = ":ubuntu_build",
     )
+
+# Controls whether espeak-ng is built from source (via Bazel) and bundled
+# into the OVMS release. When "off", no espeak-ng artifacts are produced
+# and the runtime will not have phonemization fallback available.
+def espeak_flag():  # Declares the "espeak" build flag and the config_settings derived from it.
+    string_flag(
+        name = "espeak",
+        values = ["on", "off"],
+        build_setting_default = "on",  # espeak-ng is enabled unless the flag is set to "off"
+    )
+    native.config_setting(
+        name = "espeak_on",  # matches when the flag value is "on"
+        flag_values = {
+            "espeak": "on",
+        },
+    )
+    native.config_setting(
+        name = "espeak_off",  # matches when the flag value is "off"
+        flag_values = {
+            "espeak": "off",
+        },
+    )
+    more_selects.config_setting_negation(
+        name = "not_espeak_on",  # matches whenever ":espeak_on" does not
+        negate = ":espeak_on",
+    )
diff --git a/setupvars.bat b/setupvars.bat
index 200991839b..7f99e863d7 100644
--- a/setupvars.bat
+++ b/setupvars.bat
@@ -17,4 +17,6 @@
 set "OVMS_DIR=%~dp0"
 set "PYTHONHOME=%OVMS_DIR%\python"
 set "PATH=%OVMS_DIR%;%PYTHONHOME%;%PYTHONHOME%\Scripts;%PATH%"
+:: Point bundled espeak-ng (when packaged) at its data tables.
+if exist "%OVMS_DIR%\espeak-ng-data" set "ESPEAK_DATA_PATH=%OVMS_DIR%\espeak-ng-data"
 echo "OpenVINO Model Server Environment Initialized"
diff --git a/setupvars.ps1 b/setupvars.ps1
index 549bfa1a1b..f0703d00d0 100644
--- a/setupvars.ps1
+++ b/setupvars.ps1
@@ -22,4 +22,8 @@ if (Test-Path "$env:OVMS_DIR\python") {
 } else {
     $env:PATH="$env:PATH:$env:OVMS_DIR"
 }
+# Point bundled espeak-ng (when packaged) at its data tables.
+if (Test-Path "$env:OVMS_DIR\espeak-ng-data") {
+    $env:ESPEAK_DATA_PATH="$env:OVMS_DIR\espeak-ng-data"
+}
 echo "OpenVINO Model Server Environment Initialized"
diff --git a/src/BUILD b/src/BUILD
index 5163149311..8c6d5b4fd7 100644
--- a/src/BUILD
+++ b/src/BUILD
@@ -2287,6 +2287,12 @@ cc_binary(
             "//src/python/binding:pyovms.so",
         ],
         "//:disable_python": []
+    }) + select({
+        "//:espeak_on": [
+            "//third_party:espeak_ng",
+            "//third_party:espeak_ng_data",
+        ],
+        "//:espeak_off": [],
     }),
     # linkstatic = False,  # Use for dynamic linking when necessary
 )
diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp
index 77b38e70df..f56d600f14 100644
--- a/src/audio/audio_utils.cpp
+++ b/src/audio/audio_utils.cpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright 2025 Intel Corporation
+// Copyright 2026 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "src/logging.hpp"
 #include <string>
 #include <vector>
+#include <cmath>
 #include <random>
 #include <algorithm>
 #pragma warning(push)
@@ -188,3 +189,46 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample
     auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
     SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
 }
+
+// Serializes a mono float waveform into an in-memory RIFF/WAV file (32-bit IEEE float
+// samples, fixed 24 kHz sample rate).
+//   ppData      - receives the output buffer; the caller releases it with drwav_free().
+//   pDataSize   - receives the size of the output buffer in bytes.
+//   speechSize  - number of PCM frames to write (one float sample per frame, mono).
+//   waveformPtr - source samples; speechSize floats are read from it.
+// Throws std::runtime_error if the writer cannot be initialized or if not all frames were
+// written. In the latter case the partially written buffer is freed and *ppData is reset,
+// so a caller that frees the buffer only after a successful return does not leak it.
+void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr) {
+    enum : unsigned int {
+        OUTPUT_PREPARATION,
+        TIMER_END
+    };
+    Timer<TIMER_END> timer;
+    timer.start(OUTPUT_PREPARATION);
+
+    drwav_data_format format;
+    format.container = drwav_container_riff;
+    format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
+    format.channels = 1;
+    format.sampleRate = 24000;  // Kokoro native sample rate
+    format.bitsPerSample = 32;
+    drwav wav;
+
+    auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr);
+    if (status == DRWAV_FALSE) {
+        throw std::runtime_error("Failed to initialize WAV writer");
+    }
+    drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, speechSize, waveformPtr);
+    // Finalize the writer on every path; it must not stay initialized when we throw below.
+    drwav_uninit(&wav);
+    if (framesWritten != speechSize) {
+        drwav_free(*ppData, nullptr);
+        *ppData = nullptr;
+        pDataSize = 0;
+        throw std::runtime_error("Failed to write all frames");
+    }
+    timer.stop(OUTPUT_PREPARATION);
+    auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
+    SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
+}
diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp
index cbeea8b457..0928d03f3d 100644
--- a/src/audio/audio_utils.hpp
+++ b/src/audio/audio_utils.hpp
@@ -25,3 +25,4 @@ bool isWavBuffer(const std::string buf);
 std::vector<float> readWav(const std::string_view& wavData);
 std::vector<float> readMp3(const std::string_view& mp3Data);
 void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr);
+void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr);
diff --git a/src/audio/speech_to_text/s2t_servable.cpp b/src/audio/speech_to_text/s2t_servable.cpp
index 5ce2c451a4..dbe9001d16 100644
--- a/src/audio/speech_to_text/s2t_servable.cpp
+++ b/src/audio/speech_to_text/s2t_servable.cpp
@@ -35,7 +35,6 @@ namespace ovms {
 namespace {
 constexpr size_t ISO_LANG_CODE_MAX = 3;
 }
-
 SttServable::SttServable(const ::mediapipe::S2tCalculatorOptions& nodeOptions, const std::string& graphPath) {
     auto fsModelsPath = std::filesystem::path(nodeOptions.models_path());
     if (fsModelsPath.is_relative()) {
diff --git a/src/audio/text_to_speech/BUILD b/src/audio/text_to_speech/BUILD
index 2a494f6e16..a3d10f9893 100644
--- a/src/audio/text_to_speech/BUILD
+++ b/src/audio/text_to_speech/BUILD
@@ -37,6 +37,7 @@ ovms_cc_library(
     srcs = ["t2s_calculator.cc",
             "tts_node_initializer.cpp"],
     deps = [
+        "//third_party:genai",
         "@mediapipe//mediapipe/framework:calculator_framework",
         "//src:httppayload",
         "//src:libovmslogging",
diff --git a/src/audio/text_to_speech/t2s_calculator.cc b/src/audio/text_to_speech/t2s_calculator.cc
index f8f4912f0d..7c78d764e1 100644
--- a/src/audio/text_to_speech/t2s_calculator.cc
+++ b/src/audio/text_to_speech/t2s_calculator.cc
@@ -28,6 +28,8 @@
 #include "src/client_connection.hpp"
 #include "src/http_payload.hpp"
 #include "src/logging.hpp"
+#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
+#include "openvino/openvino.hpp"
 #include <mutex>
 #include <thread>
 
@@ -63,6 +65,8 @@ static absl::Status checkClientDisconnected(const ovms::HttpPayload& payload, co
 class T2sCalculator : public CalculatorBase {
     static const std::string INPUT_TAG_NAME;
     static const std::string OUTPUT_TAG_NAME;
+    std::string defaultLanguage = "en-us";
+    float defaultSpeed = 1.0f;
 
 public:
     static absl::Status GetContract(CalculatorContract* cc) {
@@ -81,6 +85,13 @@ class T2sCalculator : public CalculatorBase {
 
     absl::Status Open(CalculatorContext* cc) final {
         SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "T2sCalculator  [Node: {}] Open start", cc->NodeName());
+        const auto& options = cc->Options<mediapipe::T2sCalculatorOptions>();
+        if (options.has_language() && !options.language().empty()) {
+            defaultLanguage = options.language();
+        }
+        if (options.has_speed()) {
+            defaultSpeed = options.speed();
+        }
         return absl::OkStatus();
     }
 
@@ -113,26 +124,70 @@ class T2sCalculator : public CalculatorBase {
                 if (streamIt != payload.parsedJson->MemberEnd()) {
                     return absl::InvalidArgumentError("streaming is not supported");
                 }
                 std::optional<std::string> voiceName;
                 auto voiceIt = payload.parsedJson->FindMember("voice");
-                if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) {
+                if (voiceIt != payload.parsedJson->MemberEnd()) {
+                    if (!voiceIt->value.IsString()) {
+                        return absl::InvalidArgumentError("voice field is not a string");
+                    }
                     voiceName = voiceIt->value.GetString();
-                    if (pipe->voices.find(voiceName.value()) == pipe->voices.end())
-                        return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", voiceName.value()));
                 }
-
+                std::string language = defaultLanguage;
+                auto languageIt = payload.parsedJson->FindMember("language");
+                if (languageIt != payload.parsedJson->MemberEnd()) {
+                    if (!languageIt->value.IsString()) {
+                        return absl::InvalidArgumentError("language field is not a string");
+                    }
+                    language = languageIt->value.GetString();
+                }
+                float speed = defaultSpeed;
+                auto speedIt = payload.parsedJson->FindMember("speed");
+                if (speedIt != payload.parsedJson->MemberEnd()) {
+                    if (!speedIt->value.IsNumber()) {
+                        return absl::InvalidArgumentError("speed field is not a number");
+                    }
+                    speed = speedIt->value.GetFloat();
+                }
                 ov::genai::Text2SpeechDecodedResults generatedSpeech;
                 std::unique_lock lock(pipe->ttsPipelineMutex);
                 auto disconnectStatus = checkClientDisconnected(payload, cc->NodeName(), "before generation");
                 if (!disconnectStatus.ok())
                     return disconnectStatus;
-
+                ov::Tensor speakerEmbedding;
+                std::string selectedVoice;
                 if (voiceName.has_value()) {
-                    generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), pipe->voices[voiceName.value()]);
-                } else {
-                    generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
+                    selectedVoice = voiceName.value();
+                    auto speakerIt = pipe->voices.find(selectedVoice);
+                    if (speakerIt != pipe->voices.end()) {
+                        speakerEmbedding = speakerIt->second;
+                    } else if (!pipe->voices.empty()) {
+                        // Voice was explicitly requested but is unknown - fail with the list of available voices.
+                        std::string available;
+                        for (const auto& [name, _] : pipe->voices) {
+                            if (!available.empty())
+                                available += ", ";
+                            available += name;
+                        }
+                        return absl::InvalidArgumentError(absl::StrCat(
+                            "Unknown voice \"", selectedVoice, "\". Available voices: ", available));
+                    }
+                    // pipe->voices is empty (e.g. SpeechT5 deployment without speaker embeddings) -
+                    // pass the requested name through to the pipeline with an empty embedding.
+                } else if (!pipe->voices.empty()) {
+                    // No voice in the request - pick a default from the voices loaded by the servable.
+                    auto preferredIt = pipe->voices.find("af_alloy");
+                    if (preferredIt != pipe->voices.end()) {
+                        selectedVoice = preferredIt->first;
+                        speakerEmbedding = preferredIt->second;
+                    } else {
+                        const auto& firstVoice = *pipe->voices.begin();
+                        selectedVoice = firstVoice.first;
+                        speakerEmbedding = firstVoice.second;
+                    }
                 }
-                auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
+                ov::AnyMap properties{{"voice", selectedVoice}, {"language", language}, {"speed", speed}};
+                generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding, properties);
+                // NOTE(review): output is now always written as 32-bit float WAV at 24 kHz (prepareAudioOutputKokoro); confirm this is correct for non-Kokoro models.
                 auto speechSize = generatedSpeech.speeches[0].get_size();
                 ov::Tensor cpuTensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape());
                 // copy results to release inference request
@@ -143,7 +200,9 @@ class T2sCalculator : public CalculatorBase {
                     return disconnectStatus;
                 void* ppData;
                 size_t pDataSize;
-                prepareAudioOutput(&ppData, pDataSize, bitsPerSample, speechSize, cpuTensor.data<const float>());
+                SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "4");
+                prepareAudioOutputKokoro(&ppData, pDataSize, speechSize, cpuTensor.data<const float>());
+                SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "5");
                 output = std::make_unique<std::string>(reinterpret_cast<char*>(ppData), pDataSize);
                 drwav_free(ppData, NULL);
             } else {
@@ -151,6 +210,8 @@ class T2sCalculator : public CalculatorBase {
             }
         } catch (ov::AssertFailure& e) {
             return absl::InvalidArgumentError(e.what());
+        } catch (std::runtime_error& e) {
+            return absl::InvalidArgumentError(e.what());
         } catch (...) {
             return absl::InvalidArgumentError("Response generation failed");
         }
diff --git a/src/audio/text_to_speech/t2s_calculator.proto b/src/audio/text_to_speech/t2s_calculator.proto
index efea722c3d..5bfce7811f 100644
--- a/src/audio/text_to_speech/t2s_calculator.proto
+++ b/src/audio/text_to_speech/t2s_calculator.proto
@@ -40,4 +40,6 @@ message T2sCalculatorOptions {
       required string path = 2;
     }
     repeated SpeakerEmbeddings voices = 4;
+    optional string language = 5 [default = "en-us"];
+    optional float speed = 6 [default = 1.0];
 }
diff --git a/src/audio/text_to_speech/t2s_servable.cpp b/src/audio/text_to_speech/t2s_servable.cpp
index c782c9346d..2c9b1abfe3 100644
--- a/src/audio/text_to_speech/t2s_servable.cpp
+++ b/src/audio/text_to_speech/t2s_servable.cpp
@@ -19,8 +19,8 @@
 #include <unordered_map>
 #include <vector>
 #include <fstream>
+#include <sstream>
 
-#include "openvino/genai/whisper_pipeline.hpp"
 #include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
 #include "src/audio/text_to_speech/t2s_calculator.pb.h"
 #include "src/status.hpp"
@@ -31,7 +31,15 @@
 
 namespace ovms {
 
-static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) {
+static size_t getShapeElementsCount(const ov::Shape& shape) {
+    size_t product = 1;  // an empty shape leaves this at 1
+    for (auto dimIt = shape.begin(); dimIt != shape.end(); ++dimIt) {
+        product *= *dimIt;
+    }
+    return product;
+}
+
+static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path, const ov::Shape& expectedShape) {
     std::ifstream input(file_path, std::ios::binary);
     if (input.fail()) {
         std::stringstream ss;
@@ -48,12 +56,16 @@ static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path)
     if (buffer_size % sizeof(float) != 0) {
         throw std::runtime_error("File size is not a multiple of float size.");
     }
-    size_t num_floats = buffer_size / sizeof(float);
-    if (num_floats != 512) {
-        throw std::runtime_error("File must contain speaker embedding including 512 32-bit floats.");
+    const size_t numFloats = buffer_size / sizeof(float);
+    const size_t expectedElements = getShapeElementsCount(expectedShape);
+    if (numFloats != expectedElements) {
+        std::stringstream ss;
+        ss << "File must contain speaker embedding with " << expectedElements
+           << " 32-bit floats. Got: " << numFloats;
+        throw std::runtime_error(ss.str());
     }
 
-    ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats});
+    ov::Tensor floats_tensor(ov::element::f32, expectedShape);
     input.read(reinterpret_cast<char*>(floats_tensor.data()), buffer_size);
     if (input.fail()) {
         throw std::runtime_error("Failed to read all data from file.");
@@ -76,10 +88,15 @@ TtsServable::TtsServable(const std::string& modelDir, const std::string& targetD
         throw std::runtime_error("Error during plugin_config option parsing");
     }
     ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), targetDevice, config);
+    const ov::Shape speakerEmbeddingShape = ttsPipeline->get_speaker_embedding_shape();
     for (auto voice : graphVoices) {
-        if (!std::filesystem::exists(voice.path()))
-            throw std::runtime_error{"Requested voice speaker embeddings file does not exist: " + voice.path()};
-        voices[voice.name()] = read_speaker_embedding(voice.path());
+        std::filesystem::path voicePath(voice.path());
+        if (voicePath.is_relative()) {
+            voicePath = std::filesystem::path(graphPath) / voicePath;
+        }
+        if (!std::filesystem::exists(voicePath))
+            throw std::runtime_error{"Requested voice speaker embeddings file does not exist: " + voicePath.string()};
+        voices[voice.name()] = read_speaker_embedding(voicePath.string(), speakerEmbeddingShape);
     }
 }
 }  // namespace ovms
diff --git a/src/audio/text_to_speech/t2s_servable.hpp b/src/audio/text_to_speech/t2s_servable.hpp
index 6d192edcfb..6f5249baa4 100644
--- a/src/audio/text_to_speech/t2s_servable.hpp
+++ b/src/audio/text_to_speech/t2s_servable.hpp
@@ -16,15 +16,21 @@
 
 #pragma once
 
-#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
 #include "src/audio/text_to_speech/t2s_calculator.pb.h"
 
+#include <filesystem>
 #include <memory>
+#include <mutex>
 #include <string>
 #include <unordered_map>
 
-namespace ovms {
+#include "openvino/runtime/tensor.hpp"
+
+namespace ov::genai {
+class Text2SpeechPipeline;
+}
 
+namespace ovms {
 class TtsServable {
 public:
     std::shared_ptr<ov::genai::Text2SpeechPipeline> ttsPipeline;
diff --git a/src/capi_frontend/server_settings.hpp b/src/capi_frontend/server_settings.hpp
index 2c0a364bb9..115b27f062 100644
--- a/src/capi_frontend/server_settings.hpp
+++ b/src/capi_frontend/server_settings.hpp
@@ -133,7 +133,7 @@ struct EmbeddingsGraphSettingsImpl {
 };
 
 struct TextToSpeechGraphSettingsImpl {
-    uint32_t unused = 1;  // will be added
+    std::optional<std::string> language;
 };
 
 struct SpeechToTextGraphSettingsImpl {
@@ -161,6 +161,7 @@ struct ExportSettings {
     std::string targetDevice = "CPU";
     std::optional<std::string> extraQuantizationParams;
     std::optional<std::string> vocoder;
+    std::string modelType = "speecht5";  // text2speech only: "speecht5" or "kokoro"
     std::string precision = "int8";
     PluginConfigSettingsImpl pluginConfig;
 };
diff --git a/src/graph_export/graph_export.cpp b/src/graph_export/graph_export.cpp
index 64d80bc23c..d121022a1c 100644
--- a/src/graph_export/graph_export.cpp
+++ b/src/graph_export/graph_export.cpp
@@ -16,10 +16,12 @@
 #include "graph_export.hpp"
 
 #include <algorithm>
+#include <filesystem>
 #include <fstream>
 #include <sstream>
 #include <string>
 #include <utility>
+#include <vector>
 
 #pragma warning(push)
 #pragma warning(disable : 6313)
@@ -315,6 +317,7 @@ static Status createTextToSpeechGraphTemplate(const std::string& directoryPath,
         SPDLOG_ERROR("Graph options not initialized for speech generation.");
         return StatusCode::INTERNAL_ERROR;
     }
+    auto& graphSettings = std::get<TextToSpeechGraphSettingsImpl>(hfSettings.graphSettings);
     auto& ggufFilename = hfSettings.ggufFilename;
     auto& exportSettings = hfSettings.exportSettings;
 
@@ -323,6 +326,24 @@ static Status createTextToSpeechGraphTemplate(const std::string& directoryPath,
     std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename);
     SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt"));
     GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings);
+
+    // Enumerate kokoro speaker embeddings dumped by optimum-cli to <dir>/voices/*.bin.
+    std::vector<std::string> voiceNames;
+    if (exportSettings.modelType == "kokoro") {
+        std::filesystem::path voicesDir = std::filesystem::path(directoryPath) / "voices";
+        std::error_code ec;
+        if (std::filesystem::is_directory(voicesDir, ec)) {
+            for (const auto& entry : std::filesystem::directory_iterator(voicesDir, ec)) {
+                if (entry.is_regular_file() && entry.path().extension() == ".bin") {
+                    voiceNames.push_back(entry.path().stem().string());
+                }
+            }
+            std::sort(voiceNames.begin(), voiceNames.end());
+        } else {
+            SPDLOG_WARN("Kokoro voices directory not found at {}", voicesDir.string());
+        }
+    }
+
     // clang-format off
     oss << R"(
 input_stream: "HTTP_REQUEST_PAYLOAD:input"
@@ -342,7 +363,24 @@ node {
             )";
     if (pluginConfigOpt.has_value()) {
         oss << R"(plugin_config: ')" << pluginConfigOpt.value() << R"('
-        )";
+            )";
+    }
+    if (graphSettings.language.has_value()) {
+        oss << R"(language: ")" << graphSettings.language.value() << R"("
+            )";
+    }
+    if (!voiceNames.empty()) {
+        oss << R"(voices: [)";
+        for (size_t i = 0; i < voiceNames.size(); ++i) {
+            oss << R"(
+                { name: ")" << voiceNames[i] << R"(", path: "./voices/)" << voiceNames[i] << R"(.bin" })";
+            if (i + 1 < voiceNames.size()) {
+                oss << ",";
+            }
+        }
+        oss << R"(
+            ]
+            )";
     }
     oss << R"(}
     }
diff --git a/src/graph_export/t2s_graph_cli_parser.cpp b/src/graph_export/t2s_graph_cli_parser.cpp
index 69f4479a24..c3e562729a 100644
--- a/src/graph_export/t2s_graph_cli_parser.cpp
+++ b/src/graph_export/t2s_graph_cli_parser.cpp
@@ -43,7 +43,16 @@ void TextToSpeechGraphCLIParser::createOptions() {
         ("num_streams",
             "The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.",
             cxxopts::value<uint32_t>()->default_value("1"),
-            "NUM_STREAMS");
+            "NUM_STREAMS")
+        ("model_type",
+            "Type of the source TTS model: speecht5 (default) or kokoro.",
+            cxxopts::value<std::string>()->default_value("speecht5"),
+            "MODEL_TYPE")
+        ("language",
+            "Default language code passed to the TTS calculator (e.g. en-us, zh). Used by kokoro.",
+            cxxopts::value<std::string>(),
+            "LANGUAGE");
+    // clang-format on
 }
 
 void TextToSpeechGraphCLIParser::printHelp() {
@@ -64,7 +73,7 @@ std::vector<std::string> TextToSpeechGraphCLIParser::parse(const std::vector<std
     const char* const* args = cStrArray.data();
     result = std::make_unique<cxxopts::ParseResult>(options->parse(cStrArray.size(), args));
 
-    return  result->unmatched();
+    return result->unmatched();
 }
 
 void TextToSpeechGraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettings, const std::string& modelName) {
@@ -82,6 +91,14 @@ void TextToSpeechGraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsIm
         }
     } else {
         hfSettings.exportSettings.pluginConfig.numStreams = result->operator[]("num_streams").as<uint32_t>();
+        const std::string modelType = result->operator[]("model_type").as<std::string>();
+        if (modelType != "speecht5" && modelType != "kokoro") {
+            throw std::invalid_argument("--model_type must be one of: speecht5, kokoro");
+        }
+        hfSettings.exportSettings.modelType = modelType;
+        if (result->count("language")) {
+            textToSpeechGraphSettings.language = result->operator[]("language").as<std::string>();
+        }
     }
     hfSettings.graphSettings = std::move(textToSpeechGraphSettings);
 }
diff --git a/src/pull_module/optimum_export.cpp b/src/pull_module/optimum_export.cpp
index f57e2448dc..00b7d68103 100644
--- a/src/pull_module/optimum_export.cpp
+++ b/src/pull_module/optimum_export.cpp
@@ -67,7 +67,11 @@ std::string OptimumDownloader::getExportCmdTextToSpeech() {
     std::ostringstream oss;
     // clang-format off
     oss << this->OPTIMUM_CLI_EXPORT_COMMAND;
-    if (this->exportSettings.vocoder.has_value()){
+    if (this->exportSettings.modelType == "kokoro") {
+        // optimum-intel registers Kokoro under library_name="kokoro" with task "text-to-audio".
+        // The library is auto-detected from the HF repo; --task must be specified explicitly.
+        oss << "--task text-to-audio ";
+    } else if (this->exportSettings.vocoder.has_value()) {
         oss << "--model-kwargs \"{\"vocoder\": \"" << this->exportSettings.vocoder.value() << "\"}\" ";
     }
     oss << "--model " << this->sourceModel << " --trust-remote-code ";
diff --git a/src/test/graph_export_test.cpp b/src/test/graph_export_test.cpp
index 0bbd684646..2b4483f972 100644
--- a/src/test/graph_export_test.cpp
+++ b/src/test/graph_export_test.cpp
@@ -13,6 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //*****************************************************************************
+#include <filesystem>
+#include <fstream>
 #include <memory>
 #include <string>
 
@@ -392,7 +394,7 @@ node {
             models_path: "/model1/path"
             target_device: "GPU"
             plugin_config: '{"NUM_STREAMS":"2"}'
-        }
+            }
     }
 }
 )";
@@ -415,6 +417,29 @@ node {
 }
 )";
 
+const std::string expectedTextToSpeechGraphContentsKokoro = R"(
+input_stream: "HTTP_REQUEST_PAYLOAD:input"
+output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+node {
+    name: "myModel"
+    calculator: "T2sCalculator"
+    input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
+    input_stream: "HTTP_REQUEST_PAYLOAD:input"
+    output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+    node_options: {
+        [type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
+            models_path: "./"
+            target_device: "CPU"
+            language: "en-us"
+            voices: [
+                { name: "af_alloy", path: "./voices/af_alloy.bin" },
+                { name: "am_adam", path: "./voices/am_adam.bin" }
+            ]
+            }
+    }
+}
+)";
+
 const std::string expectedSpeechToTextGraphContents = R"(
 input_stream: "HTTP_REQUEST_PAYLOAD:input"
 output_stream: "HTTP_RESPONSE_PAYLOAD:output"
@@ -868,6 +893,29 @@ TEST_F(GraphCreationTest, textToSpeechPositiveDefault) {
     ASSERT_EQ(expectedTextToSpeechGraphContentsDefault, removeVersionString(graphContents)) << graphContents;
 }
 
+TEST_F(GraphCreationTest, textToSpeechPositiveKokoro) {
+    // Stand in for optimum-cli: the kokoro graph export enumerates <dir>/voices/*.bin.
+    std::filesystem::path voicesDir = std::filesystem::path(this->directoryPath) / "voices";
+    std::filesystem::create_directories(voicesDir);
+    for (const char* voiceFile : {"af_alloy.bin", "am_adam.bin"}) {
+        std::ofstream emptyVoice(voicesDir / voiceFile);
+    }
+
+    ovms::TextToSpeechGraphSettingsImpl ttsSettings;
+    ttsSettings.language = "en-us";
+    ovms::HFSettingsImpl hfSettings;
+    hfSettings.task = ovms::TEXT_TO_SPEECH_GRAPH;
+    hfSettings.exportSettings.modelName = "myModel";
+    hfSettings.exportSettings.modelType = "kokoro";
+    hfSettings.graphSettings = std::move(ttsSettings);
+    auto graphExporter = std::make_unique<ovms::GraphExport>();
+    auto status = graphExporter->createServableConfig(this->directoryPath, hfSettings);
+    ASSERT_EQ(status, ovms::StatusCode::OK);
+    std::string graphPath = ovms::FileSystem::appendSlash(this->directoryPath) + "graph.pbtxt";
+    std::string graphContents = GetFileContents(graphPath);
+    ASSERT_EQ(expectedTextToSpeechGraphContentsKokoro, removeVersionString(graphContents)) << graphContents;
+}
+
 TEST_F(GraphCreationTest, textToSpeechCreatedPbtxtInvalid) {
     ovms::HFSettingsImpl hfSettings;
     hfSettings.task = ovms::TEXT_TO_SPEECH_GRAPH;
diff --git a/src/test/pull_hf_model_test.cpp b/src/test/pull_hf_model_test.cpp
index e1ab3f9ff3..db1ff26757 100644
--- a/src/test/pull_hf_model_test.cpp
+++ b/src/test/pull_hf_model_test.cpp
@@ -1392,6 +1392,20 @@ TEST_F(TestOptimumDownloaderSetup, TextToSpeechExportCmd) {
     ASSERT_EQ(optimumDownloader->getConvertCmd(), expectedCmd2);
 }
 
+TEST_F(TestOptimumDownloaderSetup, TextToSpeechKokoroExportCmd) {
+    inHfSettings.exportSettings.modelType = "kokoro";  // selects the "--task text-to-audio" branch
+    inHfSettings.task = ovms::TEXT_TO_SPEECH_GRAPH;
+    std::string wantExportCmd = "optimum-cli export openvino --task text-to-audio --model model/name --trust-remote-code  --weight-format fp64 --someOptimumParam --anotherOptParam value \\path\\to\\Download\\model\\name";
+    std::string wantConvertCmd = "convert_tokenizer model/name -o \\path\\to\\Download\\model\\name";
+#ifdef __linux__
+    std::replace(wantConvertCmd.begin(), wantConvertCmd.end(), '\\', '/');
+    std::replace(wantExportCmd.begin(), wantExportCmd.end(), '\\', '/');
+#endif
+    auto optimumDownloader = std::make_unique<TestOptimumDownloader>(inHfSettings);
+    ASSERT_EQ(optimumDownloader->getExportCmd(), wantExportCmd);
+    ASSERT_EQ(optimumDownloader->getConvertCmd(), wantConvertCmd);
+}
+
 TEST_F(TestOptimumDownloaderSetup, SpeechToTextExportCmd) {
     inHfSettings.task = ovms::SPEECH_TO_TEXT_GRAPH;
     std::unique_ptr<TestOptimumDownloader> optimumDownloader = std::make_unique<TestOptimumDownloader>(inHfSettings);
diff --git a/third_party/BUILD b/third_party/BUILD
index dcde0bd7e4..bce804d7a1 100644
--- a/third_party/BUILD
+++ b/third_party/BUILD
@@ -58,4 +58,30 @@ alias(
         "//conditions:default": "@linux_curl//:curl",
     }),
     visibility = ["//visibility:public"],
+)
+
+# espeak-ng built from source via Bazel (rules_foreign_cc cmake).
+# Selected on/off via the //:espeak build flag. When disabled this resolves
+# to an empty cc_library so dependents can unconditionally list it.
+cc_library(
+    name = "espeak_ng_empty",
+    visibility = ["//visibility:public"],
+)
+
+alias(
+    name = "espeak_ng",
+    actual = select({
+        "//:espeak_on": "@espeak_ng//:espeak_ng",
+        "//:espeak_off": ":espeak_ng_empty",
+    }),
+    visibility = ["//visibility:public"],
+)
+
+alias(
+    name = "espeak_ng_data",
+    actual = select({
+        "//:espeak_on": "@espeak_ng//:espeak_ng_data",
+        "//:espeak_off": ":espeak_ng_empty",
+    }),
+    visibility = ["//visibility:public"],
 )
\ No newline at end of file
diff --git a/third_party/espeak_ng/BUILD b/third_party/espeak_ng/BUILD
new file mode 100644
index 0000000000..882deb642f
--- /dev/null
+++ b/third_party/espeak_ng/BUILD
@@ -0,0 +1,26 @@
+#
+# Copyright (c) 2026 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Marks third_party/espeak_ng as a Bazel package so espeak_ng.bzl can be
+# loaded as //third_party/espeak_ng:espeak_ng.bzl from WORKSPACE.
+
+package(default_visibility = ["//visibility:public"])
+
+# Export each file exactly once: espeak_ng.bzl is loaded from WORKSPACE and
+# the patch is referenced by label from new_git_repository() in espeak_ng.bzl.
+exports_files([
+    "espeak_ng.bzl",
+    "out_of_source_phsource.patch",
+])
diff --git a/third_party/espeak_ng/espeak_ng.bzl b/third_party/espeak_ng/espeak_ng.bzl
new file mode 100644
index 0000000000..9b2439b3c2
--- /dev/null
+++ b/third_party/espeak_ng/espeak_ng.bzl
@@ -0,0 +1,164 @@
+#
+# Copyright (c) 2026 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Fetches and builds espeak-ng from source using rules_foreign_cc (CMake)
+# so OVMS no longer relies on a system-installed espeak-ng package.
+#
+# Works on both Linux (libespeak-ng.so + share/espeak-ng-data) and Windows
+# (espeak-ng.dll + espeak-ng-data). The build is gated by the //:espeak
+# build flag; when set to "off", the rule is still defined but no targets
+# in OVMS depend on it.
+
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
+
+# Pinned to espeak-ng release tag 1.52.0 (commit hash).
+_ESPEAK_NG_COMMIT = "212928b394a96e8fd2096616bfd54e17845c48f6"  # 1.52.0
+_ESPEAK_NG_REMOTE = "https://github.com/espeak-ng/espeak-ng.git"
+
+def _is_windows(ctx):
+    return "windows" in ctx.os.name.lower()
+
+def espeak_ng():
+    _espeak_ng_repository(name = "_espeak_ng")
+    new_git_repository(
+        name = "espeak_ng",
+        remote = _ESPEAK_NG_REMOTE,
+        commit = _ESPEAK_NG_COMMIT,
+        build_file = "@_espeak_ng//:BUILD",
+        init_submodules = False,
+        shallow_since = "1709251200 +0000",  # roughly 2024-03-01, around 1.52.0
+        patches = ["@ovms//third_party/espeak_ng:out_of_source_phsource.patch"],
+        patch_args = ["-p1"],
+    )
+
+def _impl(repository_ctx):
+    """Writes the BUILD file that new_git_repository() uses for the espeak-ng checkout.
+
+    Proxy variables are read from the environment (lowercase first) and passed to cmake().
+    """
+    environ = repository_ctx.os.environ
+    http_proxy = environ.get("http_proxy", "") or environ.get("HTTP_PROXY", "")
+    https_proxy = environ.get("https_proxy", "") or environ.get("HTTPS_PROXY", "")
+    jobs_param = "\"-j 8\""  # build parallelism is the same on every platform
+
+    if _is_windows(repository_ctx):
+        # On Windows espeak-ng builds espeak-ng.dll.
+        out_shared = "out_shared_libs = [\"espeak-ng.dll\"],"
+        out_interface = "out_interface_libs = [\"espeak-ng.lib\"],"
+        out_dll_dir = "out_dll_dir = \"bin\","
+        out_lib_dir = "out_lib_dir = \"lib\""
+        platform_cache = """
+            "CMAKE_POSITION_INDEPENDENT_CODE": "ON",
+            "CMAKE_CXX_FLAGS": " /guard:cf /GS /DN_PATH_HOME=1024",
+            "CMAKE_C_FLAGS": " /DN_PATH_HOME=1024",
+            "WIN32": "True",
+        """
+    else:
+        # On Linux espeak-ng produces libespeak-ng.so.<ABI>.
+        out_shared = "out_shared_libs = [\"libespeak-ng.so.1\"],"
+        out_interface = ""
+        out_dll_dir = ""
+        out_lib_dir = "out_lib_dir = \"lib\""
+        platform_cache = """
+            "CMAKE_POSITION_INDEPENDENT_CODE": "ON",
+            "CMAKE_C_FLAGS": " -fPIC -Wno-error -DN_PATH_HOME=1024",
+        """
+
+    # NOTE: braces inside the format string are doubled to escape them.
+    build_file_content = """
+load("@rules_foreign_cc//foreign_cc:cmake.bzl", "cmake")
+
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+    name = "all_srcs",
+    srcs = glob(["**"]),
+    visibility = ["//visibility:public"],
+)
+
+# Build espeak-ng as a minimal text-to-phoneme library; we do not need
+# audio output, MBROLA, klatt, libsonic or the CLI executable at runtime.
+cmake(
+    name = "espeak_ng_cmake",
+    build_args = [
+        "--verbose",
+        "--",
+        {jobs_param}
+    ],
+    cache_entries = {{
+        "CMAKE_BUILD_TYPE": "Release",
+        "BUILD_SHARED_LIBS": "ON",
+        "USE_ASYNC": "OFF",
+        "USE_MBROLA": "OFF",
+        "USE_LIBSONIC": "OFF",
+        "USE_LIBPCAUDIO": "OFF",
+        "USE_KLATT": "OFF",
+        "USE_SPEECHPLAYER": "OFF",
+        "BUILD_ESPEAK_NG_EXE": "OFF",
+        "BUILD_SPEAK_EXE": "OFF",
+        "EXTRA_cmn": "OFF",
+        "EXTRA_ru": "OFF",
+        {platform_cache}
+    }},
+    env = {{
+        "http_proxy": "{http_proxy}",
+        "https_proxy": "{https_proxy}",
+    }},
+    lib_source = ":all_srcs",
+    out_include_dir = "include",
+    {out_lib_dir},
+    {out_shared}
+    {out_interface}
+    {out_dll_dir}
+    out_data_dirs = ["share/espeak-ng-data"],
+    tags = ["requires-network"],
+    visibility = ["//visibility:public"],
+    lib_name = "espeak-ng",
+)
+
+# Headers + shared library suitable for cc_library deps.
+cc_library(
+    name = "espeak_ng",
+    deps = [":espeak_ng_cmake"],
+    visibility = ["//visibility:public"],
+)
+
+# Exposes the espeak-ng-data directory as a filegroup so create_package.sh
+# (or similar packaging) can locate it under bazel-out.
+filegroup(
+    name = "espeak_ng_data",
+    srcs = [":espeak_ng_cmake"],
+    output_group = "gen_dir",
+    visibility = ["//visibility:public"],
+)
+"""
+    repository_ctx.file(
+        "BUILD",
+        build_file_content.format(
+            http_proxy = http_proxy,
+            https_proxy = https_proxy,
+            out_shared = out_shared,
+            out_interface = out_interface,
+            out_dll_dir = out_dll_dir,
+            out_lib_dir = out_lib_dir,
+            platform_cache = platform_cache,
+            jobs_param = jobs_param,
+        ),
+    )
+
+_espeak_ng_repository = repository_rule(
+    implementation = _impl,
+    local = True,
+)
diff --git a/third_party/espeak_ng/out_of_source_phsource.patch b/third_party/espeak_ng/out_of_source_phsource.patch
new file mode 100644
index 0000000000..3cc7849d8c
--- /dev/null
+++ b/third_party/espeak_ng/out_of_source_phsource.patch
@@ -0,0 +1,28 @@
+Replace configure-time file(COPY) of phsource with a symbolic link.
+
+Upstream cmake/data.cmake does:
+
+    file(COPY "${PHONEME_SRC_DIR}" DESTINATION "${DATA_DIST_ROOT}")
+
+CMake's file(COPY) implements a skip-if-newer optimisation that has been
+observed to silently omit individual files when an incremental build runs
+against a previously partially-populated DATA_DIST_ROOT (which is exactly
+what happens with rules_foreign_cc on rebuilds). Replacing the copy with a
+symlink eliminates the entire copy step and guarantees the build always
+sees the exact source phsource tree. phsource is read-only during the
+build, so a symlink is safe.
+
+This patch is OVMS-local; see third_party/espeak_ng/espeak_ng.bzl.
+
+--- a/cmake/data.cmake
++++ b/cmake/data.cmake
+@@ -58,7 +58,8 @@
+ file(MAKE_DIRECTORY "${DICT_TMP_DIR}")
+ file(COPY "${DATA_SRC_DIR}/lang" DESTINATION "${DATA_DIST_DIR}")
+ file(COPY "${DATA_SRC_DIR}/voices/!v" DESTINATION "${DATA_DIST_DIR}/voices")
+-file(COPY "${PHONEME_SRC_DIR}" DESTINATION "${DATA_DIST_ROOT}")
++file(REMOVE_RECURSE "${DATA_DIST_ROOT}/phsource")
++file(CREATE_LINK "${PHONEME_SRC_DIR}" "${DATA_DIST_ROOT}/phsource" SYMBOLIC)
+ 
+ set(ESPEAK_RUN_ENV ${CMAKE_COMMAND} -E env "ESPEAK_DATA_PATH=${DATA_DIST_ROOT}")
+ set(ESPEAK_RUN_CMD ${ESPEAK_RUN_ENV} $ENV{VALGRIND} "$<TARGET_FILE:espeak-ng-bin>")
diff --git a/windows_build.bat b/windows_build.bat
index f2331abee4..fb5b5eedd6 100644
--- a/windows_build.bat
+++ b/windows_build.bat
@@ -49,10 +49,18 @@ IF "%~4"=="--integrity" (
     set "buildWithIntegrity="
 )
 
+:: espeak-ng is built from source via Bazel unless the caller opts out.
+:: Opt out by setting ESPEAK=0 before invoking this script; any other
+:: value (or leaving it unset) keeps the build enabled.
+set "espeakArg=--//:espeak=on"
+IF "%ESPEAK%"=="0" (
+    set "espeakArg=--//:espeak=off"
+)
+
 set "bazelStartupCmd=--output_user_root=!BAZEL_SHORT_PATH!"
 set "openvino_dir=!BAZEL_SHORT_PATH!/openvino/runtime/cmake"
 
-set "buildCommand=bazel %bazelStartupCmd% build  %buildWithIntegrity% %bazelBuildArgs% --action_env OpenVINO_DIR=%openvino_dir% --jobs=%NUMBER_OF_PROCESSORS% --verbose_failures %buildTargets% 2>&1 | tee win_build.log"
+set "buildCommand=bazel %bazelStartupCmd% build  %buildWithIntegrity% %bazelBuildArgs% %espeakArg% --action_env OpenVINO_DIR=%openvino_dir% --jobs=%NUMBER_OF_PROCESSORS% --verbose_failures %buildTargets% 2>&1 | tee win_build.log"
 set "setOvmsVersionCmd=python windows_set_ovms_version.py"
 
 :: Setting PATH environment variable based on default windows node settings: Added ovms_windows specific python settings and c:/opt and removed unused Nvidia and OCL specific tools.
diff --git a/windows_create_package.bat b/windows_create_package.bat
index 533a2f82ee..e11de08e64 100644
--- a/windows_create_package.bat
+++ b/windows_create_package.bat
@@ -98,6 +98,15 @@ if exist %cd%\bazel-out\x64_windows-opt\bin\src\core_tokenizers.dll (
     if !errorlevel! neq 0 exit /b !errorlevel!
 )
 
+:: Bundle espeak-ng DLL + data from the rules_foreign_cc cmake output tree when built
+:: with --//:espeak=on. The DLL is a file (dir /a:-d); the data tree is a directory (dir /a:d).
+for /f "delims=" %%D in ('dir /b /s /a:-d %cd%\bazel-out\x64_windows-opt\bin\external\espeak_ng\espeak-ng.dll 2^>nul') do (
+    copy /Y "%%D" dist\windows\ovms
+)
+for /f "delims=" %%D in ('dir /b /s /a:d %cd%\bazel-out\x64_windows-opt\bin\external\espeak_ng 2^>nul ^| findstr /e "espeak-ng-data"') do (
+    xcopy "%%D" dist\windows\ovms\espeak-ng-data /E /I /H /Y
+)
+
 copy %cd%\setupvars.* dist\windows\ovms
 if !errorlevel! neq 0 exit /b !errorlevel!
 copy %cd%\install_ovms_service.bat dist\windows\ovms

From 6d7e5022c3e22d570e26de6b601c24585878996c Mon Sep 17 00:00:00 2001
From: Michal Kulakowski <michal.kulakowski@intel.com>
Date: Wed, 20 May 2026 14:32:31 +0200
Subject: [PATCH 2/4] Add header

---
 third_party/espeak_ng/out_of_source_phsource.patch | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/third_party/espeak_ng/out_of_source_phsource.patch b/third_party/espeak_ng/out_of_source_phsource.patch
index 3cc7849d8c..8711e1575a 100644
--- a/third_party/espeak_ng/out_of_source_phsource.patch
+++ b/third_party/espeak_ng/out_of_source_phsource.patch
@@ -1,3 +1,16 @@
+Copyright (c) 2026 Intel Corporation
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
 Replace configure-time file(COPY) of phsource with a symbolic link.
 
 Upstream cmake/data.cmake does:

From a482758290137ce31c36f5cffd7c74fb50d6ada0 Mon Sep 17 00:00:00 2001
From: mkulakow <devuser@ov-ptl-13.sclab.intel.com>
Date: Wed, 20 May 2026 17:18:15 +0200
Subject: [PATCH 3/4] fix

---
 src/audio/text_to_speech/t2s_servable.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/audio/text_to_speech/t2s_servable.cpp b/src/audio/text_to_speech/t2s_servable.cpp
index 2c9b1abfe3..2da7263302 100644
--- a/src/audio/text_to_speech/t2s_servable.cpp
+++ b/src/audio/text_to_speech/t2s_servable.cpp
@@ -88,7 +88,10 @@ TtsServable::TtsServable(const std::string& modelDir, const std::string& targetD
         throw std::runtime_error("Error during plugin_config option parsing");
     }
     ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), targetDevice, config);
-    const ov::Shape speakerEmbeddingShape = ttsPipeline->get_speaker_embedding_shape();
+    // SpeechT5 speaker embeddings have a fixed shape of {1, 512}. For Kokoro backend the
+    // speaker embedding tensor is ignored by the pipeline, so this shape is only used to
+    // validate raw float32 voice files loaded for SpeechT5 deployments.
+    const ov::Shape speakerEmbeddingShape{1, 512};
     for (auto voice : graphVoices) {
         std::filesystem::path voicePath(voice.path());
         if (voicePath.is_relative()) {

From 97f95acc2fc4952a11c84b46880eca1bd1e22322 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski <michal.kulakowski@intel.com>
Date: Thu, 21 May 2026 08:55:20 +0200
Subject: [PATCH 4/4] fix

---
 create_package.sh | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/create_package.sh b/create_package.sh
index c2efb67d8b..61110bb6b4 100755
--- a/create_package.sh
+++ b/create_package.sh
@@ -37,6 +37,17 @@ if [ -n "$ESPEAK_DATA_SRC" ] && [ -d "$ESPEAK_DATA_SRC" ] ; then
     mkdir -p /ovms_release/share
     cp -rL "$ESPEAK_DATA_SRC" /ovms_release/share/ ;
 fi
+# ESPEAK_REAL must track the espeak-ng tag pinned in
+# third_party/espeak_ng/espeak_ng.bzl (1.52.0); when bumping espeak-ng,
+# change both places together.
+ESPEAK_REAL=libespeak-ng.so.1.52.0.1
+if [ -f "/ovms_release/lib/$ESPEAK_REAL" ]; then
+    # Replace any existing links with relative symlinks to the real library.
+    for espeak_link in libespeak-ng.so.1 libespeak-ng.so; do
+        rm -f "/ovms_release/lib/$espeak_link"
+        ln -s "$ESPEAK_REAL" "/ovms_release/lib/$espeak_link"
+    done
+fi
 if [ "$FUZZER_BUILD" == "0" ]; then mv /ovms_release/lib/libcustom_node* /ovms_release/lib/custom_nodes/; fi;
 cd /ovms_release/lib/ ; rm -f libcurl.so*
 cd /ovms_release/lib/ ; rm -f libazurestorage.so.* ; ln -s libazurestorage.so libazurestorage.so.7 ;ln -s libazurestorage.so libazurestorage.so.7.5