From 6e4ce522548c1f4d4dab6092cf969d41194d4da9 Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 02:01:51 +0800
Subject: [PATCH 01/10] fix: avoid blocking health during GPU cleanup

---
 app/infra/job_runtime.py       | 15 +++++++++---
 tests/unit/test_job_runtime.py | 45 ++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/app/infra/job_runtime.py b/app/infra/job_runtime.py
index 553bfe2..9ea189a 100644
--- a/app/infra/job_runtime.py
+++ b/app/infra/job_runtime.py
@@ -87,11 +87,14 @@ def flush_torch_cuda_cache(
     """Best-effort CUDA cache flush used around serialized GPU work."""
 
     try:
-        import gc as _gc
-
         import torch as _torch
 
-        _gc.collect()
+        # Full Python GC can hold the GIL long enough to make FastAPI liveness
+        # probes time out after large alignment results. Keep active job
+        # boundaries lightweight; the idle-unload path remains the heavy cleanup
+        # point because it runs after the GPU pipeline has been idle.
+        if phase == "idle-unload":
+            _collect_python_gc()
         if _torch.cuda.is_available():
             _torch.cuda.empty_cache()
     except Exception as exc:  # pragma: no cover - guarded for runtime-only failures
@@ -99,6 +102,12 @@ def flush_torch_cuda_cache(
             logger.warning("%s CUDA cache flush failed: %s", phase, exc)
 
 
+def _collect_python_gc() -> None:
+    import gc as _gc
+
+    _gc.collect()
+
+
 def run_serialized_gpu_work(
     work: Callable[[], _T],
     *,
diff --git a/tests/unit/test_job_runtime.py b/tests/unit/test_job_runtime.py
index c360031..6deca32 100644
--- a/tests/unit/test_job_runtime.py
+++ b/tests/unit/test_job_runtime.py
@@ -2,6 +2,9 @@
 
 from __future__ import annotations
 
+import sys
+from types import SimpleNamespace
+
 import pytest
 
 import infra.job_runtime as job_runtime
@@ -77,6 +80,48 @@ def test_run_serialized_gpu_work_releases_semaphore_after_error(monkeypatch):
     assert events == ["pre-whisper", "pre-whisper", "retry", "post-pipeline"]
 
 
+def test_flush_torch_cuda_cache_skips_python_gc_for_active_job_phases(monkeypatch):
+    events = []
+    fake_torch = SimpleNamespace(
+        cuda=SimpleNamespace(
+            is_available=lambda: True,
+            empty_cache=lambda: events.append("empty_cache"),
+        )
+    )
+
+    monkeypatch.setitem(sys.modules, "torch", fake_torch)
+    monkeypatch.setattr(
+        job_runtime,
+        "_collect_python_gc",
+        lambda: events.append("gc_collect"),
+    )
+
+    job_runtime.flush_torch_cuda_cache(phase="post-pipeline")
+
+    assert events == ["empty_cache"]
+
+
+def test_flush_torch_cuda_cache_keeps_full_gc_for_idle_unload(monkeypatch):
+    events = []
+    fake_torch = SimpleNamespace(
+        cuda=SimpleNamespace(
+            is_available=lambda: True,
+            empty_cache=lambda: events.append("empty_cache"),
+        )
+    )
+
+    monkeypatch.setitem(sys.modules, "torch", fake_torch)
+    monkeypatch.setattr(
+        job_runtime,
+        "_collect_python_gc",
+        lambda: events.append("gc_collect"),
+    )
+
+    job_runtime.flush_torch_cuda_cache(phase="idle-unload")
+
+    assert events == ["gc_collect", "empty_cache"]
+
+
 def test_idle_unload_daemon_disabled_when_timeout_zero():
     pipeline = _FakePipeline(loaded=True)
 

From 947c42ad2e16718afcf7f82ec10dc33eb25d07ef Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 02:10:13 +0800
Subject: [PATCH 02/10] fix: keep ASR runtime on cuDNN9-compatible deps

---
 app/requirements.txt                          |  6 ++++-
 doc/changelog.en.md                           | 11 +++++++---
 doc/changelog.zh.md                           |  8 +++++--
 .../unit/test_dependency_runtime_baseline.py  | 22 +++++++++++++++++++
 4 files changed, 41 insertions(+), 6 deletions(-)
 create mode 100644 tests/unit/test_dependency_runtime_baseline.py

diff --git a/app/requirements.txt b/app/requirements.txt
index ac1185e..a25305b 100755
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -12,7 +12,11 @@ pyannote.metrics>=3.2,<4.0
 pyannote.pipeline>=3.0.1,<4.0
 omegaconf==2.3.0
 antlr4-python3-runtime==4.9.3
-faster-whisper==1.1.0
+# faster-whisper 1.1.0 resolves to a CTranslate2 wheel that looks for cuDNN 8
+# (`libcudnn_ops_infer.so.8`). The Docker base is CUDA 12.4 + cuDNN 9, so keep
+# the ASR runtime on the cuDNN9-compatible CTranslate2 line verified remotely.
+faster-whisper>=1.2.1,<2.0.0
+ctranslate2>=4.7.1,<5.0
 fastapi>=0.115.0
 uvicorn[standard]>=0.30.0
 python-multipart>=0.0.26,<0.0.28
diff --git a/doc/changelog.en.md b/doc/changelog.en.md
index 859ed78..5fd9db7 100644
--- a/doc/changelog.en.md
+++ b/doc/changelog.en.md
@@ -10,9 +10,10 @@
   tighten the public dependency scanning flow.
 - Moved WhisperX alignment from the yanked `3.1.x` package series to
   `whisperx==3.3.1`, with compatible `pyannote.audio==3.3.2` and
-  `faster-whisper==1.1.0` pins, plus bounded `pyannote.*` and `pandas`
-  transitive dependencies, while keeping the current `numpy<2` / SciPy 1.11.x
-  dependency baseline.
+  cuDNN9-compatible `faster-whisper>=1.2.1,<2.0.0` /
+  `ctranslate2>=4.7.1,<5.0`, plus bounded `pyannote.*` and `pandas`
+  transitive dependencies. This keeps the current `numpy<2` / SciPy 1.11.x
+  dependency baseline while avoiding runtime lookups for cuDNN8 libraries.
 
 ### Observability
 
@@ -20,6 +21,10 @@
   diarization, embedding, voiceprint match, enhancement, and pipeline stage
   timing. Logs record only stage, model, elapsed time, and aggregate metrics;
   they do not include filenames, paths, job IDs, speaker IDs, hosts, or tokens.
+- Transcription jobs no longer run full Python GC before/after every GPU job;
+  active job boundaries only clear the CUDA cache, while full GC remains on the
+  idle-unload path. This avoids long GIL holds that can make `/healthz` time out
+  after large alignment results complete.
 
 ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29)
 
diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md
index 0f41a85..bf3bcc5 100644
--- a/doc/changelog.zh.md
+++ b/doc/changelog.zh.md
@@ -9,14 +9,18 @@
 - 更新依赖安全基线与 FOSSA policy test 覆盖，收紧公开依赖扫描流程。
 - 将 WhisperX alignment 从 PyPI yanked 的 `3.1.x` 系列迁移到
   `whisperx==3.3.1`，并配套固定 `pyannote.audio==3.3.2` 与
-  `faster-whisper==1.1.0`，同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界，
-  保持当前 `numpy<2` / SciPy 1.11.x 依赖基线。
+  cuDNN9 兼容的 `faster-whisper>=1.2.1,<2.0.0` /
+  `ctranslate2>=4.7.1,<5.0`，同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界，
+  保持当前 `numpy<2` / SciPy 1.11.x 依赖基线，避免运行时查找 cuDNN8 动态库。
 
 ### 可观测性
 
 - 新增安全的模型加载与转写阶段耗时日志，覆盖 ASR、diarization、embedding、
   voiceprint match、enhancement 与 pipeline stage timing。日志只记录阶段、模型、
   耗时和聚合指标，不记录文件名、路径、job ID、speaker ID、host 或 token。
+- 转写 job 前后不再执行完整 Python GC，只清理 CUDA cache；完整 GC 保留在
+  idle-unload 阶段，避免大段 alignment 结果完成后长时间持有 GIL 导致 `/healthz`
+  超时。
 
 ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29)
 
diff --git a/tests/unit/test_dependency_runtime_baseline.py b/tests/unit/test_dependency_runtime_baseline.py
new file mode 100644
index 0000000..d39c3c3
--- /dev/null
+++ b/tests/unit/test_dependency_runtime_baseline.py
@@ -0,0 +1,22 @@
+"""Regression tests for runtime dependency pins that affect Docker GPU loads."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def _requirements_lines() -> list[str]:
+    """Return app/requirements.txt entries, skipping blank and comment lines."""
+    path = Path(__file__).resolve().parents[2] / "app" / "requirements.txt"
+    text = path.read_text(encoding="utf-8")  # file holds non-ASCII comments
+    return [
+        ln for ln in map(str.strip, text.splitlines()) if ln and not ln.startswith("#")
+    ]
+
+
+def test_faster_whisper_runtime_stays_on_cudnn9_compatible_ctranslate2():
+    lines = _requirements_lines()
+
+    assert "faster-whisper>=1.2.1,<2.0.0" in lines
+    assert "ctranslate2>=4.7.1,<5.0" in lines
+    assert "faster-whisper==1.1.0" not in lines

From 0c04619ec59b4f432271a0c4fb868f2bb9e5d750 Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 02:15:47 +0800
Subject: [PATCH 03/10] fix: install WhisperX without replacing ASR runtime

---
 app/Dockerfile                                 |  5 +++++
 app/requirements.txt                           |  8 ++++----
 doc/changelog.en.md                            |  5 +++--
 doc/changelog.zh.md                            |  5 +++--
 tests/unit/test_dependency_runtime_baseline.py | 17 +++++++++++++++++
 5 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/app/Dockerfile b/app/Dockerfile
index 97d5ae7..37b7b5f 100755
--- a/app/Dockerfile
+++ b/app/Dockerfile
@@ -45,6 +45,11 @@ RUN if [ -n "$PIP_INDEX_URL" ]; then \
     else \
         pip install --no-cache-dir -r requirements.txt; \
     fi
+RUN if [ -n "$PIP_INDEX_URL" ]; then \
+        pip install --no-cache-dir -i "$PIP_INDEX_URL" --no-deps whisperx==3.3.1; \
+    else \
+        pip install --no-cache-dir --no-deps whisperx==3.3.1; \
+    fi
 
 COPY --chown=app:app . .
 
diff --git a/app/requirements.txt b/app/requirements.txt
index a25305b..1cd5b6b 100755
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -38,10 +38,10 @@ scipy>=1.11.4,<1.12.0
 # sqlite-vec: fast on-disk top-k cosine search for voiceprints.
 sqlite-vec>=0.1.0
 # whisperx: forced word-level alignment via wav2vec2.
-# 3.3.1 is the newest non-yanked WhisperX release that still supports the
-# current torch 2.6 / numpy<2 / SciPy 1.11.x runtime without moving to the
-# WhisperX 3.3.3+ numpy>=2 or 3.8.x torch~=2.8 dependency stack.
-whisperx==3.3.1
+# Docker installs whisperx==3.3.1 with --no-deps after this file is resolved.
+# WhisperX 3.3.1 still hard-pins faster-whisper==1.1.0 / ctranslate2<4.5.0,
+# which would replace the cuDNN9-compatible ASR stack above and look for
+# cuDNN8 runtime libraries in the CUDA 12.4 + cuDNN9 base image.
 transformers>=5.0.0,<5.1.0
 # Voice enhancement / noise reduction (optional — loaded only when DENOISE_MODEL != "none")
 deepfilternet>=0.5.6
diff --git a/doc/changelog.en.md b/doc/changelog.en.md
index 5fd9db7..14877f5 100644
--- a/doc/changelog.en.md
+++ b/doc/changelog.en.md
@@ -12,8 +12,9 @@
   `whisperx==3.3.1`, with compatible `pyannote.audio==3.3.2` and
   cuDNN9-compatible `faster-whisper>=1.2.1,<2.0.0` /
   `ctranslate2>=4.7.1,<5.0`, plus bounded `pyannote.*` and `pandas`
-  transitive dependencies. This keeps the current `numpy<2` / SciPy 1.11.x
-  dependency baseline while avoiding runtime lookups for cuDNN8 libraries.
+  transitive dependencies. Docker installs WhisperX with `--no-deps` so its old
+  ASR transitive dependencies cannot replace the current `numpy<2` /
+  SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups.
 
 ### Observability
 
diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md
index bf3bcc5..de1980a 100644
--- a/doc/changelog.zh.md
+++ b/doc/changelog.zh.md
@@ -10,8 +10,9 @@
 - 将 WhisperX alignment 从 PyPI yanked 的 `3.1.x` 系列迁移到
   `whisperx==3.3.1`，并配套固定 `pyannote.audio==3.3.2` 与
   cuDNN9 兼容的 `faster-whisper>=1.2.1,<2.0.0` /
-  `ctranslate2>=4.7.1,<5.0`，同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界，
-  保持当前 `numpy<2` / SciPy 1.11.x 依赖基线，避免运行时查找 cuDNN8 动态库。
+  `ctranslate2>=4.7.1,<5.0`，同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界。
+  Docker 会以 `--no-deps` 安装 WhisperX，避免其旧 ASR 传递依赖覆盖当前
+  `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库。
 
 ### 可观测性
 
diff --git a/tests/unit/test_dependency_runtime_baseline.py b/tests/unit/test_dependency_runtime_baseline.py
index d39c3c3..a0ca920 100644
--- a/tests/unit/test_dependency_runtime_baseline.py
+++ b/tests/unit/test_dependency_runtime_baseline.py
@@ -14,9 +14,26 @@ def _requirements_lines() -> list[str]:
     ]
 
 
+def _dockerfile_text() -> str:
+    root = Path(__file__).resolve().parents[2]  # repo root; decoded as UTF-8 below
+    return (root / "app" / "Dockerfile").read_text(encoding="utf-8")
+
+
 def test_faster_whisper_runtime_stays_on_cudnn9_compatible_ctranslate2():
     lines = _requirements_lines()
 
     assert "faster-whisper>=1.2.1,<2.0.0" in lines
     assert "ctranslate2>=4.7.1,<5.0" in lines
     assert "faster-whisper==1.1.0" not in lines
+
+
+def test_docker_installs_whisperx_without_replacing_asr_runtime_stack():
+    lines = _requirements_lines()
+    dockerfile = _dockerfile_text()
+
+    assert "whisperx==3.3.1" not in lines
+    assert (
+        'pip install --no-cache-dir -i "$PIP_INDEX_URL" --no-deps whisperx==3.3.1'
+        in dockerfile
+    )
+    assert "pip install --no-cache-dir --no-deps whisperx==3.3.1" in dockerfile

From 222ebb291c4e7749f4a2a62a258b2d5cb03f7b02 Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 02:22:53 +0800
Subject: [PATCH 04/10] fix: include WhisperX alignment runtime dependency

---
 app/requirements.txt                           | 1 +
 doc/changelog.en.md                            | 3 ++-
 doc/changelog.zh.md                            | 3 ++-
 tests/unit/test_dependency_runtime_baseline.py | 1 +
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/app/requirements.txt b/app/requirements.txt
index 1cd5b6b..aceccd1 100755
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -42,6 +42,7 @@ sqlite-vec>=0.1.0
 # WhisperX 3.3.1 still hard-pins faster-whisper==1.1.0 / ctranslate2<4.5.0,
 # which would replace the cuDNN9-compatible ASR stack above and look for
 # cuDNN8 runtime libraries in the CUDA 12.4 + cuDNN9 base image.
+nltk>=3.9,<4.0
 transformers>=5.0.0,<5.1.0
 # Voice enhancement / noise reduction (optional — loaded only when DENOISE_MODEL != "none")
 deepfilternet>=0.5.6
diff --git a/doc/changelog.en.md b/doc/changelog.en.md
index 14877f5..fcf573d 100644
--- a/doc/changelog.en.md
+++ b/doc/changelog.en.md
@@ -14,7 +14,8 @@
   `ctranslate2>=4.7.1,<5.0`, plus bounded `pyannote.*` and `pandas`
   transitive dependencies. Docker installs WhisperX with `--no-deps` so its old
   ASR transitive dependencies cannot replace the current `numpy<2` /
-  SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups.
+  SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups;
+  the `nltk` package needed by alignment is installed explicitly.
 
 ### Observability
 
diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md
index de1980a..f594f58 100644
--- a/doc/changelog.zh.md
+++ b/doc/changelog.zh.md
@@ -12,7 +12,8 @@
   cuDNN9 兼容的 `faster-whisper>=1.2.1,<2.0.0` /
   `ctranslate2>=4.7.1,<5.0`，同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界。
   Docker 会以 `--no-deps` 安装 WhisperX，避免其旧 ASR 传递依赖覆盖当前
-  `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库。
+  `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库；
+  alignment 所需的 `nltk` 由运行时依赖显式安装。
 
 ### 可观测性
 
diff --git a/tests/unit/test_dependency_runtime_baseline.py b/tests/unit/test_dependency_runtime_baseline.py
index a0ca920..657ea96 100644
--- a/tests/unit/test_dependency_runtime_baseline.py
+++ b/tests/unit/test_dependency_runtime_baseline.py
@@ -31,6 +31,7 @@ def test_docker_installs_whisperx_without_replacing_asr_runtime_stack():
     lines = _requirements_lines()
     dockerfile = _dockerfile_text()
 
+    assert "nltk>=3.9,<4.0" in lines
     assert "whisperx==3.3.1" not in lines
     assert (
         'pip install --no-cache-dir -i "$PIP_INDEX_URL" --no-deps whisperx==3.3.1'

From 4d3222e6560ff6fdbae6e4482f207eb34f939c72 Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 03:07:24 +0800
Subject: [PATCH 05/10] fix: filter stock outro hallucinations

---
 app/providers/asr/default.py            | 27 +++++++++++++++++++++
 doc/changelog.en.md                     |  3 +++
 doc/changelog.zh.md                     |  3 +++
 tests/unit/test_asr_repetition_guard.py | 31 +++++++++++++++++++++++++
 4 files changed, 64 insertions(+)

diff --git a/app/providers/asr/default.py b/app/providers/asr/default.py
index 24b8a9a..37dd7a3 100644
--- a/app/providers/asr/default.py
+++ b/app/providers/asr/default.py
@@ -16,6 +16,18 @@
     "简体中文输出",
     "以下是普通话的对话",
 )
+_OUTRO_HALLUCINATION_MARKERS = (
+    "请不吝点赞",
+    "点赞",
+    "订阅",
+    "转发",
+    "打赏",
+    "打赏支持",
+    "明镜与点点栏目",
+    "谢谢观看",
+    "感谢观看",
+    "下期再见",
+)
 
 
 def _duration(segment: dict[str, Any]) -> float:
@@ -51,6 +63,17 @@ def _prompt_marker_key(normalized_text: str) -> str:
     return ""
 
 
+def _outro_marker_score(normalized_text: str) -> tuple[int, float]:
+    if not normalized_text:
+        return 0, 0.0
+
+    matched = {
+        marker for marker in _OUTRO_HALLUCINATION_MARKERS if marker in normalized_text
+    }
+    marker_chars = sum(len(marker) for marker in matched)
+    return len(matched), marker_chars / len(normalized_text)
+
+
 def _dominant_repeated_unit(normalized_text: str) -> tuple[str, int, float]:
     """Return the dominant repeated short unit, repeat count, and coverage ratio."""
 
@@ -89,6 +112,10 @@ def _is_single_segment_hallucination(segment: dict[str, Any]) -> bool:
     if duration >= 3.0 and marker_count >= 2 and marker_ratio >= 0.55:
         return True
 
+    outro_count, outro_ratio = _outro_marker_score(normalized)
+    if 3.0 <= duration <= 30.0 and outro_count >= 3 and outro_ratio >= 0.40:
+        return True
+
     unit, repeat_count, repeat_ratio = _dominant_repeated_unit(normalized)
     return (
         bool(unit) and duration >= 12.0 and repeat_count >= 4 and repeat_ratio >= 0.82
diff --git a/doc/changelog.en.md b/doc/changelog.en.md
index fcf573d..8533575 100644
--- a/doc/changelog.en.md
+++ b/doc/changelog.en.md
@@ -27,6 +27,9 @@
   active job boundaries only clear the CUDA cache, while full GC remains on the
   idle-unload path. This avoids long GIL holds that can make `/healthz` time out
   after large alignment results complete.
+- The ASR hallucination guard now filters short single-segment stock outros
+  dominated by markers such as "like / subscribe / repost / tip", while keeping
+  normal contextual words in longer meeting transcripts.
 
 ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29)
 
diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md
index f594f58..735cc84 100644
--- a/doc/changelog.zh.md
+++ b/doc/changelog.zh.md
@@ -23,6 +23,9 @@
 - 转写 job 前后不再执行完整 Python GC，只清理 CUDA cache；完整 GC 保留在
   idle-unload 阶段，避免大段 alignment 结果完成后长时间持有 GIL 导致 `/healthz`
   超时。
+- ASR hallucination guard 新增短单段 stock outro 过滤，拦截
+  “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉，同时保留
+  长会议中的正常上下文词。
 
 ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29)
 
diff --git a/tests/unit/test_asr_repetition_guard.py b/tests/unit/test_asr_repetition_guard.py
index 7d5adbf..96f0970 100644
--- a/tests/unit/test_asr_repetition_guard.py
+++ b/tests/unit/test_asr_repetition_guard.py
@@ -49,6 +49,37 @@ def test_suppresses_long_repeated_non_prompt_segment():
     assert report["removed_duration"] == 20.0
 
 
+def test_suppresses_single_segment_stock_outro_hallucination():
+    segments = [
+        {
+            "start": 0.438,
+            "end": 18.091,
+            "text": "请不吝点赞 订阅 转发 打赏支持明镜与点点栏目",
+        }
+    ]
+
+    filtered, report = suppress_repetition_hallucinations(segments)
+
+    assert filtered == []
+    assert report["removed_segment_count"] == 1
+    assert report["removed_duration"] == 17.653
+
+
+def test_keeps_contextual_subscribe_word_in_normal_segment():
+    segments = [
+        {
+            "start": 0.0,
+            "end": 8.0,
+            "text": "这个功能里订阅提醒只是用户消息设置的一部分，后面还有支付通知。",
+        }
+    ]
+
+    filtered, report = suppress_repetition_hallucinations(segments)
+
+    assert filtered == segments
+    assert report["removed_segment_count"] == 0
+
+
 def test_keeps_normal_short_repetition_below_hallucination_thresholds():
     segments = [
         {"start": 0.0, "end": 1.0, "text": "对"},

From 70dba4090925eb021b1ee5eb6ceca8f93c42c8e6 Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 03:57:18 +0800
Subject: [PATCH 06/10] fix: isolate whisperx alignment runtime

---
 .env.example                                |  5 ++
 app/config.py                               |  1 +
 app/pipeline/orchestrator.py                | 15 +++-
 app/providers/diarization/default.py        | 89 +++++++++++++++++----
 doc/changelog.en.md                         |  5 ++
 doc/changelog.zh.md                         |  3 +
 doc/configuration.en.md                     |  1 +
 doc/configuration.zh.md                     |  1 +
 doc/quickstart.en.md                        | 15 ++--
 doc/quickstart.zh.md                        | 12 +--
 tests/unit/test_pipeline_model_lifecycle.py | 11 +++
 tests/unit/test_provider_registry.py        | 74 +++++++++++++++++
 12 files changed, 205 insertions(+), 27 deletions(-)

diff --git a/.env.example b/.env.example
index 0a5b034..d7f2cd9 100644
--- a/.env.example
+++ b/.env.example
@@ -86,6 +86,11 @@ HF_HUB_ETAG_TIMEOUT=3
 # want to complete jobs without word-level timestamps for selected languages.
 WHISPERX_ALIGN_DISABLED_LANGUAGES=
 
+# Alignment defaults to CPU to isolate wav2vec2 alignment from the GPU ASR and
+# speaker-embedding runtimes. Set to pipeline/asr/cuda/cuda:0 only if you have
+# validated the target CUDA stack is stable for WhisperX alignment.
+WHISPERX_ALIGN_DEVICE=cpu
+
 # Optional comma-separated language=model overrides.
 # Example: WHISPERX_ALIGN_MODEL_MAP=zh=your-org/your-zh-align-model
 WHISPERX_ALIGN_MODEL_MAP=
diff --git a/app/config.py b/app/config.py
index 7460c2e..9e1f665 100644
--- a/app/config.py
+++ b/app/config.py
@@ -96,6 +96,7 @@ def _env_mapping(name: str) -> dict[str, str]:
 
 # WhisperX forced-alignment controls. Languages are attempted by default; use
 # WHISPERX_ALIGN_DISABLED_LANGUAGES only for an explicit operational fallback.
+WHISPERX_ALIGN_DEVICE: str = _env_str("WHISPERX_ALIGN_DEVICE", "cpu").lower()
 WHISPERX_ALIGN_DISABLED_LANGUAGES: frozenset[str] = _env_csv_set(
     "WHISPERX_ALIGN_DISABLED_LANGUAGES",
     "",
diff --git a/app/pipeline/orchestrator.py b/app/pipeline/orchestrator.py
index 1f6bec0..a808c2e 100644
--- a/app/pipeline/orchestrator.py
+++ b/app/pipeline/orchestrator.py
@@ -266,6 +266,10 @@ def __init__(
         self._whisper_device = None
         self._diarization_device = None
         self._embedding_device = None
+        self._alignment_cache_key = None
+        self._alignment_device = None
+        self._alignment_model = None
+        self._alignment_metadata = None
         self.model_size = model_size or WHISPER_MODEL
         self.hf_token = hf_token or HF_TOKEN
         self._whisper = None
@@ -284,16 +288,25 @@ def runner(self) -> PipelineRunner:
     def has_loaded_models(self) -> bool:
         return any(
             getattr(self, name, None) is not None
-            for name in ("_whisper", "_diarization", "_embedding_model")
+            for name in (
+                "_whisper",
+                "_diarization",
+                "_embedding_model",
+                "_alignment_model",
+            )
         )
 
     def unload_models(self) -> None:
         self._whisper = None
         self._diarization = None
         self._embedding_model = None
+        self._alignment_model = None
+        self._alignment_metadata = None
+        self._alignment_cache_key = None
         self._whisper_device = None
         self._diarization_device = None
         self._embedding_device = None
+        self._alignment_device = None
 
     def _select_device_for_lazy_load(self, device_attr: str) -> str:
         selected_device = getattr(self, device_attr, None)
diff --git a/app/providers/diarization/default.py b/app/providers/diarization/default.py
index 3f2021d..2afa3ab 100644
--- a/app/providers/diarization/default.py
+++ b/app/providers/diarization/default.py
@@ -12,6 +12,7 @@
 from typing import Any
 
 from config import (
+    WHISPERX_ALIGN_DEVICE,
     WHISPERX_ALIGN_CACHE_ONLY,
     WHISPERX_ALIGN_DISABLED_LANGUAGES,
     WHISPERX_ALIGN_MODEL_DIR,
@@ -107,6 +108,35 @@ def _alignment_disabled(language: str) -> bool:
     )
 
 
+def _resolve_alignment_device(pipeline) -> str:
+    configured = (WHISPERX_ALIGN_DEVICE or "cpu").strip().lower()
+    if configured in {"pipeline", "asr"}:
+        return str(getattr(pipeline, "device", "cpu") or "cpu")
+    if configured == "auto":
+        selector = getattr(pipeline, "_select_device_for_lazy_load", None)
+        if callable(selector):
+            return str(selector("_alignment_device"))
+        return str(getattr(pipeline, "device", "cpu") or "cpu")
+    return configured or "cpu"
+
+
+def _alignment_cache_key(
+    *,
+    language: str,
+    model_name: str | None,
+    model_source: str,
+    device: str,
+) -> tuple[str, str | None, str, str | None, bool, str]:
+    return (
+        language,
+        model_name,
+        model_source,
+        WHISPERX_ALIGN_MODEL_DIR,
+        WHISPERX_ALIGN_CACHE_ONLY,
+        device,
+    )
+
+
 def _language_disabled_hint(language: str) -> str:
     return (
         f"Remove {language} from WHISPERX_ALIGN_DISABLED_LANGUAGES to retry "
@@ -248,32 +278,61 @@ def align_diarized_segments_with_metadata(
         preflight_message = _torch_preflight_message(language, model_name)
         if preflight_message:
             logger.info(preflight_message)
+        alignment_device = _resolve_alignment_device(pipeline)
         audio = whisperx.load_audio(audio_path)
         load_kwargs = _load_align_model_kwargs(
             whisperx.load_align_model,
             language,
-            pipeline.device,
+            alignment_device,
         )
-        load_started = time.perf_counter()
-        with _cache_only_alignment_environment():
-            align_model, align_metadata = whisperx.load_align_model(
-                **load_kwargs,
-            )
-        logger.info(
-            "Loaded WhisperX alignment model in %.2fs "
-            "(cold_load=True, language=%s, model_source=%s, device=%s)",
-            time.perf_counter() - load_started,
-            language,
-            model_source,
-            pipeline.device,
+        cache_key = _alignment_cache_key(
+            language=language,
+            model_name=model_name,
+            model_source=model_source,
+            device=alignment_device,
         )
+        cached_key = getattr(pipeline, "_alignment_cache_key", None)
+        align_model = getattr(pipeline, "_alignment_model", None)
+        align_metadata = getattr(pipeline, "_alignment_metadata", None)
+        if (
+            cached_key == cache_key
+            and align_model is not None
+            and align_metadata is not None
+        ):
+            logger.info(
+                "Reusing WhisperX alignment model (hot reuse, language=%s, model_source=%s, device=%s)",
+                language,
+                model_source,
+                alignment_device,
+            )
+        else:
+            setattr(pipeline, "_alignment_model", None)
+            setattr(pipeline, "_alignment_metadata", None)
+            setattr(pipeline, "_alignment_cache_key", None)
+            load_started = time.perf_counter()
+            with _cache_only_alignment_environment():
+                align_model, align_metadata = whisperx.load_align_model(
+                    **load_kwargs,
+                )
+            setattr(pipeline, "_alignment_model", align_model)
+            setattr(pipeline, "_alignment_metadata", align_metadata)
+            setattr(pipeline, "_alignment_cache_key", cache_key)
+            setattr(pipeline, "_alignment_device", alignment_device)
+            logger.info(
+                "Loaded WhisperX alignment model in %.2fs "
+                "(cold_load=True, language=%s, model_source=%s, device=%s)",
+                time.perf_counter() - load_started,
+                language,
+                model_source,
+                alignment_device,
+            )
         processing_started = time.perf_counter()
         aligned_result = whisperx.align(
             segments,
             align_model,
             align_metadata,
             audio,
-            pipeline.device,
+            alignment_device,
             return_char_alignments=False,
         )
         processing_elapsed_s = time.perf_counter() - processing_started
@@ -283,7 +342,7 @@ def align_diarized_segments_with_metadata(
             processing_elapsed_s,
             language,
             len(segments),
-            pipeline.device,
+            alignment_device,
         )
         logger.info("WhisperX forced alignment succeeded for language=%s", language)
         metadata = {
diff --git a/doc/changelog.en.md b/doc/changelog.en.md
index 8533575..961d0c9 100644
--- a/doc/changelog.en.md
+++ b/doc/changelog.en.md
@@ -27,6 +27,11 @@
   active job boundaries only clear the CUDA cache, while full GC remains on the
   idle-unload path. This avoids long GIL holds that can make `/healthz` time out
   after large alignment results complete.
+- WhisperX forced-alignment models are now cached by language / model / device
+  and default to `WHISPERX_ALIGN_DEVICE=cpu`, isolating alignment from GPU ASR,
+  diarization, and embedding runtimes. Operators can explicitly set
+  `pipeline`, `asr`, `cuda`, or `cuda:0` after validating CUDA alignment
+  stability.
 - The ASR hallucination guard now filters short single-segment stock outros
   dominated by markers such as "like / subscribe / repost / tip", while keeping
   normal contextual words in longer meeting transcripts.
diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md
index 735cc84..ba6d7fe 100644
--- a/doc/changelog.zh.md
+++ b/doc/changelog.zh.md
@@ -23,6 +23,9 @@
 - 转写 job 前后不再执行完整 Python GC，只清理 CUDA cache；完整 GC 保留在
   idle-unload 阶段，避免大段 alignment 结果完成后长时间持有 GIL 导致 `/healthz`
   超时。
+- WhisperX forced alignment 模型现在按语言 / 模型 / 设备缓存复用，并默认通过
+  `WHISPERX_ALIGN_DEVICE=cpu` 与 GPU ASR、diarization、embedding 运行时隔离；
+  需要 CUDA alignment 时可显式设为 `pipeline`、`asr`、`cuda` 或 `cuda:0`。
 - ASR hallucination guard 新增短单段 stock outro 过滤，拦截
   “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉，同时保留
   长会议中的正常上下文词。
diff --git a/doc/configuration.en.md b/doc/configuration.en.md
index d9fb5b0..d490bbe 100644
--- a/doc/configuration.en.md
+++ b/doc/configuration.en.md
@@ -120,6 +120,7 @@ use `deepfilternet`; `noisereduce` runs whenever it is selected.
 | API `min_speakers` / `max_speakers` | `0` | Per-request speaker-count bounds. `0` means auto and is not passed to pyannote. |
 | `PYANNOTE_MIN_DURATION_OFF` | `0.5` | pyannote `_binarize.min_duration_off`, used to merge short pauses and reduce over-segmentation. If the pyannote object does not support it, the service logs a warning and continues. |
 | `WHISPERX_ALIGN_DISABLED_LANGUAGES` | empty | Comma-separated languages that skip forced alignment when no model override is present. Use only as a temporary operational fallback. |
+| `WHISPERX_ALIGN_DEVICE` | `cpu` | Runtime device for WhisperX forced alignment. CPU is the default to isolate wav2vec2 alignment from GPU ASR / speaker-embedding runtimes; set to `pipeline` / `asr` / `cuda` / `cuda:0` only after validating CUDA alignment stability. |
 | `WHISPERX_ALIGN_MODEL_MAP` | empty | Comma-separated `lang=model` overrides, for example `zh=org/model`. |
 | `WHISPERX_ALIGN_MODEL_DIR` | empty | Optional alignment model directory; passed through only when the installed WhisperX supports that parameter. |
 | `WHISPERX_ALIGN_CACHE_ONLY` | `0` | When `1`, requests cache-only alignment model loading, only when supported by the installed WhisperX. |
diff --git a/doc/configuration.zh.md b/doc/configuration.zh.md
index 5a8ca32..88b50b3 100644
--- a/doc/configuration.zh.md
+++ b/doc/configuration.zh.md
@@ -112,6 +112,7 @@ v0.7.5 默认面向干净会议录音，因此 `DENOISE_MODEL=none`。只有噪
 | API `min_speakers` / `max_speakers` | `0` | 请求级说话人数约束；`0` 表示自动，不传入 pyannote。 |
 | `PYANNOTE_MIN_DURATION_OFF` | `0.5` | pyannote `_binarize.min_duration_off`，用于合并短暂停顿、减少过度切分。若当前 pyannote 对象不支持该属性，服务会记录警告并继续运行。 |
 | `WHISPERX_ALIGN_DISABLED_LANGUAGES` | 空 | 逗号分隔语言列表；命中且没有模型覆盖时跳过 forced alignment。只建议作为临时降级开关。 |
+| `WHISPERX_ALIGN_DEVICE` | `cpu` | WhisperX forced alignment 的运行设备。默认 CPU，用于隔离 wav2vec2 alignment 与 GPU ASR / speaker embedding 运行时；确认 CUDA alignment 稳定后可设为 `pipeline` / `asr` / `cuda` / `cuda:0`。 |
 | `WHISPERX_ALIGN_MODEL_MAP` | 空 | 逗号分隔 `lang=model` 覆盖，例如 `zh=org/model`。 |
 | `WHISPERX_ALIGN_MODEL_DIR` | 空 | 可选 alignment 模型目录；仅在当前 WhisperX 版本支持该参数时透传。 |
 | `WHISPERX_ALIGN_CACHE_ONLY` | `0` | 为 `1` 时，请求 WhisperX 只使用缓存加载 alignment 模型；仅在当前 WhisperX 版本支持时透传。 |
diff --git a/doc/quickstart.en.md b/doc/quickstart.en.md
index 0e327eb..9be0076 100644
--- a/doc/quickstart.en.md
+++ b/doc/quickstart.en.md
@@ -204,6 +204,7 @@ A few worth knowing about:
 | `MIN_EMBED_DURATION` | `1.5` | Minimum diarization turn duration used for speaker embedding extraction |
 | `MAX_EMBED_DURATION` | `10.0` | Maximum per-turn audio window used for speaker embedding extraction |
 | `WHISPERX_ALIGN_DISABLED_LANGUAGES` | empty | Comma-separated languages that explicitly skip WhisperX forced alignment; use only as a temporary operational fallback |
+| `WHISPERX_ALIGN_DEVICE` | `cpu` | Runtime device for WhisperX forced alignment; CPU is the default to keep alignment isolated from GPU ASR / speaker-embedding runtimes |
 | `WHISPERX_ALIGN_MODEL_MAP` | empty | Comma-separated `lang=model` overrides, for example `zh=your-org/your-zh-align-model` |
 | `WHISPERX_ALIGN_MODEL_DIR` | empty | Optional alignment model cache directory passed through when the installed WhisperX supports it |
 | `WHISPERX_ALIGN_CACHE_ONLY` | `0` | Set to 1 to request cache-only alignment model loading when supported by the installed WhisperX version |
@@ -217,12 +218,14 @@ For every supported setting, the Whisper / ASR parameters that are not exposed
 as env yet, and AS-norm cohort preservation semantics, see
 [`configuration.en.md`](./configuration.en.md).
 
-Chinese word-level alignment is attempted by default. The Docker image uses
-PyTorch 2.6.0 so recent transformers safety checks can load the default
-Chinese `.bin` alignment weights. If you run a custom image with older torch,
-use torch>=2.6 or a trusted replacement alignment model that provides
-safetensors; only set `WHISPERX_ALIGN_DISABLED_LANGUAGES=zh` if you
-intentionally want a temporary segment-level fallback.
+Chinese word-level alignment is attempted by default and runs on CPU by
+default to keep wav2vec2 alignment isolated from GPU ASR / speaker-embedding
+runtimes. The Docker image uses PyTorch 2.6.0 so recent transformers safety
+checks can load the default Chinese `.bin` alignment weights. If you run a
+custom image with older torch, use torch>=2.6 or a trusted replacement
+alignment model that provides safetensors; only set
+`WHISPERX_ALIGN_DISABLED_LANGUAGES=zh` if you intentionally want a temporary
+segment-level fallback.
 
 ### Host directory ownership
 
diff --git a/doc/quickstart.zh.md b/doc/quickstart.zh.md
index e02133e..a62634e 100644
--- a/doc/quickstart.zh.md
+++ b/doc/quickstart.zh.md
@@ -178,6 +178,7 @@ HF_ENDPOINT=https://hf-mirror.com
 | `MIN_EMBED_DURATION` | `1.5` | 提取 speaker embedding 时接受的最短 diarization turn 时长 |
 | `MAX_EMBED_DURATION` | `10.0` | 提取 speaker embedding 时单个 turn 使用的最长音频窗口 |
 | `WHISPERX_ALIGN_DISABLED_LANGUAGES` | 空 | 逗号分隔的显式跳过 forced alignment 语言；只建议作为临时运营降级开关 |
+| `WHISPERX_ALIGN_DEVICE` | `cpu` | WhisperX forced alignment 运行设备；默认 CPU，避免 alignment 与 GPU ASR / speaker embedding 运行时互相影响 |
 | `WHISPERX_ALIGN_MODEL_MAP` | 空 | 逗号分隔的 `lang=model` 覆盖，例如 `zh=your-org/your-zh-align-model` |
 | `WHISPERX_ALIGN_MODEL_DIR` | 空 | 可选 alignment 模型缓存目录；当前 WhisperX 支持时会透传 |
 | `WHISPERX_ALIGN_CACHE_ONLY` | `0` | 设为 1 时，在当前 WhisperX 版本支持的情况下只从缓存加载 alignment 模型 |
@@ -189,11 +190,12 @@ HF_ENDPOINT=https://hf-mirror.com
 所有可用配置项、哪些 Whisper / ASR 参数尚未暴露为 env，以及 AS-norm cohort
 保护语义，见 [`configuration.zh.md`](./configuration.zh.md)。
 
-中文词级 alignment 默认会尝试执行。Docker 镜像使用 PyTorch 2.6.0，可满足
-transformers 新安全检查对默认中文 `.bin` alignment 权重的加载要求。如果你使用
-自定义镜像且 torch 低于 2.6，请升级到 torch>=2.6，或改用提供 safetensors 的可信
-替代 alignment 模型；只有确认要临时降级到段级时间戳时，才设置
-`WHISPERX_ALIGN_DISABLED_LANGUAGES=zh`。
+中文词级 alignment 默认会尝试执行，并默认在 CPU 上运行，以避免 wav2vec2
+alignment 与 GPU ASR / speaker embedding 运行时互相影响。Docker 镜像使用
+PyTorch 2.6.0，可满足 transformers 新安全检查对默认中文 `.bin` alignment 权重的
+加载要求。如果你使用自定义镜像且 torch 低于 2.6，请升级到 torch>=2.6，或改用
+提供 safetensors 的可信替代 alignment 模型；只有确认要临时降级到段级时间戳时，
+才设置 `WHISPERX_ALIGN_DISABLED_LANGUAGES=zh`。
 
 ### 宿主目录所有者
 
diff --git a/tests/unit/test_pipeline_model_lifecycle.py b/tests/unit/test_pipeline_model_lifecycle.py
index 29377f9..dbfba9c 100644
--- a/tests/unit/test_pipeline_model_lifecycle.py
+++ b/tests/unit/test_pipeline_model_lifecycle.py
@@ -24,6 +24,10 @@ def _new_pipeline(*, device="cuda"):
     pipeline._whisper_device = None
     pipeline._diarization_device = None
     pipeline._embedding_device = None
+    pipeline._alignment_cache_key = None
+    pipeline._alignment_device = None
+    pipeline._alignment_model = None
+    pipeline._alignment_metadata = None
     pipeline.model_size = "tiny"
     pipeline.hf_token = None
     pipeline._whisper = None
@@ -49,6 +53,9 @@ def test_unload_models_drops_loaded_references_without_selecting_device(monkeypa
     pipeline._whisper = object()
     pipeline._diarization = object()
     pipeline._embedding_model = object()
+    pipeline._alignment_model = object()
+    pipeline._alignment_metadata = object()
+    pipeline._alignment_cache_key = ("zh", None, "default", None, False, "cpu")
     calls = []
 
     monkeypatch.setattr(
@@ -65,6 +72,9 @@ def test_unload_models_drops_loaded_references_without_selecting_device(monkeypa
     assert pipeline._whisper is None
     assert pipeline._diarization is None
     assert pipeline._embedding_model is None
+    assert pipeline._alignment_model is None
+    assert pipeline._alignment_metadata is None
+    assert pipeline._alignment_cache_key is None
     assert calls == []
 
 
@@ -333,6 +343,7 @@ def test_unload_models_clears_per_model_devices_and_reload_reselects(monkeypatch
     assert pipeline._whisper_device is None
     assert pipeline._diarization_device is None
     assert pipeline._embedding_device is None
+    assert pipeline._alignment_device is None
 
     _ = pipeline.whisper
     assert pipeline._whisper_device == "cuda:1"
diff --git a/tests/unit/test_provider_registry.py b/tests/unit/test_provider_registry.py
index 8ed7d4d..711376b 100644
--- a/tests/unit/test_provider_registry.py
+++ b/tests/unit/test_provider_registry.py
@@ -612,6 +612,80 @@ def fake_load_align_model(language_code, device):
     assert result.dedup_removed == 0
 
 
+def test_default_diarization_provider_caches_alignment_model_on_configured_device(
+    monkeypatch,
+    caplog,
+):
+    pipeline = TranscriptionPipeline.__new__(TranscriptionPipeline)
+    pipeline.device = "cuda:1"  # deliberately differs from the cpu align device
+    calls = []
+
+    class FakeDiarizationResult:
+        def itertracks(self, yield_label=False):
+            assert yield_label is True
+            yield SimpleNamespace(start=0.0, end=1.2), None, "SPEAKER_00"
+
+    class FakeDiarizer:
+        def __call__(self, audio_path, **kwargs):
+            return FakeDiarizationResult()
+
+    pipeline._diarization = FakeDiarizer()
+    monkeypatch.setattr(diarization_default, "WHISPERX_ALIGN_DEVICE", "cpu")
+    whisperx = sys.modules["whisperx"]
+    monkeypatch.setattr(
+        whisperx,
+        "load_audio",
+        lambda audio_path: f"audio:{audio_path}",
+        raising=False,
+    )
+
+    def fake_load_align_model(language_code, device):
+        calls.append(("load_align_model", language_code, device))
+        return object(), {"language": language_code, "device": device}
+
+    def fake_align(
+        segments,
+        align_model,
+        align_metadata,
+        audio,
+        device,
+        return_char_alignments=False,
+    ):
+        calls.append(("align", align_metadata["device"], device))
+        return {"segments": segments}
+
+    monkeypatch.setattr(
+        whisperx,
+        "load_align_model",
+        fake_load_align_model,
+        raising=False,
+    )
+    monkeypatch.setattr(whisperx, "align", fake_align, raising=False)
+
+    request = DiarizationRequest(
+        pipeline=pipeline,
+        audio_path="demo.wav",
+        transcription_result={
+            "segments": [{"start": 0.0, "end": 1.2, "text": "你好"}],
+            "language": "zh",
+        },
+    )
+
+    with caplog.at_level("INFO", logger=diarization_default.logger.name):
+        default_diarization_provider.diarize(request)
+        default_diarization_provider.diarize(request)  # second run: cache hit
+
+    assert calls == [
+        ("load_align_model", "zh", "cpu"),
+        ("align", "cpu", "cpu"),
+        ("align", "cpu", "cpu"),
+    ]
+    assert "Loaded WhisperX alignment model" in caplog.text
+    assert "cold_load=True" in caplog.text
+    assert "Reusing WhisperX alignment model (hot reuse" in caplog.text
+    assert pipeline._alignment_device == "cpu"
+
+
 def test_default_diarization_provider_skips_zh_alignment_when_explicitly_disabled(
     monkeypatch,
 ):

From 047c7eac016d7b8e81ba31889bbf30ae6a284992 Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 05:18:52 +0800
Subject: [PATCH 07/10] fix: widen stock outro hallucination guard

---
 app/providers/asr/default.py            |  2 +-
 doc/changelog.en.md                     |  5 +++--
 doc/changelog.zh.md                     |  4 ++--
 tests/unit/test_asr_repetition_guard.py | 16 ++++++++++++++++
 4 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/app/providers/asr/default.py b/app/providers/asr/default.py
index 37dd7a3..e37a49c 100644
--- a/app/providers/asr/default.py
+++ b/app/providers/asr/default.py
@@ -113,7 +113,7 @@ def _is_single_segment_hallucination(segment: dict[str, Any]) -> bool:
         return True
 
     outro_count, outro_ratio = _outro_marker_score(normalized)
-    if 3.0 <= duration <= 30.0 and outro_count >= 3 and outro_ratio >= 0.40:
+    if 3.0 <= duration <= 60.0 and outro_count >= 3 and outro_ratio >= 0.40:
         return True
 
     unit, repeat_count, repeat_ratio = _dominant_repeated_unit(normalized)
diff --git a/doc/changelog.en.md b/doc/changelog.en.md
index 961d0c9..a5b4f4f 100644
--- a/doc/changelog.en.md
+++ b/doc/changelog.en.md
@@ -33,8 +33,9 @@
   `pipeline`, `asr`, `cuda`, or `cuda:0` after validating CUDA alignment
   stability.
 - The ASR hallucination guard now filters short single-segment stock outros
-  dominated by markers such as "like / subscribe / repost / tip", while keeping
-  normal contextual words in longer meeting transcripts.
+  dominated by markers such as "like / subscribe / repost / tip", including raw
+  ASR segments that are slightly over 30 seconds, while keeping normal
+  contextual words in longer meeting transcripts.
 
 ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29)
 
diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md
index ba6d7fe..7e67ea8 100644
--- a/doc/changelog.zh.md
+++ b/doc/changelog.zh.md
@@ -27,8 +27,8 @@
   `WHISPERX_ALIGN_DEVICE=cpu` 与 GPU ASR、diarization、embedding 运行时隔离；
   需要 CUDA alignment 时可显式设为 `pipeline`、`asr`、`cuda` 或 `cuda:0`。
 - ASR hallucination guard 新增短单段 stock outro 过滤，拦截
-  “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉，同时保留
-  长会议中的正常上下文词。
+  “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉，并覆盖
+  ASR 原始段略超过 30 秒的边界样本，同时保留长会议中的正常上下文词。
 
 ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29)
 
diff --git a/tests/unit/test_asr_repetition_guard.py b/tests/unit/test_asr_repetition_guard.py
index 96f0970..1ae9c0b 100644
--- a/tests/unit/test_asr_repetition_guard.py
+++ b/tests/unit/test_asr_repetition_guard.py
@@ -65,6 +65,22 @@ def test_suppresses_single_segment_stock_outro_hallucination():
     assert report["removed_duration"] == 17.653
 
 
+def test_suppresses_stock_outro_when_raw_asr_segment_is_slightly_over_30s():
+    segments = [
+        {
+            "start": 0.0,
+            "end": 30.36,  # just past the guard's former 30.0 s upper bound
+            "text": "请不吝点赞 订阅 转发 打赏支持明镜与点点栏目",
+        }
+    ]
+
+    filtered, report = suppress_repetition_hallucinations(segments)
+
+    assert filtered == []
+    assert report["removed_segment_count"] == 1
+    assert report["removed_duration"] == 30.36
+
+
 def test_keeps_contextual_subscribe_word_in_normal_segment():
     segments = [
         {

From 4839fed3af240552c92aa313f58d76680742cb06 Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 06:15:12 +0800
Subject: [PATCH 08/10] fix: stabilize embedding audio slicing

---
 app/providers/embedding/default.py   |  62 ++++++++++++----
 app/requirements.txt                 |   1 +
 doc/changelog.en.md                  |   4 ++
 doc/changelog.zh.md                  |   3 +
 tests/unit/test_provider_registry.py | 102 ++++++++++++++++++++++++++-
 5 files changed, 156 insertions(+), 16 deletions(-)

diff --git a/app/providers/embedding/default.py b/app/providers/embedding/default.py
index 15ec0f7..86f8b59 100644
--- a/app/providers/embedding/default.py
+++ b/app/providers/embedding/default.py
@@ -6,6 +6,8 @@
 import time
 
 import numpy as np
+import soundfile as sf
+import torch
 import torchaudio
 
 from config import MAX_EMBED_DURATION, MIN_EMBED_DURATION
@@ -18,6 +20,22 @@
 logger = logging.getLogger(__name__)
 
 
+def _load_full_waveform(audio_path: str):
+    """Load normalized audio once with libsndfile to avoid per-turn torch decode."""
+
+    load_started = time.perf_counter()
+    data, sample_rate = sf.read(audio_path, dtype="float32", always_2d=True)
+    waveform = torch.from_numpy(data.T.copy())  # transpose to (channels, frames)
+    logger.info(
+        "embedding_audio_load_timing backend=soundfile elapsed_s=%.3f sample_rate=%d channels=%d frames=%d",
+        time.perf_counter() - load_started,
+        sample_rate,
+        waveform.shape[0],
+        waveform.shape[1],
+    )
+    return waveform, sample_rate
+
+
 def extract_embeddings_for_turns(
     pipeline,
     audio_path: str,
@@ -25,8 +43,16 @@ def extract_embeddings_for_turns(
 ) -> dict[str, np.ndarray]:
     """Extract averaged embeddings for each speaker cluster."""
 
-    info = torchaudio.info(audio_path)
-    native_sr = info.sample_rate
+    waveform = None
+    try:
+        waveform, native_sr = _load_full_waveform(audio_path)
+    except Exception as exc:
+        logger.warning(
+            "Falling back to torchaudio segment loading for embedding audio: %s",
+            exc,
+        )
+        info = torchaudio.info(audio_path)
+        native_sr = info.sample_rate
     target_sr = 16000
     min_samples = int(MIN_EMBED_DURATION * native_sr)
     max_samples = int(MAX_EMBED_DURATION * native_sr)
@@ -43,19 +69,25 @@ def extract_embeddings_for_turns(
         if num_frames > max_samples:
             num_frames = max_samples
 
-        try:
-            chunk, chunk_sr = torchaudio.load(
-                audio_path,
-                frame_offset=start_sample,
-                num_frames=num_frames,
-            )
-        except Exception as exc:
-            logger.warning(
-                "Failed to load embedding audio segment [%d:%d]: %s",
-                start_sample,
-                end_sample,
-                exc,
-            )
+        if waveform is not None:
+            chunk = waveform[:, start_sample : start_sample + num_frames].contiguous()
+            chunk_sr = native_sr
+        else:
+            try:
+                chunk, chunk_sr = torchaudio.load(
+                    audio_path,
+                    frame_offset=start_sample,
+                    num_frames=num_frames,
+                )
+            except Exception as exc:
+                logger.warning(
+                    "Failed to load embedding audio segment [%d:%d]: %s",
+                    start_sample,
+                    end_sample,
+                    exc,
+                )
+                continue
+        if chunk.shape[1] <= 0:
             continue
 
         if chunk_sr != target_sr:
diff --git a/app/requirements.txt b/app/requirements.txt
index aceccd1..80cf15e 100755
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -32,6 +32,7 @@ pandas>=2.2,<3.0
 huggingface_hub>=0.20.0
 matplotlib>=3.7
 pydub>=0.25.1
+soundfile>=0.13.0,<0.14.0
 # Stay on SciPy 1.11.x to avoid dependency/license drift while avoiding the
 # 1.11.0 sdist GPL-hit file: scipy/_lib/unuran/unuran/src/specfunct/log1p.c.
 scipy>=1.11.4,<1.12.0
diff --git a/doc/changelog.en.md b/doc/changelog.en.md
index a5b4f4f..3123c78 100644
--- a/doc/changelog.en.md
+++ b/doc/changelog.en.md
@@ -36,6 +36,10 @@
   dominated by markers such as "like / subscribe / repost / tip", including raw
   ASR segments that are slightly over 30 seconds, while keeping normal
   contextual words in longer meeting transcripts.
+- The embedding stage now reads the normalized WAV once and slices it by
+  diarization turns, avoiding repeated torchaudio native decoding for every
+  turn. It falls back to the previous segmented loader on read failure and adds
+  aggregate `embedding_audio_load_timing` logs.
 
 ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29)
 
diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md
index 7e67ea8..ec23d5b 100644
--- a/doc/changelog.zh.md
+++ b/doc/changelog.zh.md
@@ -29,6 +29,9 @@
 - ASR hallucination guard 新增短单段 stock outro 过滤，拦截
   “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉，并覆盖
   ASR 原始段略超过 30 秒的边界样本，同时保留长会议中的正常上下文词。
+- embedding 阶段优先一次性读取规范化 WAV 后按 diarization turn 切片，避免每个
+  turn 反复走 torchaudio 原生解码路径；读取失败时仍回退到旧的分段加载，并新增
+  `embedding_audio_load_timing` 聚合日志。
 
 ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29)
 
diff --git a/tests/unit/test_provider_registry.py b/tests/unit/test_provider_registry.py
index 711376b..e5e0bd7 100644
--- a/tests/unit/test_provider_registry.py
+++ b/tests/unit/test_provider_registry.py
@@ -951,7 +951,7 @@ def test_default_embedding_provider_moves_chunks_to_embedding_device(
     pipeline.device = "cuda:0"
     pipeline._embedding_device = "cuda:1"
     calls = []
-    perf_values = iter([30.0, 30.75])
+    perf_values = iter([29.0, 30.0, 30.75])
 
     class FakeTensor:
         def __init__(self, channels, frames):
@@ -1008,6 +1008,106 @@ class FakeInfo:
     ]
 
 
+def test_default_embedding_provider_prefers_single_soundfile_load(monkeypatch):
+    pipeline = TranscriptionPipeline.__new__(TranscriptionPipeline)
+    pipeline.device = "cpu"
+    calls = []
+
+    class FakeTensor:
+        def __init__(self, channels, frames):
+            self.shape = (channels, frames)
+
+        def __getitem__(self, key):
+            channel_key, frame_key = key
+            assert isinstance(channel_key, slice)
+            assert channel_key == slice(None)
+            start = frame_key.start or 0
+            stop = frame_key.stop or self.shape[1]
+            return FakeTensor(self.shape[0], max(stop - start, 0))
+
+        def contiguous(self):
+            calls.append(("contiguous", self.shape[1]))
+            return self
+
+        def mean(self, dim=0, keepdim=True):
+            assert dim == 0
+            assert keepdim is True
+            return FakeTensor(1, self.shape[1])
+
+        def to(self, device):
+            calls.append(("to", device, self.shape[1]))
+            return self
+
+    class FakeEmbeddingModel:
+        def __call__(self, payload):
+            calls.append(("embedding_model", payload["waveform"].shape[1]))
+            return [float(payload["waveform"].shape[1]), 2.0]
+
+    class FakeArray:
+        def __init__(self, shape):
+            self.shape = shape
+
+        @property
+        def T(self):
+            return FakeArray(tuple(reversed(self.shape)))
+
+        def copy(self):
+            return self
+
+    pipeline._embedding_model = FakeEmbeddingModel()
+    monkeypatch.setattr(
+        embedding_default.sf,
+        "read",
+        lambda audio_path, dtype, always_2d: (
+            FakeArray((64000, 1)),  # 4 s at 16 kHz so both turns are in range
+            16000,
+        ),
+        raising=False,
+    )
+    monkeypatch.setattr(
+        embedding_default.torch,
+        "from_numpy",
+        lambda data: calls.append(("from_numpy", data.shape)) or FakeTensor(1, 64000),
+        raising=False,
+    )
+    monkeypatch.setattr(
+        embedding_default.torchaudio,
+        "info",
+        lambda audio_path: (_ for _ in ()).throw(
+            AssertionError("torchaudio.info should not be used for canonical audio")
+        ),
+    )
+    monkeypatch.setattr(
+        embedding_default.torchaudio,
+        "load",
+        lambda *args, **kwargs: (_ for _ in ()).throw(
+            AssertionError("torchaudio.load should not be used for canonical audio")
+        ),
+    )
+
+    result = default_speaker_embedding_provider.extract_embeddings(
+        SpeakerEmbeddingRequest(
+            pipeline=pipeline,
+            audio_path="demo.wav",
+            diarization_turns=[
+                {"speaker": "SPEAKER_00", "start": 0.0, "end": 2.0},
+                {"speaker": "SPEAKER_00", "start": 2.0, "end": 4.0},
+            ],
+        )
+    )
+
+    assert result.speaker_embeddings["SPEAKER_00"].tolist() == [32000.0, 2.0]
+    assert calls == [
+        ("from_numpy", (1, 64000)),
+        ("contiguous", 32000),
+        ("contiguous", 32000),
+        ("to", "cpu", 32000),
+        ("embedding_model", 32000),
+        ("to", "cpu", 32000),
+        ("embedding_model", 32000),
+    ]
+
+
 def test_default_embedding_provider_uses_selected_device_after_first_lazy_load(
     monkeypatch,
 ):

From 64496d5b967891643fcaedc1c7aca3f680473f80 Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 12:01:46 +0800
Subject: [PATCH 09/10] chore: release 0.7.6

---
 app/config.py                    |  2 +-
 doc/changelog.en.md              | 21 ++++++++++++++++-----
 doc/changelog.zh.md              | 19 +++++++++++++++----
 doc/configuration.en.md          | 19 ++++++++++++++-----
 doc/configuration.zh.md          | 17 ++++++++++++-----
 doc/security.en.md               |  2 +-
 doc/security.zh.md               |  2 +-
 tests/unit/test_main_lifespan.py |  4 ++--
 8 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/app/config.py b/app/config.py
index 9e1f665..90c8e2a 100644
--- a/app/config.py
+++ b/app/config.py
@@ -9,7 +9,7 @@
 from pathlib import Path
 
 
-APP_VERSION = "0.7.5"
+APP_VERSION = "0.7.6"
 
 
 def _env_float(name: str, default: float) -> float:
diff --git a/doc/changelog.en.md b/doc/changelog.en.md
index 3123c78..dc40464 100644
--- a/doc/changelog.en.md
+++ b/doc/changelog.en.md
@@ -4,6 +4,8 @@
 
 ## Unreleased
 
+## 0.7.6 — Health, alignment, and embedding runtime fixes (2026-05-07)
+
 ### Security
 
 - Updated the dependency security baseline and FOSSA policy test coverage to
@@ -17,12 +19,8 @@
   SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups;
   the `nltk` package needed by alignment is installed explicitly.
 
-### Observability
+### Reliability
 
-- Added safe model-load and transcription-stage timing logs for ASR,
-  diarization, embedding, voiceprint match, enhancement, and pipeline stage
-  timing. Logs record only stage, model, elapsed time, and aggregate metrics;
-  they do not include filenames, paths, job IDs, speaker IDs, hosts, or tokens.
 - Transcription jobs no longer run full Python GC before/after every GPU job;
   active job boundaries only clear the CUDA cache, while full GC remains on the
   idle-unload path. This avoids long GIL holds that can make `/healthz` time out
@@ -41,6 +39,19 @@
   turn. It falls back to the previous segmented loader on read failure and adds
   aggregate `embedding_audio_load_timing` logs.
 
+### Observability
+
+- Added safe model-load and transcription-stage timing logs for ASR,
+  diarization, embedding, voiceprint match, enhancement, and pipeline stage
+  timing. Logs record only stage, model, elapsed time, and aggregate metrics;
+  they do not include filenames, paths, job IDs, speaker IDs, hosts, or tokens.
+
+### Validation
+
+- Internal live validation for 0.7.6 covered `/healthz` stability during GPU
+  cleanup, the WhisperX alignment runtime, the stock outro hallucination guard,
+  and the embedding audio slicing / single soundfile load path.
+
 ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29)
 
 ### Bug Fixes
diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md
index ec23d5b..f33bc1a 100644
--- a/doc/changelog.zh.md
+++ b/doc/changelog.zh.md
@@ -4,6 +4,8 @@
 
 ## Unreleased
 
+## 0.7.6 — 健康检查、alignment 与 embedding 运行时修复 (2026-05-07)
+
 ### 安全
 
 - 更新依赖安全基线与 FOSSA policy test 覆盖，收紧公开依赖扫描流程。
@@ -15,11 +17,8 @@
   `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库；
   alignment 所需的 `nltk` 由运行时依赖显式安装。
 
-### 可观测性
+### 可靠性
 
-- 新增安全的模型加载与转写阶段耗时日志，覆盖 ASR、diarization、embedding、
-  voiceprint match、enhancement 与 pipeline stage timing。日志只记录阶段、模型、
-  耗时和聚合指标，不记录文件名、路径、job ID、speaker ID、host 或 token。
 - 转写 job 前后不再执行完整 Python GC，只清理 CUDA cache；完整 GC 保留在
   idle-unload 阶段，避免大段 alignment 结果完成后长时间持有 GIL 导致 `/healthz`
   超时。
@@ -33,6 +32,18 @@
   turn 反复走 torchaudio 原生解码路径；读取失败时仍回退到旧的分段加载，并新增
   `embedding_audio_load_timing` 聚合日志。
 
+### 可观测性
+
+- 新增安全的模型加载与转写阶段耗时日志，覆盖 ASR、diarization、embedding、
+  voiceprint match、enhancement 与 pipeline stage timing。日志只记录阶段、模型、
+  耗时和聚合指标，不记录文件名、路径、job ID、speaker ID、host 或 token。
+
+### 验证
+
+- 内部 live validation 覆盖 0.7.6 的 GPU cleanup 期间健康检查稳定性、
+  WhisperX alignment runtime、stock outro hallucination guard，以及 embedding
+  audio slicing / single soundfile load 路径。
+
 ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29)
 
 ### Bug 修复
diff --git a/doc/configuration.en.md b/doc/configuration.en.md
index d490bbe..d0185f5 100644
--- a/doc/configuration.en.md
+++ b/doc/configuration.en.md
@@ -2,7 +2,7 @@
 
 [简体中文](./configuration.zh.md) | **English**
 
-This is the public configuration index for VoScript v0.7.5. It covers the
+This is the public configuration index for VoScript v0.7.6. It covers the
 environment variables that the current code reads, the per-request override
 semantics of `POST /api/transcribe`, and internal defaults that are documented
 for operators but are not stable public knobs yet. Do not assume a Whisper,
@@ -39,7 +39,7 @@ parameters yet.
 | `JOBS_MAX_CACHE` | `200` | In-memory job LRU limit. Evicted completed jobs remain queryable from disk `status.json` / `result.json`. |
 | `MODEL_IDLE_TIMEOUT_SEC` | `180` | GPU model idle-unload timeout, defaulting to 180 seconds (3 minutes). Set `0` to disable idle unload and keep models resident. When enabled, loaded models are released only after the serialized GPU runtime has been idle for this many seconds; on the next reload, ASR, diarization, and embedding each choose the visible CUDA device with the most free memory during their own lazy load. |
 
-`MODELS_DIR` and `LANGUAGE` are defined in the config module, but v0.7.5's main
+`MODELS_DIR` and `LANGUAGE` are defined in the config module, but v0.7.6's main
 HTTP transcription path does not use them as stable public tuning knobs:
 Whisper local checkpoint lookup still expects `/models/faster-whisper-<WHISPER_MODEL>`,
 and default language should be controlled with the request `language` field or
@@ -96,7 +96,7 @@ cache is incomplete.
 
 Current internal ASR defaults are `beam_size=5`, `vad_filter=True`,
 `vad_parameters.min_silence_duration_ms=500`, and `condition_on_previous_text=False`.
-These do not have env or API fields in v0.7.5. Do not configure nonexistent
+These do not have env or API fields in v0.7.6. Do not configure nonexistent
 variables such as `WHISPER_BEAM_SIZE`, `WHISPER_COMPUTE_TYPE`, or `WHISPER_VAD_*`.
 
 ## Denoising
@@ -108,7 +108,7 @@ variables such as `WHISPER_BEAM_SIZE`, `WHISPER_COMPUTE_TYPE`, or `WHISPER_VAD_*
 | API `denoise_model` | omitted | Omitted means inherit `DENOISE_MODEL`; explicit `none` disables denoising for this job only. |
 | API `snr_threshold` | omitted | Omitted means inherit `DENOISE_SNR_THRESHOLD`; explicit values override the DeepFilterNet SNR gate for this job only. |
 
-v0.7.5 defaults to `DENOISE_MODEL=none` for clean meeting-recorder audio. Enable
+v0.7.6 defaults to `DENOISE_MODEL=none` for clean meeting-recorder audio. Enable
 `deepfilternet` or `noisereduce` only for noisy environments, either per job or
 as a service default. If you need clean recordings to be skipped automatically,
 use `deepfilternet`; `noisereduce` runs whenever it is selected.
@@ -170,7 +170,7 @@ Cohort lifecycle:
   files to build and save a cohort.
 - After each enroll / update, the background `cohort-rebuild` thread wakes every
   60 seconds and rebuilds after the latest enrollment is at least 30 seconds old.
-- v0.7.5 protects larger loaded or persisted cohorts during automatic rebuilds:
+- v0.7.6 protects larger loaded or persisted cohorts during automatic rebuilds:
   clearing transcription results, having only a few embeddings, or having fewer
   source embeddings than the current cohort will not shrink the cohort automatically.
 - `POST /api/voiceprints/rebuild-cohort` is an explicit manual rebuild and uses
@@ -200,6 +200,15 @@ Stable anchors in completed transcription results:
 New fields are added under the optional-field principle. Clients should ignore
 unknown fields and tolerate missing `words`, `alignment`, and `warning`.
 
+## v0.7.6 Validation Wording
+
+v0.7.6 has internal live validation covering `/healthz` availability during GPU
+cleanup, WhisperX forced-alignment runtime isolation and model reuse, short
+single-segment stock outro hallucination filtering, and the embedding path that
+loads the normalized WAV once and slices it by diarization turns. Public
+documentation records only these behavior categories, not real task names,
+sample names, job IDs, speaker IDs, hosts, logs, or paths.
+
 ## v0.7.4 Validation Wording
 
 v0.7.4 has internal live validation covering transcription cleanup while
diff --git a/doc/configuration.zh.md b/doc/configuration.zh.md
index 88b50b3..b2cfd12 100644
--- a/doc/configuration.zh.md
+++ b/doc/configuration.zh.md
@@ -2,7 +2,7 @@
 
 **简体中文** | [English](./configuration.en.md)
 
-本文是 VoScript v0.7.5 的公开配置索引，覆盖当前代码已经读取并生效的
+本文是 VoScript v0.7.6 的公开配置索引，覆盖当前代码已经读取并生效的
 环境变量、`POST /api/transcribe` 的请求级覆盖语义，以及还没有暴露为稳定
 配置项的内部默认值。没有在本文列出的 Whisper / diarization / AS-norm 变量，
 不要假定已经可用。
@@ -37,7 +37,7 @@
 | `JOBS_MAX_CACHE` | `200` | 内存 job LRU 上限；被淘汰的完成任务仍可从磁盘 `status.json` / `result.json` 查询。 |
 | `MODEL_IDLE_TIMEOUT_SEC` | `180` | GPU 模型空闲卸载超时，默认 180 秒（3 分钟）。设为 `0` 可关闭空闲卸载并保持模型常驻。开启后，只有串行 GPU 运行时空闲达到该秒数才释放已加载模型；下一次 reload 时 ASR、diarization 和 embedding 会在各自 lazy load 时分别选择当前可见 CUDA 中空闲显存最多的设备。 |
 
-`MODELS_DIR` 和 `LANGUAGE` 在配置模块里有定义，但 v0.7.5 的主 HTTP 转写路径
+`MODELS_DIR` 和 `LANGUAGE` 在配置模块里有定义，但 v0.7.6 的主 HTTP 转写路径
 没有把它们作为稳定公开调参入口使用：Whisper 本地 checkpoint 查找仍使用
 `/models/faster-whisper-<WHISPER_MODEL>`，语言默认请通过请求字段 `language`
 控制或留空自动检测。
@@ -89,7 +89,7 @@ Hugging Face snapshot，缓存不完整时再走 Hub。
 
 当前内部 ASR 默认值：`beam_size=5`、`vad_filter=True`、
 `vad_parameters.min_silence_duration_ms=500`、`condition_on_previous_text=False`。
-这些值在 v0.7.5 还没有对应 env 或 API 字段；不要写 `WHISPER_BEAM_SIZE`、
+这些值在 v0.7.6 还没有对应 env 或 API 字段；不要写 `WHISPER_BEAM_SIZE`、
 `WHISPER_COMPUTE_TYPE`、`WHISPER_VAD_*` 之类未实现配置。
 
 ## 降噪
@@ -101,7 +101,7 @@ Hugging Face snapshot，缓存不完整时再走 Hub。
 | API `denoise_model` | 省略 | 省略表示继承 `DENOISE_MODEL`；显式传 `none` 表示只对本次任务关闭降噪。 |
 | API `snr_threshold` | 省略 | 省略表示继承 `DENOISE_SNR_THRESHOLD`；显式传值只覆盖本次任务的 DeepFilterNet SNR gate。 |
 
-v0.7.5 默认面向干净会议录音，因此 `DENOISE_MODEL=none`。只有噪声环境才建议按任务
+v0.7.6 默认面向干净会议录音，因此 `DENOISE_MODEL=none`。只有噪声环境才建议按任务
 或服务级启用 `deepfilternet` / `noisereduce`。如需“干净录音自动跳过”，请选择
 `deepfilternet`；`noisereduce` 一旦被选择就会运行。
 
@@ -159,7 +159,7 @@ cohort 生命周期：
 - 否则扫描持久化转写结果和 `emb_*.npy` 构建并保存 cohort。
 - 每次 enroll / update 后，后台 `cohort-rebuild` 线程每 60 秒检查一次，在最近一次
   enroll 至少过去 30 秒后自动重建。
-- v0.7.5 的后台自动重建会保护更大的已加载或已持久化 cohort：清空转写结果、
+- v0.7.6 的后台自动重建会保护更大的已加载或已持久化 cohort：清空转写结果、
   只有少量 embedding，或源数量少于现有 cohort 时，不会自动缩小 cohort。
 - `POST /api/voiceprints/rebuild-cohort` 是显式手动重建，仍按当前可用 embedding
   立即生成新 cohort。
@@ -184,6 +184,13 @@ cohort 生命周期：
 新增字段按可选字段原则扩展；客户端应忽略不认识的字段，并容忍 `words` /
 `alignment` / `warning` 缺失。
 
+## v0.7.6 验证口径
+
+v0.7.6 已用内部 live validation 覆盖：GPU cleanup 期间 `/healthz` 仍保持可用、
+WhisperX forced alignment 运行时隔离与模型复用、短单段 stock outro 幻觉过滤，以及
+embedding 单次读取规范化 WAV 后按 diarization turn 切片的路径。公开文档只记录行为
+类别，不发布真实任务名、样本名、job id、speaker id、主机、日志或路径。
+
 ## v0.7.4 验证口径
 
 v0.7.4 已用内部 live validation 验证：清空持久化转写结果后，只要既有声纹库和已加载 /
diff --git a/doc/security.en.md b/doc/security.en.md
index 3cc2ec7..3331141 100644
--- a/doc/security.en.md
+++ b/doc/security.en.md
@@ -22,7 +22,7 @@ Treat the service as if it were an internal database.
 
 ## Built-in hardening (on by default)
 
-As of 0.6.0 the following protections are in place out of the box:
+As of 0.7.6 the following protections are in place out of the box:
 
 1. **Container runs as a non-root user.** The Dockerfile creates an
    `app` user (uid/gid 1000 by default, overridable via `APP_UID`/
diff --git a/doc/security.zh.md b/doc/security.zh.md
index ededcb6..85d18f4 100644
--- a/doc/security.zh.md
+++ b/doc/security.zh.md
@@ -19,7 +19,7 @@
 
 ## 内置的硬化（默认启用）
 
-当前版本（0.6.0）默认开启以下保护：
+当前版本（0.7.6）默认开启以下保护：
 
 1. **容器以非 root 用户运行**。Dockerfile 创建 `app` 用户（uid/gid 1000，
    可通过 `APP_UID`/`APP_GID` 覆盖），`USER app`。即使服务代码被 RCE，
diff --git a/tests/unit/test_main_lifespan.py b/tests/unit/test_main_lifespan.py
index e6ce072..7346c60 100644
--- a/tests/unit/test_main_lifespan.py
+++ b/tests/unit/test_main_lifespan.py
@@ -54,8 +54,8 @@ def test_rebuild_thread_alive_during_lifespan(app_client):
     assert thread.daemon, "cohort-rebuild thread must be a daemon thread"
 
 
-def test_openapi_version_reports_075(app_client):
-    assert app_client.app.version == "0.7.5"
+def test_openapi_version_reports_076(app_client):
+    assert app_client.app.version == "0.7.6"
 
 
 def test_rebuild_thread_survives_tick_exception(app_client, monkeypatch):

From 439a5d269d292872c8035253a81c290fdc279690 Mon Sep 17 00:00:00 2001
From: Maple Gao <esanisa@gmail.com>
Date: Thu, 7 May 2026 12:50:09 +0800
Subject: [PATCH 10/10] fix: remove NLTK runtime dependency

---
 .github/workflows/claude-code-review.yml      |  1 +
 app/nltk/__init__.py                          |  1 +
 app/nltk/tokenize/__init__.py                 |  1 +
 app/nltk/tokenize/punkt.py                    | 89 +++++++++++++++++++
 app/requirements.txt                          |  5 +-
 doc/changelog.en.md                           |  3 +-
 doc/changelog.zh.md                           |  3 +-
 .../unit/test_dependency_runtime_baseline.py  |  4 +-
 tests/unit/test_whisperx_punkt_shim.py        | 26 ++++++
 9 files changed, 128 insertions(+), 5 deletions(-)
 create mode 100644 app/nltk/__init__.py
 create mode 100644 app/nltk/tokenize/__init__.py
 create mode 100644 app/nltk/tokenize/punkt.py
 create mode 100644 tests/unit/test_whisperx_punkt_shim.py

diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml
index 24b6f66..a47459c 100644
--- a/.github/workflows/claude-code-review.yml
+++ b/.github/workflows/claude-code-review.yml
@@ -52,5 +52,6 @@ jobs:
             synchronized English/Chinese documentation. Avoid formatting-only comments.
           claude_args: |
             --model ${{ env.CLAUDE_MODEL }}
+            --max-turns 30
         env:
           ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }}
diff --git a/app/nltk/__init__.py b/app/nltk/__init__.py
new file mode 100644
index 0000000..c272975
--- /dev/null
+++ b/app/nltk/__init__.py
@@ -0,0 +1 @@
+"""Minimal NLTK compatibility surface required by WhisperX alignment."""
diff --git a/app/nltk/tokenize/__init__.py b/app/nltk/tokenize/__init__.py
new file mode 100644
index 0000000..fadecc8
--- /dev/null
+++ b/app/nltk/tokenize/__init__.py
@@ -0,0 +1 @@
+"""Tokenization compatibility helpers for WhisperX."""
diff --git a/app/nltk/tokenize/punkt.py b/app/nltk/tokenize/punkt.py
new file mode 100644
index 0000000..5c43679
--- /dev/null
+++ b/app/nltk/tokenize/punkt.py
@@ -0,0 +1,89 @@
+"""Small Punkt-compatible sentence span tokenizer for WhisperX.
+
+WhisperX 3.3.1 imports ``PunktParameters`` and ``PunktSentenceTokenizer`` only
+to split an already bounded segment into sentence spans. Pulling the full NLTK
+distribution into the runtime introduces unrelated data/license surface, so this
+module implements the small API shape WhisperX uses.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from typing import Iterable
+
+
+@dataclass
+class PunktParameters:
+    """Subset of NLTK's PunktParameters used by WhisperX."""
+
+    abbrev_types: set[str] = field(default_factory=set)  # compared lowercased, without "."
+
+
+class PunktSentenceTokenizer:
+    """Sentence span splitter compatible with WhisperX's use of NLTK Punkt."""
+
+    _TERMINATORS = {".", "!", "?", "。", "！", "？"}  # ASCII and CJK sentence enders
+
+    def __init__(self, params: PunktParameters | None = None) -> None:
+        self.params = params or PunktParameters()
+
+    def span_tokenize(self, text: str) -> Iterable[tuple[int, int]]:
+        """Yield half-open ``(start, end)`` sentence spans in ``text``.
+
+        Splitting is deliberately conservative: a ``.`` directly preceded by a
+        word in ``params.abbrev_types`` is never a boundary, and ASCII ``.``,
+        ``!`` or ``?`` (plus any closing quotes/brackets) must be followed by
+        whitespace or end of string. ``。``, ``！`` and ``？`` always split.
+        """
+
+        start = 0
+        index = 0
+        length = len(text)
+        while index < length:
+            char = text[index]
+            if char not in self._TERMINATORS or self._is_abbreviation(text, index):
+                index += 1
+                continue
+
+            next_index = index + 1  # then skip closing quotes/brackets after the terminator
+            while next_index < length and text[next_index] in {
+                '"',
+                "'",
+                ")",
+                "]",
+                "}",
+                "”",
+                "’",
+            }:
+                next_index += 1
+
+            if (
+                next_index < length
+                and char in {".", "!", "?"}  # only ASCII terminators need whitespace
+                and not text[next_index].isspace()
+            ):
+                index += 1
+                continue
+
+            end = next_index
+            while end < length and text[end].isspace():
+                end += 1
+
+            yield (start, next_index)  # span excludes the whitespace that follows it
+            start = end
+            index = end
+
+        if start < length:  # remaining tail that no boundary closed
+            yield (start, length)
+        elif length == 0:  # empty text: nothing is yielded
+            return
+
+    def _is_abbreviation(self, text: str, dot_index: int) -> bool:
+        if text[dot_index] != ".":  # only "." can follow an abbreviation
+            return False
+        prefix = text[:dot_index]
+        match = re.search(r"([A-Za-z]+)$", prefix)  # ASCII letters right before the dot
+        if not match:
+            return False
+        return match.group(1).lower() in self.params.abbrev_types
diff --git a/app/requirements.txt b/app/requirements.txt
index 80cf15e..d3765ad 100755
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -42,8 +42,9 @@ sqlite-vec>=0.1.0
 # Docker installs whisperx==3.3.1 with --no-deps after this file is resolved.
 # WhisperX 3.3.1 still hard-pins faster-whisper==1.1.0 / ctranslate2<4.5.0,
 # which would replace the cuDNN9-compatible ASR stack above and look for
-# cuDNN8 runtime libraries in the CUDA 12.4 + cuDNN9 base image.
-nltk>=3.9,<4.0
+# cuDNN8 runtime libraries in the CUDA 12.4 + cuDNN9 base image. Its alignment
+# module imports only a tiny Punkt sentence-span API; app/nltk provides that
+# compatibility shim so the image does not need the full NLTK distribution.
 transformers>=5.0.0,<5.1.0
 # Voice enhancement / noise reduction (optional — loaded only when DENOISE_MODEL != "none")
 deepfilternet>=0.5.6
diff --git a/doc/changelog.en.md b/doc/changelog.en.md
index dc40464..5bf3d11 100644
--- a/doc/changelog.en.md
+++ b/doc/changelog.en.md
@@ -17,7 +17,8 @@
   transitive dependencies. Docker installs WhisperX with `--no-deps` so its old
   ASR transitive dependencies cannot replace the current `numpy<2` /
   SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups;
-  the `nltk` package needed by alignment is installed explicitly.
+  the small Punkt sentence-span API used by alignment is provided by an
+  internal compatibility shim instead of pulling the full NLTK distribution.
 
 ### Reliability
 
diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md
index f33bc1a..c542919 100644
--- a/doc/changelog.zh.md
+++ b/doc/changelog.zh.md
@@ -15,7 +15,8 @@
   `ctranslate2>=4.7.1,<5.0`，同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界。
   Docker 会以 `--no-deps` 安装 WhisperX，避免其旧 ASR 传递依赖覆盖当前
   `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库；
-  alignment 所需的 `nltk` 由运行时依赖显式安装。
+  alignment 用到的少量 Punkt sentence-span API 由内部兼容 shim 提供，不再拉入完整
+  NLTK 分发包。
 
 ### 可靠性
 
diff --git a/tests/unit/test_dependency_runtime_baseline.py b/tests/unit/test_dependency_runtime_baseline.py
index 657ea96..9bdc753 100644
--- a/tests/unit/test_dependency_runtime_baseline.py
+++ b/tests/unit/test_dependency_runtime_baseline.py
@@ -30,8 +30,10 @@ def test_faster_whisper_runtime_stays_on_cudnn9_compatible_ctranslate2():
 def test_docker_installs_whisperx_without_replacing_asr_runtime_stack():
     lines = _requirements_lines()
     dockerfile = _dockerfile_text()
+    root = Path(__file__).resolve().parents[2]
 
-    assert "nltk>=3.9,<4.0" in lines
+    assert not any(line.startswith("nltk") for line in lines)
+    assert (root / "app" / "nltk" / "tokenize" / "punkt.py").exists()
     assert "whisperx==3.3.1" not in lines
     assert (
         'pip install --no-cache-dir -i "$PIP_INDEX_URL" --no-deps whisperx==3.3.1'
diff --git a/tests/unit/test_whisperx_punkt_shim.py b/tests/unit/test_whisperx_punkt_shim.py
new file mode 100644
index 0000000..a43e793
--- /dev/null
+++ b/tests/unit/test_whisperx_punkt_shim.py
@@ -0,0 +1,26 @@
+from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer
+
+
+def test_punkt_shim_keeps_common_abbreviations_with_sentence() -> None:
+    params = PunktParameters()
+    params.abbrev_types = {"dr", "vs", "mr", "mrs", "prof"}
+    tokenizer = PunktSentenceTokenizer(params)
+
+    text = "Dr. Maple joined. The meeting ended?"  # "Dr." must not end a sentence
+
+    spans = list(tokenizer.span_tokenize(text))
+
+    assert [text[start:end] for start, end in spans] == [
+        "Dr. Maple joined.",
+        "The meeting ended?",
+    ]
+
+
+def test_punkt_shim_supports_cjk_terminators() -> None:
+    tokenizer = PunktSentenceTokenizer(PunktParameters())
+
+    text = "第一句。第二句！"  # no whitespace between the two sentences
+
+    spans = list(tokenizer.span_tokenize(text))
+
+    assert [text[start:end] for start, end in spans] == ["第一句。", "第二句！"]