From 6e4ce522548c1f4d4dab6092cf969d41194d4da9 Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 02:01:51 +0800 Subject: [PATCH 01/10] fix: avoid blocking health during GPU cleanup --- app/infra/job_runtime.py | 15 +++++++++--- tests/unit/test_job_runtime.py | 45 ++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/app/infra/job_runtime.py b/app/infra/job_runtime.py index 553bfe2..9ea189a 100644 --- a/app/infra/job_runtime.py +++ b/app/infra/job_runtime.py @@ -87,11 +87,14 @@ def flush_torch_cuda_cache( """Best-effort CUDA cache flush used around serialized GPU work.""" try: - import gc as _gc - import torch as _torch - _gc.collect() + # Full Python GC can hold the GIL long enough to make FastAPI liveness + # probes time out after large alignment results. Keep active job + # boundaries lightweight; the idle-unload path remains the heavy cleanup + # point because it runs after the GPU pipeline has been idle. + if phase == "idle-unload": + _collect_python_gc() if _torch.cuda.is_available(): _torch.cuda.empty_cache() except Exception as exc: # pragma: no cover - guarded for runtime-only failures @@ -99,6 +102,12 @@ def flush_torch_cuda_cache( logger.warning("%s CUDA cache flush failed: %s", phase, exc) +def _collect_python_gc() -> None: + import gc as _gc + + _gc.collect() + + def run_serialized_gpu_work( work: Callable[[], _T], *, diff --git a/tests/unit/test_job_runtime.py b/tests/unit/test_job_runtime.py index c360031..6deca32 100644 --- a/tests/unit/test_job_runtime.py +++ b/tests/unit/test_job_runtime.py @@ -2,6 +2,9 @@ from __future__ import annotations +import sys +from types import SimpleNamespace + import pytest import infra.job_runtime as job_runtime @@ -77,6 +80,48 @@ def test_run_serialized_gpu_work_releases_semaphore_after_error(monkeypatch): assert events == ["pre-whisper", "pre-whisper", "retry", "post-pipeline"] +def test_flush_torch_cuda_cache_skips_python_gc_for_active_job_phases(monkeypatch): + events = [] + fake_torch = SimpleNamespace( + cuda=SimpleNamespace( + is_available=lambda: True, + empty_cache=lambda: events.append("empty_cache"), + ) + ) + + monkeypatch.setitem(sys.modules, "torch", fake_torch) + monkeypatch.setattr( + job_runtime, + "_collect_python_gc", + lambda: events.append("gc_collect"), + ) + + job_runtime.flush_torch_cuda_cache(phase="post-pipeline") + + assert events == ["empty_cache"] + + +def test_flush_torch_cuda_cache_keeps_full_gc_for_idle_unload(monkeypatch): + events = [] + fake_torch = SimpleNamespace( + cuda=SimpleNamespace( + is_available=lambda: True, + empty_cache=lambda: events.append("empty_cache"), + ) + ) + + monkeypatch.setitem(sys.modules, "torch", fake_torch) + monkeypatch.setattr( + job_runtime, + "_collect_python_gc", + lambda: events.append("gc_collect"), + ) + + job_runtime.flush_torch_cuda_cache(phase="idle-unload") + + assert events == ["gc_collect", "empty_cache"] + + def test_idle_unload_daemon_disabled_when_timeout_zero(): pipeline = _FakePipeline(loaded=True) From 947c42ad2e16718afcf7f82ec10dc33eb25d07ef Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 02:10:13 +0800 Subject: [PATCH 02/10] fix: keep ASR runtime on cuDNN9-compatible deps --- app/requirements.txt | 6 ++++- doc/changelog.en.md | 11 +++++++--- doc/changelog.zh.md | 8 +++++-- .../unit/test_dependency_runtime_baseline.py | 22 +++++++++++++++++++ 4 files changed, 41 insertions(+), 6 deletions(-) create mode 100644 tests/unit/test_dependency_runtime_baseline.py diff --git a/app/requirements.txt b/app/requirements.txt index ac1185e..a25305b 100755 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -12,7 +12,11 @@ pyannote.metrics>=3.2,<4.0 pyannote.pipeline>=3.0.1,<4.0 omegaconf==2.3.0 antlr4-python3-runtime==4.9.3 -faster-whisper==1.1.0 +# faster-whisper 1.1.0 resolves to a CTranslate2 wheel that looks for cuDNN 8 +# (`libcudnn_ops_infer.so.8`). The Docker base is CUDA 12.4 + cuDNN 9, so keep +# the ASR runtime on the cuDNN9-compatible CTranslate2 line verified remotely. +faster-whisper>=1.2.1,<2.0.0 +ctranslate2>=4.7.1,<5.0 fastapi>=0.115.0 uvicorn[standard]>=0.30.0 python-multipart>=0.0.26,<0.0.28 diff --git a/doc/changelog.en.md b/doc/changelog.en.md index 859ed78..5fd9db7 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -10,9 +10,10 @@ tighten the public dependency scanning flow. - Moved WhisperX alignment from the yanked `3.1.x` package series to `whisperx==3.3.1`, with compatible `pyannote.audio==3.3.2` and - `faster-whisper==1.1.0` pins, plus bounded `pyannote.*` and `pandas` - transitive dependencies, while keeping the current `numpy<2` / SciPy 1.11.x - dependency baseline. + cuDNN9-compatible `faster-whisper>=1.2.1,<2.0.0` / + `ctranslate2>=4.7.1,<5.0`, plus bounded `pyannote.*` and `pandas` + transitive dependencies. This keeps the current `numpy<2` / SciPy 1.11.x + dependency baseline while avoiding runtime lookups for cuDNN8 libraries. ### Observability @@ -20,6 +21,10 @@ diarization, embedding, voiceprint match, enhancement, and pipeline stage timing. Logs record only stage, model, elapsed time, and aggregate metrics; they do not include filenames, paths, job IDs, speaker IDs, hosts, or tokens. +- Transcription jobs no longer run full Python GC before/after every GPU job; + active job boundaries only clear the CUDA cache, while full GC remains on the + idle-unload path. This avoids long GIL holds that can make `/healthz` time out + after large alignment results complete. ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29) diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index 0f41a85..bf3bcc5 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -9,14 +9,18 @@ - 更新依赖安全基线与 FOSSA policy test 覆盖,收紧公开依赖扫描流程。 - 将 WhisperX alignment 从 PyPI yanked 的 `3.1.x` 系列迁移到 `whisperx==3.3.1`,并配套固定 `pyannote.audio==3.3.2` 与 - `faster-whisper==1.1.0`,同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界, - 保持当前 `numpy<2` / SciPy 1.11.x 依赖基线。 + cuDNN9 兼容的 `faster-whisper>=1.2.1,<2.0.0` / + `ctranslate2>=4.7.1,<5.0`,同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界, + 保持当前 `numpy<2` / SciPy 1.11.x 依赖基线,避免运行时查找 cuDNN8 动态库。 ### 可观测性 - 新增安全的模型加载与转写阶段耗时日志,覆盖 ASR、diarization、embedding、 voiceprint match、enhancement 与 pipeline stage timing。日志只记录阶段、模型、 耗时和聚合指标,不记录文件名、路径、job ID、speaker ID、host 或 token。 +- 转写 job 前后不再执行完整 Python GC,只清理 CUDA cache;完整 GC 保留在 + idle-unload 阶段,避免大段 alignment 结果完成后长时间持有 GIL 导致 `/healthz` + 超时。 ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29) diff --git a/tests/unit/test_dependency_runtime_baseline.py b/tests/unit/test_dependency_runtime_baseline.py new file mode 100644 index 0000000..d39c3c3 --- /dev/null +++ b/tests/unit/test_dependency_runtime_baseline.py @@ -0,0 +1,22 @@ +"""Regression tests for runtime dependency pins that affect Docker GPU loads.""" + +from __future__ import annotations + +from pathlib import Path + + +def _requirements_lines() -> list[str]: + root = Path(__file__).resolve().parents[2] + return [ + line.strip() + for line in (root / "app" / "requirements.txt").read_text().splitlines() + if line.strip() and not line.lstrip().startswith("#") + ] + + +def test_faster_whisper_runtime_stays_on_cudnn9_compatible_ctranslate2(): + lines = _requirements_lines() + + assert "faster-whisper>=1.2.1,<2.0.0" in lines + assert "ctranslate2>=4.7.1,<5.0" in lines + assert "faster-whisper==1.1.0" not in lines From 0c04619ec59b4f432271a0c4fb868f2bb9e5d750 Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 02:15:47 +0800 Subject: [PATCH 03/10] fix: install WhisperX without replacing ASR runtime --- app/Dockerfile | 5 +++++ app/requirements.txt | 8 ++++---- doc/changelog.en.md | 5 +++-- doc/changelog.zh.md | 5 +++-- tests/unit/test_dependency_runtime_baseline.py | 17 +++++++++++++++++ 5 files changed, 32 insertions(+), 8 deletions(-) diff --git a/app/Dockerfile b/app/Dockerfile index 97d5ae7..37b7b5f 100755 --- a/app/Dockerfile +++ b/app/Dockerfile @@ -45,6 +45,11 @@ RUN if [ -n "$PIP_INDEX_URL" ]; then \ else \ pip install --no-cache-dir -r requirements.txt; \ fi +RUN if [ -n "$PIP_INDEX_URL" ]; then \ + pip install --no-cache-dir -i "$PIP_INDEX_URL" --no-deps whisperx==3.3.1; \ + else \ + pip install --no-cache-dir --no-deps whisperx==3.3.1; \ + fi COPY --chown=app:app . . diff --git a/app/requirements.txt b/app/requirements.txt index a25305b..1cd5b6b 100755 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -38,10 +38,10 @@ scipy>=1.11.4,<1.12.0 # sqlite-vec: fast on-disk top-k cosine search for voiceprints. sqlite-vec>=0.1.0 # whisperx: forced word-level alignment via wav2vec2. -# 3.3.1 is the newest non-yanked WhisperX release that still supports the -# current torch 2.6 / numpy<2 / SciPy 1.11.x runtime without moving to the -# WhisperX 3.3.3+ numpy>=2 or 3.8.x torch~=2.8 dependency stack. -whisperx==3.3.1 +# Docker installs whisperx==3.3.1 with --no-deps after this file is resolved. +# WhisperX 3.3.1 still hard-pins faster-whisper==1.1.0 / ctranslate2<4.5.0, +# which would replace the cuDNN9-compatible ASR stack above and look for +# cuDNN8 runtime libraries in the CUDA 12.4 + cuDNN9 base image. transformers>=5.0.0,<5.1.0 # Voice enhancement / noise reduction (optional — loaded only when DENOISE_MODEL != "none") deepfilternet>=0.5.6 diff --git a/doc/changelog.en.md b/doc/changelog.en.md index 5fd9db7..14877f5 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -12,8 +12,9 @@ `whisperx==3.3.1`, with compatible `pyannote.audio==3.3.2` and cuDNN9-compatible `faster-whisper>=1.2.1,<2.0.0` / `ctranslate2>=4.7.1,<5.0`, plus bounded `pyannote.*` and `pandas` - transitive dependencies. This keeps the current `numpy<2` / SciPy 1.11.x - dependency baseline while avoiding runtime lookups for cuDNN8 libraries. + transitive dependencies. Docker installs WhisperX with `--no-deps` so its old + ASR transitive dependencies cannot replace the current `numpy<2` / + SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups. ### Observability diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index bf3bcc5..de1980a 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -10,8 +10,9 @@ - 将 WhisperX alignment 从 PyPI yanked 的 `3.1.x` 系列迁移到 `whisperx==3.3.1`,并配套固定 `pyannote.audio==3.3.2` 与 cuDNN9 兼容的 `faster-whisper>=1.2.1,<2.0.0` / - `ctranslate2>=4.7.1,<5.0`,同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界, - 保持当前 `numpy<2` / SciPy 1.11.x 依赖基线,避免运行时查找 cuDNN8 动态库。 + `ctranslate2>=4.7.1,<5.0`,同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界。 + Docker 会以 `--no-deps` 安装 WhisperX,避免其旧 ASR 传递依赖覆盖当前 + `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库。 ### 可观测性 diff --git a/tests/unit/test_dependency_runtime_baseline.py b/tests/unit/test_dependency_runtime_baseline.py index d39c3c3..a0ca920 100644 --- a/tests/unit/test_dependency_runtime_baseline.py +++ b/tests/unit/test_dependency_runtime_baseline.py @@ -14,9 +14,26 @@ def _requirements_lines() -> list[str]: ] +def _dockerfile_text() -> str: + root = Path(__file__).resolve().parents[2] + return (root / "app" / "Dockerfile").read_text() + + def test_faster_whisper_runtime_stays_on_cudnn9_compatible_ctranslate2(): lines = _requirements_lines() assert "faster-whisper>=1.2.1,<2.0.0" in lines assert "ctranslate2>=4.7.1,<5.0" in lines assert "faster-whisper==1.1.0" not in lines + + +def test_docker_installs_whisperx_without_replacing_asr_runtime_stack(): + lines = _requirements_lines() + dockerfile = _dockerfile_text() + + assert "whisperx==3.3.1" not in lines + assert ( + 'pip install --no-cache-dir -i "$PIP_INDEX_URL" --no-deps whisperx==3.3.1' + in dockerfile + ) + assert "pip install --no-cache-dir --no-deps whisperx==3.3.1" in dockerfile From 222ebb291c4e7749f4a2a62a258b2d5cb03f7b02 Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 02:22:53 +0800 Subject: [PATCH 04/10] fix: include WhisperX alignment runtime dependency --- app/requirements.txt | 1 + doc/changelog.en.md | 3 ++- doc/changelog.zh.md | 3 ++- tests/unit/test_dependency_runtime_baseline.py | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/app/requirements.txt b/app/requirements.txt index 1cd5b6b..aceccd1 100755 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -42,6 +42,7 @@ sqlite-vec>=0.1.0 # WhisperX 3.3.1 still hard-pins faster-whisper==1.1.0 / ctranslate2<4.5.0, # which would replace the cuDNN9-compatible ASR stack above and look for # cuDNN8 runtime libraries in the CUDA 12.4 + cuDNN9 base image. +nltk>=3.9,<4.0 transformers>=5.0.0,<5.1.0 # Voice enhancement / noise reduction (optional — loaded only when DENOISE_MODEL != "none") deepfilternet>=0.5.6 diff --git a/doc/changelog.en.md b/doc/changelog.en.md index 14877f5..fcf573d 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -14,7 +14,8 @@ `ctranslate2>=4.7.1,<5.0`, plus bounded `pyannote.*` and `pandas` transitive dependencies. Docker installs WhisperX with `--no-deps` so its old ASR transitive dependencies cannot replace the current `numpy<2` / - SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups. + SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups; + the `nltk` package needed by alignment is installed explicitly. ### Observability diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index de1980a..f594f58 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -12,7 +12,8 @@ cuDNN9 兼容的 `faster-whisper>=1.2.1,<2.0.0` / `ctranslate2>=4.7.1,<5.0`,同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界。 Docker 会以 `--no-deps` 安装 WhisperX,避免其旧 ASR 传递依赖覆盖当前 - `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库。 + `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库; + alignment 所需的 `nltk` 由运行时依赖显式安装。 ### 可观测性 diff --git a/tests/unit/test_dependency_runtime_baseline.py b/tests/unit/test_dependency_runtime_baseline.py index a0ca920..657ea96 100644 --- a/tests/unit/test_dependency_runtime_baseline.py +++ b/tests/unit/test_dependency_runtime_baseline.py @@ -31,6 +31,7 @@ def test_docker_installs_whisperx_without_replacing_asr_runtime_stack(): lines = _requirements_lines() dockerfile = _dockerfile_text() + assert "nltk>=3.9,<4.0" in lines assert "whisperx==3.3.1" not in lines assert ( 'pip install --no-cache-dir -i "$PIP_INDEX_URL" --no-deps whisperx==3.3.1' From 4d3222e6560ff6fdbae6e4482f207eb34f939c72 Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 03:07:24 +0800 Subject: [PATCH 05/10] fix: filter stock outro hallucinations --- app/providers/asr/default.py | 27 +++++++++++++++++++++ doc/changelog.en.md | 3 +++ doc/changelog.zh.md | 3 +++ tests/unit/test_asr_repetition_guard.py | 31 +++++++++++++++++++++++++ 4 files changed, 64 insertions(+) diff --git a/app/providers/asr/default.py b/app/providers/asr/default.py index 24b8a9a..37dd7a3 100644 --- a/app/providers/asr/default.py +++ b/app/providers/asr/default.py @@ -16,6 +16,18 @@ "简体中文输出", "以下是普通话的对话", ) +_OUTRO_HALLUCINATION_MARKERS = ( + "请不吝点赞", + "点赞", + "订阅", + "转发", + "打赏", + "打赏支持", + "明镜与点点栏目", + "谢谢观看", + "感谢观看", + "下期再见", +) def _duration(segment: dict[str, Any]) -> float: @@ -51,6 +63,17 @@ def _prompt_marker_key(normalized_text: str) -> str: return "" +def _outro_marker_score(normalized_text: str) -> tuple[int, float]: + if not normalized_text: + return 0, 0.0 + + matched = { + marker for marker in _OUTRO_HALLUCINATION_MARKERS if marker in normalized_text + } + marker_chars = sum(len(marker) for marker in matched) + return len(matched), marker_chars / len(normalized_text) + + def _dominant_repeated_unit(normalized_text: str) -> tuple[str, int, float]: """Return the dominant repeated short unit, repeat count, and coverage ratio.""" @@ -89,6 +112,10 @@ def _is_single_segment_hallucination(segment: dict[str, Any]) -> bool: if duration >= 3.0 and marker_count >= 2 and marker_ratio >= 0.55: return True + outro_count, outro_ratio = _outro_marker_score(normalized) + if 3.0 <= duration <= 30.0 and outro_count >= 3 and outro_ratio >= 0.40: + return True + unit, repeat_count, repeat_ratio = _dominant_repeated_unit(normalized) return ( bool(unit) and duration >= 12.0 and repeat_count >= 4 and repeat_ratio >= 0.82 diff --git a/doc/changelog.en.md b/doc/changelog.en.md index fcf573d..8533575 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -27,6 +27,9 @@ active job boundaries only clear the CUDA cache, while full GC remains on the idle-unload path. This avoids long GIL holds that can make `/healthz` time out after large alignment results complete. +- The ASR hallucination guard now filters short single-segment stock outros + dominated by markers such as "like / subscribe / repost / tip", while keeping + normal contextual words in longer meeting transcripts. ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29) diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index f594f58..735cc84 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -23,6 +23,9 @@ - 转写 job 前后不再执行完整 Python GC,只清理 CUDA cache;完整 GC 保留在 idle-unload 阶段,避免大段 alignment 结果完成后长时间持有 GIL 导致 `/healthz` 超时。 +- ASR hallucination guard 新增短单段 stock outro 过滤,拦截 + “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉,同时保留 + 长会议中的正常上下文词。 ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29) diff --git a/tests/unit/test_asr_repetition_guard.py b/tests/unit/test_asr_repetition_guard.py index 7d5adbf..96f0970 100644 --- a/tests/unit/test_asr_repetition_guard.py +++ b/tests/unit/test_asr_repetition_guard.py @@ -49,6 +49,37 @@ def test_suppresses_long_repeated_non_prompt_segment(): assert report["removed_duration"] == 20.0 +def test_suppresses_single_segment_stock_outro_hallucination(): + segments = [ + { + "start": 0.438, + "end": 18.091, + "text": "请不吝点赞 订阅 转发 打赏支持明镜与点点栏目", + } + ] + + filtered, report = suppress_repetition_hallucinations(segments) + + assert filtered == [] + assert report["removed_segment_count"] == 1 + assert report["removed_duration"] == 17.653 + + +def test_keeps_contextual_subscribe_word_in_normal_segment(): + segments = [ + { + "start": 0.0, + "end": 8.0, + "text": "这个功能里订阅提醒只是用户消息设置的一部分,后面还有支付通知。", + } + ] + + filtered, report = suppress_repetition_hallucinations(segments) + + assert filtered == segments + assert report["removed_segment_count"] == 0 + + def test_keeps_normal_short_repetition_below_hallucination_thresholds(): segments = [ {"start": 0.0, "end": 1.0, "text": "对"}, From 70dba4090925eb021b1ee5eb6ceca8f93c42c8e6 Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 03:57:18 +0800 Subject: [PATCH 06/10] fix: isolate whisperx alignment runtime --- .env.example | 5 ++ app/config.py | 1 + app/pipeline/orchestrator.py | 15 +++- app/providers/diarization/default.py | 89 +++++++++++++++++---- doc/changelog.en.md | 5 ++ doc/changelog.zh.md | 3 + doc/configuration.en.md | 1 + doc/configuration.zh.md | 1 + doc/quickstart.en.md | 15 ++-- doc/quickstart.zh.md | 12 +-- tests/unit/test_pipeline_model_lifecycle.py | 11 +++ tests/unit/test_provider_registry.py | 74 +++++++++++++++++ 12 files changed, 205 insertions(+), 27 deletions(-) diff --git a/.env.example b/.env.example index 0a5b034..d7f2cd9 100644 --- a/.env.example +++ b/.env.example @@ -86,6 +86,11 @@ HF_HUB_ETAG_TIMEOUT=3 # want to complete jobs without word-level timestamps for selected languages. WHISPERX_ALIGN_DISABLED_LANGUAGES= +# Alignment defaults to CPU to isolate wav2vec2 alignment from the GPU ASR and +# speaker-embedding runtimes. Set to pipeline/asr/cuda/cuda:0 only if you have +# validated the target CUDA stack is stable for WhisperX alignment. +WHISPERX_ALIGN_DEVICE=cpu + # Optional comma-separated language=model overrides. # Example: WHISPERX_ALIGN_MODEL_MAP=zh=your-org/your-zh-align-model WHISPERX_ALIGN_MODEL_MAP= diff --git a/app/config.py b/app/config.py index 7460c2e..9e1f665 100644 --- a/app/config.py +++ b/app/config.py @@ -96,6 +96,7 @@ def _env_mapping(name: str) -> dict[str, str]: # WhisperX forced-alignment controls. Languages are attempted by default; use # WHISPERX_ALIGN_DISABLED_LANGUAGES only for an explicit operational fallback. +WHISPERX_ALIGN_DEVICE: str = _env_str("WHISPERX_ALIGN_DEVICE", "cpu").lower() WHISPERX_ALIGN_DISABLED_LANGUAGES: frozenset[str] = _env_csv_set( "WHISPERX_ALIGN_DISABLED_LANGUAGES", "", diff --git a/app/pipeline/orchestrator.py b/app/pipeline/orchestrator.py index 1f6bec0..a808c2e 100644 --- a/app/pipeline/orchestrator.py +++ b/app/pipeline/orchestrator.py @@ -266,6 +266,10 @@ def __init__( self._whisper_device = None self._diarization_device = None self._embedding_device = None + self._alignment_cache_key = None + self._alignment_device = None + self._alignment_model = None + self._alignment_metadata = None self.model_size = model_size or WHISPER_MODEL self.hf_token = hf_token or HF_TOKEN self._whisper = None @@ -284,16 +288,25 @@ def runner(self) -> PipelineRunner: def has_loaded_models(self) -> bool: return any( getattr(self, name, None) is not None - for name in ("_whisper", "_diarization", "_embedding_model") + for name in ( + "_whisper", + "_diarization", + "_embedding_model", + "_alignment_model", + ) ) def unload_models(self) -> None: self._whisper = None self._diarization = None self._embedding_model = None + self._alignment_model = None + self._alignment_metadata = None + self._alignment_cache_key = None self._whisper_device = None self._diarization_device = None self._embedding_device = None + self._alignment_device = None def _select_device_for_lazy_load(self, device_attr: str) -> str: selected_device = getattr(self, device_attr, None) diff --git a/app/providers/diarization/default.py b/app/providers/diarization/default.py index 3f2021d..2afa3ab 100644 --- a/app/providers/diarization/default.py +++ b/app/providers/diarization/default.py @@ -12,6 +12,7 @@ from typing import Any from config import ( + WHISPERX_ALIGN_DEVICE, WHISPERX_ALIGN_CACHE_ONLY, WHISPERX_ALIGN_DISABLED_LANGUAGES, WHISPERX_ALIGN_MODEL_DIR, @@ -107,6 +108,35 @@ def _alignment_disabled(language: str) -> bool: ) +def _resolve_alignment_device(pipeline) -> str: + configured = (WHISPERX_ALIGN_DEVICE or "cpu").strip().lower() + if configured in {"pipeline", "asr"}: + return str(getattr(pipeline, "device", "cpu") or "cpu") + if configured == "auto": + selector = getattr(pipeline, "_select_device_for_lazy_load", None) + if callable(selector): + return str(selector("_alignment_device")) + return str(getattr(pipeline, "device", "cpu") or "cpu") + return configured or "cpu" + + +def _alignment_cache_key( + *, + language: str, + model_name: str | None, + model_source: str, + device: str, +) -> tuple[str, str | None, str, str | None, bool, str]: + return ( + language, + model_name, + model_source, + WHISPERX_ALIGN_MODEL_DIR, + WHISPERX_ALIGN_CACHE_ONLY, + device, + ) + + def _language_disabled_hint(language: str) -> str: return ( f"Remove {language} from WHISPERX_ALIGN_DISABLED_LANGUAGES to retry " @@ -248,32 +278,61 @@ def align_diarized_segments_with_metadata( preflight_message = _torch_preflight_message(language, model_name) if preflight_message: logger.info(preflight_message) + alignment_device = _resolve_alignment_device(pipeline) audio = whisperx.load_audio(audio_path) load_kwargs = _load_align_model_kwargs( whisperx.load_align_model, language, - pipeline.device, + alignment_device, ) - load_started = time.perf_counter() - with _cache_only_alignment_environment(): - align_model, align_metadata = whisperx.load_align_model( - **load_kwargs, - ) - logger.info( - "Loaded WhisperX alignment model in %.2fs " - "(cold_load=True, language=%s, model_source=%s, device=%s)", - time.perf_counter() - load_started, - language, - model_source, - pipeline.device, + cache_key = _alignment_cache_key( + language=language, + model_name=model_name, + model_source=model_source, + device=alignment_device, ) + cached_key = getattr(pipeline, "_alignment_cache_key", None) + align_model = getattr(pipeline, "_alignment_model", None) + align_metadata = getattr(pipeline, "_alignment_metadata", None) + if ( + cached_key == cache_key + and align_model is not None + and align_metadata is not None + ): + logger.info( + "Reusing WhisperX alignment model (hot reuse, language=%s, model_source=%s, device=%s)", + language, + model_source, + alignment_device, + ) + else: + setattr(pipeline, "_alignment_model", None) + setattr(pipeline, "_alignment_metadata", None) + setattr(pipeline, "_alignment_cache_key", None) + load_started = time.perf_counter() + with _cache_only_alignment_environment(): + align_model, align_metadata = whisperx.load_align_model( + **load_kwargs, + ) + setattr(pipeline, "_alignment_model", align_model) + setattr(pipeline, "_alignment_metadata", align_metadata) + setattr(pipeline, "_alignment_cache_key", cache_key) + setattr(pipeline, "_alignment_device", alignment_device) + logger.info( + "Loaded WhisperX alignment model in %.2fs " + "(cold_load=True, language=%s, model_source=%s, device=%s)", + time.perf_counter() - load_started, + language, + model_source, + alignment_device, + ) processing_started = time.perf_counter() aligned_result = whisperx.align( segments, align_model, align_metadata, audio, - pipeline.device, + alignment_device, return_char_alignments=False, ) processing_elapsed_s = time.perf_counter() - processing_started @@ -283,7 +342,7 @@ def align_diarized_segments_with_metadata( processing_elapsed_s, language, len(segments), - pipeline.device, + alignment_device, ) logger.info("WhisperX forced alignment succeeded for language=%s", language) metadata = { diff --git a/doc/changelog.en.md b/doc/changelog.en.md index 8533575..961d0c9 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -27,6 +27,11 @@ active job boundaries only clear the CUDA cache, while full GC remains on the idle-unload path. This avoids long GIL holds that can make `/healthz` time out after large alignment results complete. +- WhisperX forced-alignment models are now cached by language / model / device + and default to `WHISPERX_ALIGN_DEVICE=cpu`, isolating alignment from GPU ASR, + diarization, and embedding runtimes. Operators can explicitly set + `pipeline`, `asr`, `cuda`, or `cuda:0` after validating CUDA alignment + stability. - The ASR hallucination guard now filters short single-segment stock outros dominated by markers such as "like / subscribe / repost / tip", while keeping normal contextual words in longer meeting transcripts. diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index 735cc84..ba6d7fe 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -23,6 +23,9 @@ - 转写 job 前后不再执行完整 Python GC,只清理 CUDA cache;完整 GC 保留在 idle-unload 阶段,避免大段 alignment 结果完成后长时间持有 GIL 导致 `/healthz` 超时。 +- WhisperX forced alignment 模型现在按语言 / 模型 / 设备缓存复用,并默认通过 + `WHISPERX_ALIGN_DEVICE=cpu` 与 GPU ASR、diarization、embedding 运行时隔离; + 需要 CUDA alignment 时可显式设为 `pipeline`、`asr`、`cuda` 或 `cuda:0`。 - ASR hallucination guard 新增短单段 stock outro 过滤,拦截 “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉,同时保留 长会议中的正常上下文词。 diff --git a/doc/configuration.en.md b/doc/configuration.en.md index d9fb5b0..d490bbe 100644 --- a/doc/configuration.en.md +++ b/doc/configuration.en.md @@ -120,6 +120,7 @@ use `deepfilternet`; `noisereduce` runs whenever it is selected. | API `min_speakers` / `max_speakers` | `0` | Per-request speaker-count bounds. `0` means auto and is not passed to pyannote. | | `PYANNOTE_MIN_DURATION_OFF` | `0.5` | pyannote `_binarize.min_duration_off`, used to merge short pauses and reduce over-segmentation. If the pyannote object does not support it, the service logs a warning and continues. | | `WHISPERX_ALIGN_DISABLED_LANGUAGES` | empty | Comma-separated languages that skip forced alignment when no model override is present. Use only as a temporary operational fallback. | +| `WHISPERX_ALIGN_DEVICE` | `cpu` | Runtime device for WhisperX forced alignment. CPU is the default to isolate wav2vec2 alignment from GPU ASR / speaker-embedding runtimes; set to `pipeline` / `asr` / `cuda` / `cuda:0` only after validating CUDA alignment stability. | | `WHISPERX_ALIGN_MODEL_MAP` | empty | Comma-separated `lang=model` overrides, for example `zh=org/model`. | | `WHISPERX_ALIGN_MODEL_DIR` | empty | Optional alignment model directory; passed through only when the installed WhisperX supports that parameter. | | `WHISPERX_ALIGN_CACHE_ONLY` | `0` | When `1`, requests cache-only alignment model loading, only when supported by the installed WhisperX. | diff --git a/doc/configuration.zh.md b/doc/configuration.zh.md index 5a8ca32..88b50b3 100644 --- a/doc/configuration.zh.md +++ b/doc/configuration.zh.md @@ -112,6 +112,7 @@ v0.7.5 默认面向干净会议录音,因此 `DENOISE_MODEL=none`。只有噪 | API `min_speakers` / `max_speakers` | `0` | 请求级说话人数约束;`0` 表示自动,不传入 pyannote。 | | `PYANNOTE_MIN_DURATION_OFF` | `0.5` | pyannote `_binarize.min_duration_off`,用于合并短暂停顿、减少过度切分。若当前 pyannote 对象不支持该属性,服务会记录警告并继续运行。 | | `WHISPERX_ALIGN_DISABLED_LANGUAGES` | 空 | 逗号分隔语言列表;命中且没有模型覆盖时跳过 forced alignment。只建议作为临时降级开关。 | +| `WHISPERX_ALIGN_DEVICE` | `cpu` | WhisperX forced alignment 的运行设备。默认 CPU,用于隔离 wav2vec2 alignment 与 GPU ASR / speaker embedding 运行时;确认 CUDA alignment 稳定后可设为 `pipeline` / `asr` / `cuda` / `cuda:0`。 | | `WHISPERX_ALIGN_MODEL_MAP` | 空 | 逗号分隔 `lang=model` 覆盖,例如 `zh=org/model`。 | | `WHISPERX_ALIGN_MODEL_DIR` | 空 | 可选 alignment 模型目录;仅在当前 WhisperX 版本支持该参数时透传。 | | `WHISPERX_ALIGN_CACHE_ONLY` | `0` | 为 `1` 时,请求 WhisperX 只使用缓存加载 alignment 模型;仅在当前 WhisperX 版本支持时透传。 | diff --git a/doc/quickstart.en.md b/doc/quickstart.en.md index 0e327eb..9be0076 100644 --- a/doc/quickstart.en.md +++ b/doc/quickstart.en.md @@ -204,6 +204,7 @@ A few worth knowing about: | `MIN_EMBED_DURATION` | `1.5` | Minimum diarization turn duration used for speaker embedding extraction | | `MAX_EMBED_DURATION` | `10.0` | Maximum per-turn audio window used for speaker embedding extraction | | `WHISPERX_ALIGN_DISABLED_LANGUAGES` | empty | Comma-separated languages that explicitly skip WhisperX forced alignment; use only as a temporary operational fallback | +| `WHISPERX_ALIGN_DEVICE` | `cpu` | Runtime device for WhisperX forced alignment; CPU is the default to keep alignment isolated from GPU ASR / speaker-embedding runtimes | | `WHISPERX_ALIGN_MODEL_MAP` | empty | Comma-separated `lang=model` overrides, for example `zh=your-org/your-zh-align-model` | | `WHISPERX_ALIGN_MODEL_DIR` | empty | Optional alignment model cache directory passed through when the installed WhisperX supports it | | `WHISPERX_ALIGN_CACHE_ONLY` | `0` | Set to 1 to request cache-only alignment model loading when supported by the installed WhisperX version | @@ -217,12 +218,14 @@ For every supported setting, the Whisper / ASR parameters that are not exposed as env yet, and AS-norm cohort preservation semantics, see [`configuration.en.md`](./configuration.en.md). -Chinese word-level alignment is attempted by default. The Docker image uses -PyTorch 2.6.0 so recent transformers safety checks can load the default -Chinese `.bin` alignment weights. If you run a custom image with older torch, -use torch>=2.6 or a trusted replacement alignment model that provides -safetensors; only set `WHISPERX_ALIGN_DISABLED_LANGUAGES=zh` if you -intentionally want a temporary segment-level fallback. +Chinese word-level alignment is attempted by default and runs on CPU by +default to keep wav2vec2 alignment isolated from GPU ASR / speaker-embedding +runtimes. The Docker image uses PyTorch 2.6.0 so recent transformers safety +checks can load the default Chinese `.bin` alignment weights. If you run a +custom image with older torch, use torch>=2.6 or a trusted replacement +alignment model that provides safetensors; only set +`WHISPERX_ALIGN_DISABLED_LANGUAGES=zh` if you intentionally want a temporary +segment-level fallback. ### Host directory ownership diff --git a/doc/quickstart.zh.md b/doc/quickstart.zh.md index e02133e..a62634e 100644 --- a/doc/quickstart.zh.md +++ b/doc/quickstart.zh.md @@ -178,6 +178,7 @@ HF_ENDPOINT=https://hf-mirror.com | `MIN_EMBED_DURATION` | `1.5` | 提取 speaker embedding 时接受的最短 diarization turn 时长 | | `MAX_EMBED_DURATION` | `10.0` | 提取 speaker embedding 时单个 turn 使用的最长音频窗口 | | `WHISPERX_ALIGN_DISABLED_LANGUAGES` | 空 | 逗号分隔的显式跳过 forced alignment 语言;只建议作为临时运营降级开关 | +| `WHISPERX_ALIGN_DEVICE` | `cpu` | WhisperX forced alignment 运行设备;默认 CPU,避免 alignment 与 GPU ASR / speaker embedding 运行时互相影响 | | `WHISPERX_ALIGN_MODEL_MAP` | 空 | 逗号分隔的 `lang=model` 覆盖,例如 `zh=your-org/your-zh-align-model` | | `WHISPERX_ALIGN_MODEL_DIR` | 空 | 可选 alignment 模型缓存目录;当前 WhisperX 支持时会透传 | | `WHISPERX_ALIGN_CACHE_ONLY` | `0` | 设为 1 时,在当前 WhisperX 版本支持的情况下只从缓存加载 alignment 模型 | @@ -189,11 +190,12 @@ HF_ENDPOINT=https://hf-mirror.com 所有可用配置项、哪些 Whisper / ASR 参数尚未暴露为 env,以及 AS-norm cohort 保护语义,见 [`configuration.zh.md`](./configuration.zh.md)。 -中文词级 alignment 默认会尝试执行。Docker 镜像使用 PyTorch 2.6.0,可满足 -transformers 新安全检查对默认中文 `.bin` alignment 权重的加载要求。如果你使用 -自定义镜像且 torch 低于 2.6,请升级到 torch>=2.6,或改用提供 safetensors 的可信 -替代 alignment 模型;只有确认要临时降级到段级时间戳时,才设置 -`WHISPERX_ALIGN_DISABLED_LANGUAGES=zh`。 +中文词级 alignment 默认会尝试执行,并默认在 CPU 上运行,以避免 wav2vec2 +alignment 与 GPU ASR / speaker embedding 运行时互相影响。Docker 镜像使用 +PyTorch 2.6.0,可满足 transformers 新安全检查对默认中文 `.bin` alignment 权重的 +加载要求。如果你使用自定义镜像且 torch 低于 2.6,请升级到 torch>=2.6,或改用 +提供 safetensors 的可信替代 alignment 模型;只有确认要临时降级到段级时间戳时, +才设置 `WHISPERX_ALIGN_DISABLED_LANGUAGES=zh`。 ### 宿主目录所有者 diff --git a/tests/unit/test_pipeline_model_lifecycle.py b/tests/unit/test_pipeline_model_lifecycle.py index 29377f9..dbfba9c 100644 --- a/tests/unit/test_pipeline_model_lifecycle.py +++ b/tests/unit/test_pipeline_model_lifecycle.py @@ -24,6 +24,10 @@ def _new_pipeline(*, device="cuda"): pipeline._whisper_device = None pipeline._diarization_device = None pipeline._embedding_device = None + pipeline._alignment_cache_key = None + pipeline._alignment_device = None + pipeline._alignment_model = None + pipeline._alignment_metadata = None pipeline.model_size = "tiny" pipeline.hf_token = None pipeline._whisper = None @@ -49,6 +53,9 @@ def test_unload_models_drops_loaded_references_without_selecting_device(monkeypa pipeline._whisper = object() pipeline._diarization = object() pipeline._embedding_model = object() + pipeline._alignment_model = object() + pipeline._alignment_metadata = object() + pipeline._alignment_cache_key = ("zh", None, "default", None, False, "cpu") calls = [] monkeypatch.setattr( @@ -65,6 +72,9 @@ def test_unload_models_drops_loaded_references_without_selecting_device(monkeypa assert pipeline._whisper is None assert pipeline._diarization is None assert pipeline._embedding_model is None + assert pipeline._alignment_model is None + assert pipeline._alignment_metadata is None + assert pipeline._alignment_cache_key is None assert calls == [] @@ -333,6 +343,7 @@ def test_unload_models_clears_per_model_devices_and_reload_reselects(monkeypatch assert pipeline._whisper_device is None assert pipeline._diarization_device is None assert pipeline._embedding_device is None + assert pipeline._alignment_device is None _ = pipeline.whisper assert pipeline._whisper_device == "cuda:1" diff --git a/tests/unit/test_provider_registry.py b/tests/unit/test_provider_registry.py index 8ed7d4d..711376b 100644 --- a/tests/unit/test_provider_registry.py +++ b/tests/unit/test_provider_registry.py @@ -612,6 +612,80 @@ def fake_load_align_model(language_code, device): assert result.dedup_removed == 0 +def test_default_diarization_provider_caches_alignment_model_on_configured_device( + monkeypatch, + caplog, +): + pipeline = TranscriptionPipeline.__new__(TranscriptionPipeline) + pipeline.device = "cuda:1" + calls = [] + + class FakeDiarizationResult: + def itertracks(self, yield_label=False): + assert yield_label is True + yield SimpleNamespace(start=0.0, end=1.2), None, "SPEAKER_00" + + class FakeDiarizer: + def __call__(self, audio_path, **kwargs): + return FakeDiarizationResult() + + pipeline._diarization = FakeDiarizer() + monkeypatch.setattr(diarization_default, "WHISPERX_ALIGN_DEVICE", "cpu") + whisperx = sys.modules["whisperx"] + monkeypatch.setattr( + whisperx, + "load_audio", + lambda audio_path: f"audio:{audio_path}", + raising=False, + ) + + def fake_load_align_model(language_code, device): + calls.append(("load_align_model", language_code, device)) + return object(), {"language": language_code, "device": device} + + def fake_align( + segments, + align_model, + align_metadata, + audio, + device, + return_char_alignments=False, + ): + calls.append(("align", align_metadata["device"], device)) + return {"segments": segments} + + monkeypatch.setattr( + whisperx, + "load_align_model", + fake_load_align_model, + raising=False, + ) + monkeypatch.setattr(whisperx, "align", fake_align, raising=False) + + request = DiarizationRequest( + pipeline=pipeline, + audio_path="demo.wav", + transcription_result={ + "segments": [{"start": 0.0, "end": 1.2, "text": "你好"}], + "language": "zh", + }, + ) + + with caplog.at_level("INFO", logger=diarization_default.logger.name): + default_diarization_provider.diarize(request) + default_diarization_provider.diarize(request) + + assert calls == [ + ("load_align_model", "zh", "cpu"), + ("align", "cpu", "cpu"), + ("align", "cpu", "cpu"), + ] + assert "Loaded WhisperX alignment model" in caplog.text + assert "cold_load=True" in caplog.text + assert "Reusing WhisperX alignment model (hot reuse" in caplog.text + assert pipeline._alignment_device == "cpu" + + def test_default_diarization_provider_skips_zh_alignment_when_explicitly_disabled( monkeypatch, ): From 047c7eac016d7b8e81ba31889bbf30ae6a284992 Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 05:18:52 +0800 Subject: [PATCH 07/10] fix: widen stock outro hallucination guard --- app/providers/asr/default.py | 2 +- doc/changelog.en.md | 5 +++-- doc/changelog.zh.md | 4 ++-- tests/unit/test_asr_repetition_guard.py | 16 ++++++++++++++++ 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/app/providers/asr/default.py b/app/providers/asr/default.py index 37dd7a3..e37a49c 100644 --- a/app/providers/asr/default.py +++ b/app/providers/asr/default.py @@ -113,7 +113,7 @@ def _is_single_segment_hallucination(segment: dict[str, Any]) -> bool: return True outro_count, outro_ratio = _outro_marker_score(normalized) - if 3.0 <= duration <= 30.0 and outro_count >= 3 and outro_ratio >= 0.40: + if 3.0 <= duration <= 60.0 and outro_count >= 3 and outro_ratio >= 0.40: return True unit, repeat_count, repeat_ratio = _dominant_repeated_unit(normalized) diff --git a/doc/changelog.en.md b/doc/changelog.en.md index 961d0c9..a5b4f4f 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -33,8 +33,9 @@ `pipeline`, `asr`, `cuda`, or `cuda:0` after validating CUDA alignment stability. - The ASR hallucination guard now filters short single-segment stock outros - dominated by markers such as "like / subscribe / repost / tip", while keeping - normal contextual words in longer meeting transcripts. + dominated by markers such as "like / subscribe / repost / tip", including raw + ASR segments that are slightly over 30 seconds, while keeping normal + contextual words in longer meeting transcripts. ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29) diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index ba6d7fe..7e67ea8 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -27,8 +27,8 @@ `WHISPERX_ALIGN_DEVICE=cpu` 与 GPU ASR、diarization、embedding 运行时隔离; 需要 CUDA alignment 时可显式设为 `pipeline`、`asr`、`cuda` 或 `cuda:0`。 - ASR hallucination guard 新增短单段 stock outro 过滤,拦截 - “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉,同时保留 - 长会议中的正常上下文词。 + “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉,并覆盖 + ASR 原始段略超过 30 秒的边界样本,同时保留长会议中的正常上下文词。 ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29) diff --git a/tests/unit/test_asr_repetition_guard.py b/tests/unit/test_asr_repetition_guard.py index 96f0970..1ae9c0b 100644 --- a/tests/unit/test_asr_repetition_guard.py +++ b/tests/unit/test_asr_repetition_guard.py @@ -65,6 +65,22 @@ def test_suppresses_single_segment_stock_outro_hallucination(): assert report["removed_duration"] == 17.653 +def test_suppresses_stock_outro_when_raw_asr_segment_is_slightly_over_30s(): + segments = [ + { + "start": 0.0, + "end": 30.36, + "text": "请不吝点赞 订阅 转发 打赏支持明镜与点点栏目", + } + ] + + filtered, report = suppress_repetition_hallucinations(segments) + + assert filtered == [] + assert report["removed_segment_count"] == 1 + assert report["removed_duration"] == 30.36 + + def test_keeps_contextual_subscribe_word_in_normal_segment(): segments = [ { From 4839fed3af240552c92aa313f58d76680742cb06 Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 06:15:12 +0800 Subject: [PATCH 08/10] fix: stabilize embedding audio slicing --- app/providers/embedding/default.py | 62 ++++++++++++---- app/requirements.txt | 1 + doc/changelog.en.md | 4 ++ doc/changelog.zh.md | 3 + tests/unit/test_provider_registry.py | 102 ++++++++++++++++++++++++++- 5 files changed, 156 insertions(+), 16 deletions(-) diff --git a/app/providers/embedding/default.py b/app/providers/embedding/default.py index 15ec0f7..86f8b59 100644 --- a/app/providers/embedding/default.py +++ b/app/providers/embedding/default.py @@ -6,6 +6,8 @@ import time import numpy as np +import soundfile as sf +import torch import torchaudio from config import MAX_EMBED_DURATION, MIN_EMBED_DURATION @@ -18,6 +20,22 @@ logger = logging.getLogger(__name__) +def _load_full_waveform(audio_path: str): + """Load normalized audio once with libsndfile to avoid per-turn torch decode.""" + + load_started = time.perf_counter() + data, sample_rate = sf.read(audio_path, dtype="float32", always_2d=True) + waveform = torch.from_numpy(data.T.copy()) + logger.info( + "embedding_audio_load_timing backend=soundfile elapsed_s=%.3f sample_rate=%d channels=%d frames=%d", + time.perf_counter() - load_started, + sample_rate, + waveform.shape[0], + waveform.shape[1], + ) + return waveform, sample_rate + + def extract_embeddings_for_turns( pipeline, audio_path: str, @@ -25,8 +43,16 @@ def extract_embeddings_for_turns( ) -> dict[str, np.ndarray]: """Extract averaged embeddings for each speaker cluster.""" - info = torchaudio.info(audio_path) - native_sr = info.sample_rate + waveform = None + try: + waveform, native_sr = _load_full_waveform(audio_path) + except Exception as exc: + logger.warning( + "Falling back to torchaudio segment loading for embedding audio: %s", + exc, + ) + info = torchaudio.info(audio_path) + native_sr = info.sample_rate target_sr = 16000 min_samples = int(MIN_EMBED_DURATION * native_sr) max_samples = int(MAX_EMBED_DURATION * native_sr) @@ -43,19 +69,25 @@ def extract_embeddings_for_turns( if num_frames > max_samples: num_frames = max_samples - try: - chunk, chunk_sr = torchaudio.load( - audio_path, - frame_offset=start_sample, - num_frames=num_frames, - ) - except Exception as exc: - logger.warning( - "Failed to load embedding audio segment [%d:%d]: %s", - start_sample, - end_sample, - exc, - ) + if waveform is not None: + chunk = waveform[:, start_sample : start_sample + num_frames].contiguous() + chunk_sr = native_sr + else: + try: + chunk, chunk_sr = torchaudio.load( + audio_path, + frame_offset=start_sample, + num_frames=num_frames, + ) + except Exception as exc: + logger.warning( + "Failed to load embedding audio segment [%d:%d]: %s", + start_sample, + end_sample, + exc, + ) + continue + if chunk.shape[1] <= 0: continue if chunk_sr != target_sr: diff --git a/app/requirements.txt b/app/requirements.txt index aceccd1..80cf15e 100755 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -32,6 +32,7 @@ pandas>=2.2,<3.0 huggingface_hub>=0.20.0 matplotlib>=3.7 pydub>=0.25.1 +soundfile>=0.13.0,<0.14.0 # Stay on SciPy 1.11.x to avoid dependency/license drift while avoiding the # 1.11.0 sdist GPL-hit file: scipy/_lib/unuran/unuran/src/specfunct/log1p.c. scipy>=1.11.4,<1.12.0 diff --git a/doc/changelog.en.md b/doc/changelog.en.md index a5b4f4f..3123c78 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -36,6 +36,10 @@ dominated by markers such as "like / subscribe / repost / tip", including raw ASR segments that are slightly over 30 seconds, while keeping normal contextual words in longer meeting transcripts. +- The embedding stage now reads the normalized WAV once and slices it by + diarization turns, avoiding repeated torchaudio native decoding for every + turn. It falls back to the previous segmented loader on read failure and adds + aggregate `embedding_audio_load_timing` logs. ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29) diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index 7e67ea8..ec23d5b 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -29,6 +29,9 @@ - ASR hallucination guard 新增短单段 stock outro 过滤,拦截 “点赞 / 订阅 / 转发 / 打赏”等多个短视频尾巴标记高度集中的非重复幻觉,并覆盖 ASR 原始段略超过 30 秒的边界样本,同时保留长会议中的正常上下文词。 +- embedding 阶段优先一次性读取规范化 WAV 后按 diarization turn 切片,避免每个 + turn 反复走 torchaudio 原生解码路径;读取失败时仍回退到旧的分段加载,并新增 + `embedding_audio_load_timing` 聚合日志。 ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29) diff --git a/tests/unit/test_provider_registry.py b/tests/unit/test_provider_registry.py index 711376b..e5e0bd7 100644 --- a/tests/unit/test_provider_registry.py +++ b/tests/unit/test_provider_registry.py @@ -951,7 +951,7 @@ def test_default_embedding_provider_moves_chunks_to_embedding_device( pipeline.device = "cuda:0" pipeline._embedding_device = "cuda:1" calls = [] - perf_values = iter([30.0, 30.75]) + perf_values = iter([29.0, 30.0, 30.75]) class FakeTensor: def __init__(self, channels, frames): @@ -1008,6 +1008,106 @@ class FakeInfo: ] +def test_default_embedding_provider_prefers_single_soundfile_load(monkeypatch): + pipeline = TranscriptionPipeline.__new__(TranscriptionPipeline) + pipeline.device = "cpu" + calls = [] + + class FakeTensor: + def __init__(self, channels, frames): + self.shape = (channels, frames) + + def __getitem__(self, key): + channel_key, frame_key = key + assert isinstance(channel_key, slice) + assert channel_key == slice(None) + start = frame_key.start or 0 + stop = frame_key.stop or self.shape[1] + return FakeTensor(self.shape[0], max(stop - start, 0)) + + def contiguous(self): + calls.append(("contiguous", self.shape[1])) + return self + + def mean(self, dim=0, keepdim=True): + assert dim == 0 + assert keepdim is True + return FakeTensor(1, self.shape[1]) + + def to(self, device): + calls.append(("to", device, self.shape[1])) + return self + + class FakeEmbeddingModel: + def __call__(self, payload): + calls.append(("embedding_model", payload["waveform"].shape[1])) + return [float(payload["waveform"].shape[1]), 2.0] + + class FakeArray: + def __init__(self, shape): + self.shape = shape + + @property + def T(self): + return FakeArray(tuple(reversed(self.shape))) + + def copy(self): + return self + + pipeline._embedding_model = FakeEmbeddingModel() + monkeypatch.setattr( + embedding_default.sf, + "read", + lambda audio_path, dtype, always_2d: ( + FakeArray((48000, 1)), + 16000, + ), + raising=False, + ) + monkeypatch.setattr( + embedding_default.torch, + "from_numpy", + lambda data: calls.append(("from_numpy", data.shape)) or FakeTensor(1, 48000), + raising=False, + ) + monkeypatch.setattr( + embedding_default.torchaudio, + "info", + lambda audio_path: (_ for _ in ()).throw( + AssertionError("torchaudio.info should not be used for canonical audio") + ), + ) + monkeypatch.setattr( + embedding_default.torchaudio, + "load", + lambda *args, **kwargs: (_ for _ in ()).throw( + AssertionError("torchaudio.load should not be used for canonical audio") + ), + ) + + result = default_speaker_embedding_provider.extract_embeddings( + SpeakerEmbeddingRequest( + pipeline=pipeline, + audio_path="demo.wav", + diarization_turns=[ + {"speaker": "SPEAKER_00", "start": 0.0, "end": 2.0}, + {"speaker": "SPEAKER_00", "start": 2.0, "end": 4.0}, + ], + ) + ) + + assert result.speaker_embeddings["SPEAKER_00"].tolist() == [32000.0, 2.0] + assert calls == [ + ("from_numpy", (1, 48000)), + ("contiguous", 32000), + ("contiguous", 32000), + ("to", "cpu", 32000), + ("embedding_model", 32000), + ("to", "cpu", 32000), + ("embedding_model", 32000), + ] + + def test_default_embedding_provider_uses_selected_device_after_first_lazy_load( monkeypatch, ): From 64496d5b967891643fcaedc1c7aca3f680473f80 Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 12:01:46 +0800 Subject: [PATCH 09/10] chore: release 0.7.6 --- app/config.py | 2 +- doc/changelog.en.md | 21 ++++++++++++++++----- doc/changelog.zh.md | 19 +++++++++++++++---- doc/configuration.en.md | 19 ++++++++++++++----- doc/configuration.zh.md | 17 ++++++++++++----- doc/security.en.md | 2 +- doc/security.zh.md | 2 +- tests/unit/test_main_lifespan.py | 4 ++-- 8 files changed, 62 insertions(+), 24 deletions(-) diff --git a/app/config.py b/app/config.py index 9e1f665..90c8e2a 100644 --- a/app/config.py +++ b/app/config.py @@ -9,7 +9,7 @@ from pathlib import Path -APP_VERSION = "0.7.5" +APP_VERSION = "0.7.6" def _env_float(name: str, default: float) -> float: diff --git a/doc/changelog.en.md b/doc/changelog.en.md index 3123c78..dc40464 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -4,6 +4,8 @@ ## Unreleased +## 0.7.6 — Health, alignment, and embedding runtime fixes (2026-05-07) + ### Security - Updated the dependency security baseline and FOSSA policy test coverage to @@ -17,12 +19,8 @@ SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups; the `nltk` package needed by alignment is installed explicitly. -### Observability +### Reliability -- Added safe model-load and transcription-stage timing logs for ASR, - diarization, embedding, voiceprint match, enhancement, and pipeline stage - timing. Logs record only stage, model, elapsed time, and aggregate metrics; - they do not include filenames, paths, job IDs, speaker IDs, hosts, or tokens. - Transcription jobs no longer run full Python GC before/after every GPU job; active job boundaries only clear the CUDA cache, while full GC remains on the idle-unload path. This avoids long GIL holds that can make `/healthz` time out @@ -41,6 +39,19 @@ turn. It falls back to the previous segmented loader on read failure and adds aggregate `embedding_audio_load_timing` logs. +### Observability + +- Added safe model-load and transcription-stage timing logs for ASR, + diarization, embedding, voiceprint match, enhancement, and pipeline stage + timing. Logs record only stage, model, elapsed time, and aggregate metrics; + they do not include filenames, paths, job IDs, speaker IDs, hosts, or tokens. + +### Validation + +- Internal live validation covered 0.7.6 health stability during GPU cleanup, + the WhisperX alignment runtime, the stock outro hallucination guard, and the + embedding audio slicing / single soundfile load path. + ## 0.7.5 — Idle GPU model unload and CI quality gates (2026-04-29) ### Bug Fixes diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index ec23d5b..f33bc1a 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -4,6 +4,8 @@ ## Unreleased +## 0.7.6 — 健康检查、alignment 与 embedding 运行时修复 (2026-05-07) + ### 安全 - 更新依赖安全基线与 FOSSA policy test 覆盖,收紧公开依赖扫描流程。 @@ -15,11 +17,8 @@ `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库; alignment 所需的 `nltk` 由运行时依赖显式安装。 -### 可观测性 +### 可靠性 -- 新增安全的模型加载与转写阶段耗时日志,覆盖 ASR、diarization、embedding、 - voiceprint match、enhancement 与 pipeline stage timing。日志只记录阶段、模型、 - 耗时和聚合指标,不记录文件名、路径、job ID、speaker ID、host 或 token。 - 转写 job 前后不再执行完整 Python GC,只清理 CUDA cache;完整 GC 保留在 idle-unload 阶段,避免大段 alignment 结果完成后长时间持有 GIL 导致 `/healthz` 超时。 @@ -33,6 +32,18 @@ turn 反复走 torchaudio 原生解码路径;读取失败时仍回退到旧的分段加载,并新增 `embedding_audio_load_timing` 聚合日志。 +### 可观测性 + +- 新增安全的模型加载与转写阶段耗时日志,覆盖 ASR、diarization、embedding、 + voiceprint match、enhancement 与 pipeline stage timing。日志只记录阶段、模型、 + 耗时和聚合指标,不记录文件名、路径、job ID、speaker ID、host 或 token。 + +### 验证 + +- internal live validation 覆盖 0.7.6 的 GPU cleanup 期间健康检查稳定性、 + WhisperX alignment runtime、stock outro hallucination guard,以及 embedding + audio slicing / single soundfile load 路径。 + ## 0.7.5 — GPU 模型空闲卸载与 CI 质量门禁 (2026-04-29) ### Bug 修复 diff --git a/doc/configuration.en.md b/doc/configuration.en.md index d490bbe..d0185f5 100644 --- a/doc/configuration.en.md +++ b/doc/configuration.en.md @@ -2,7 +2,7 @@ [简体中文](./configuration.zh.md) | **English** -This is the public configuration index for VoScript v0.7.5. It covers the +This is the public configuration index for VoScript v0.7.6. It covers the environment variables that the current code reads, the per-request override semantics of `POST /api/transcribe`, and internal defaults that are documented for operators but are not stable public knobs yet. Do not assume a Whisper, @@ -39,7 +39,7 @@ parameters yet. | `JOBS_MAX_CACHE` | `200` | In-memory job LRU limit. Evicted completed jobs remain queryable from disk `status.json` / `result.json`. | | `MODEL_IDLE_TIMEOUT_SEC` | `180` | GPU model idle-unload timeout, defaulting to 180 seconds (3 minutes). Set `0` to disable idle unload and keep models resident. When enabled, loaded models are released only after the serialized GPU runtime has been idle for this many seconds; on the next reload, ASR, diarization, and embedding each choose the visible CUDA device with the most free memory during their own lazy load. | -`MODELS_DIR` and `LANGUAGE` are defined in the config module, but v0.7.5's main +`MODELS_DIR` and `LANGUAGE` are defined in the config module, but v0.7.6's main HTTP transcription path does not use them as stable public tuning knobs: Whisper local checkpoint lookup still expects `/models/faster-whisper-`, and default language should be controlled with the request `language` field or @@ -96,7 +96,7 @@ cache is incomplete. Current internal ASR defaults are `beam_size=5`, `vad_filter=True`, `vad_parameters.min_silence_duration_ms=500`, and `condition_on_previous_text=False`. -These do not have env or API fields in v0.7.5. Do not configure nonexistent +These do not have env or API fields in v0.7.6. Do not configure nonexistent variables such as `WHISPER_BEAM_SIZE`, `WHISPER_COMPUTE_TYPE`, or `WHISPER_VAD_*`. ## Denoising @@ -108,7 +108,7 @@ variables such as `WHISPER_BEAM_SIZE`, `WHISPER_COMPUTE_TYPE`, or `WHISPER_VAD_* | API `denoise_model` | omitted | Omitted means inherit `DENOISE_MODEL`; explicit `none` disables denoising for this job only. | | API `snr_threshold` | omitted | Omitted means inherit `DENOISE_SNR_THRESHOLD`; explicit values override the DeepFilterNet SNR gate for this job only. | -v0.7.5 defaults to `DENOISE_MODEL=none` for clean meeting-recorder audio. Enable +v0.7.6 defaults to `DENOISE_MODEL=none` for clean meeting-recorder audio. Enable `deepfilternet` or `noisereduce` only for noisy environments, either per job or as a service default. If you need clean recordings to be skipped automatically, use `deepfilternet`; `noisereduce` runs whenever it is selected. @@ -170,7 +170,7 @@ Cohort lifecycle: files to build and save a cohort. - After each enroll / update, the background `cohort-rebuild` thread wakes every 60 seconds and rebuilds after the latest enrollment is at least 30 seconds old. -- v0.7.5 protects larger loaded or persisted cohorts during automatic rebuilds: +- v0.7.6 protects larger loaded or persisted cohorts during automatic rebuilds: clearing transcription results, having only a few embeddings, or having fewer source embeddings than the current cohort will not shrink the cohort automatically. - `POST /api/voiceprints/rebuild-cohort` is an explicit manual rebuild and uses @@ -200,6 +200,15 @@ Stable anchors in completed transcription results: New fields are added under the optional-field principle. Clients should ignore unknown fields and tolerate missing `words`, `alignment`, and `warning`. +## v0.7.6 Validation Wording + +v0.7.6 has internal live validation covering `/healthz` availability during GPU +cleanup, WhisperX forced-alignment runtime isolation and model reuse, short +single-segment stock outro hallucination filtering, and the embedding path that +loads the normalized WAV once and slices it by diarization turns. Public +documentation records only these behavior categories, not real task names, +sample names, job IDs, speaker IDs, hosts, logs, or paths. + ## v0.7.4 Validation Wording v0.7.4 has internal live validation covering transcription cleanup while diff --git a/doc/configuration.zh.md b/doc/configuration.zh.md index 88b50b3..b2cfd12 100644 --- a/doc/configuration.zh.md +++ b/doc/configuration.zh.md @@ -2,7 +2,7 @@ **简体中文** | [English](./configuration.en.md) -本文是 VoScript v0.7.5 的公开配置索引,覆盖当前代码已经读取并生效的 +本文是 VoScript v0.7.6 的公开配置索引,覆盖当前代码已经读取并生效的 环境变量、`POST /api/transcribe` 的请求级覆盖语义,以及还没有暴露为稳定 配置项的内部默认值。没有在本文列出的 Whisper / diarization / AS-norm 变量, 不要假定已经可用。 @@ -37,7 +37,7 @@ | `JOBS_MAX_CACHE` | `200` | 内存 job LRU 上限;被淘汰的完成任务仍可从磁盘 `status.json` / `result.json` 查询。 | | `MODEL_IDLE_TIMEOUT_SEC` | `180` | GPU 模型空闲卸载超时,默认 180 秒(3 分钟)。设为 `0` 可关闭空闲卸载并保持模型常驻。开启后,只有串行 GPU 运行时空闲达到该秒数才释放已加载模型;下一次 reload 时 ASR、diarization 和 embedding 会在各自 lazy load 时分别选择当前可见 CUDA 中空闲显存最多的设备。 | -`MODELS_DIR` 和 `LANGUAGE` 在配置模块里有定义,但 v0.7.5 的主 HTTP 转写路径 +`MODELS_DIR` 和 `LANGUAGE` 在配置模块里有定义,但 v0.7.6 的主 HTTP 转写路径 没有把它们作为稳定公开调参入口使用:Whisper 本地 checkpoint 查找仍使用 `/models/faster-whisper-`,语言默认请通过请求字段 `language` 控制或留空自动检测。 @@ -89,7 +89,7 @@ Hugging Face snapshot,缓存不完整时再走 Hub。 当前内部 ASR 默认值:`beam_size=5`、`vad_filter=True`、 `vad_parameters.min_silence_duration_ms=500`、`condition_on_previous_text=False`。 -这些值在 v0.7.5 还没有对应 env 或 API 字段;不要写 `WHISPER_BEAM_SIZE`、 +这些值在 v0.7.6 还没有对应 env 或 API 字段;不要写 `WHISPER_BEAM_SIZE`、 `WHISPER_COMPUTE_TYPE`、`WHISPER_VAD_*` 之类未实现配置。 ## 降噪 @@ -101,7 +101,7 @@ Hugging Face snapshot,缓存不完整时再走 Hub。 | API `denoise_model` | 省略 | 省略表示继承 `DENOISE_MODEL`;显式传 `none` 表示只对本次任务关闭降噪。 | | API `snr_threshold` | 省略 | 省略表示继承 `DENOISE_SNR_THRESHOLD`;显式传值只覆盖本次任务的 DeepFilterNet SNR gate。 | -v0.7.5 默认面向干净会议录音,因此 `DENOISE_MODEL=none`。只有噪声环境才建议按任务 +v0.7.6 默认面向干净会议录音,因此 `DENOISE_MODEL=none`。只有噪声环境才建议按任务 或服务级启用 `deepfilternet` / `noisereduce`。如需“干净录音自动跳过”,请选择 `deepfilternet`;`noisereduce` 一旦被选择就会运行。 @@ -159,7 +159,7 @@ cohort 生命周期: - 否则扫描持久化转写结果和 `emb_*.npy` 构建并保存 cohort。 - 每次 enroll / update 后,后台 `cohort-rebuild` 线程每 60 秒检查一次,在最近一次 enroll 至少过去 30 秒后自动重建。 -- v0.7.5 的后台自动重建会保护更大的已加载或已持久化 cohort:清空转写结果、 +- v0.7.6 的后台自动重建会保护更大的已加载或已持久化 cohort:清空转写结果、 只有少量 embedding,或源数量少于现有 cohort 时,不会自动缩小 cohort。 - `POST /api/voiceprints/rebuild-cohort` 是显式手动重建,仍按当前可用 embedding 立即生成新 cohort。 @@ -184,6 +184,13 @@ cohort 生命周期: 新增字段按可选字段原则扩展;客户端应忽略不认识的字段,并容忍 `words` / `alignment` / `warning` 缺失。 +## v0.7.6 验证口径 + +v0.7.6 已用 internal live validation 覆盖:GPU cleanup 期间 `/healthz` 仍保持可用、 +WhisperX forced alignment 运行时隔离与模型复用、短单段 stock outro 幻觉过滤,以及 +embedding 单次读取规范化 WAV 后按 diarization turn 切片的路径。公开文档只记录行为 +类别,不发布真实任务名、样本名、job id、speaker id、主机、日志或路径。 + ## v0.7.4 验证口径 v0.7.4 已用内部 live validation 验证:清空持久化转写结果后,只要既有声纹库和已加载 / diff --git a/doc/security.en.md b/doc/security.en.md index 3cc2ec7..3331141 100644 --- a/doc/security.en.md +++ b/doc/security.en.md @@ -22,7 +22,7 @@ Treat the service as if it were an internal database. ## Built-in hardening (on by default) -As of 0.6.0 the following protections are in place out of the box: +As of 0.7.6 the following protections are in place out of the box: 1. **Container runs as a non-root user.** The Dockerfile creates an `app` user (uid/gid 1000 by default, overridable via `APP_UID`/ diff --git a/doc/security.zh.md b/doc/security.zh.md index ededcb6..85d18f4 100644 --- a/doc/security.zh.md +++ b/doc/security.zh.md @@ -19,7 +19,7 @@ ## 内置的硬化(默认启用) -当前版本(0.6.0)默认开启以下保护: +当前版本(0.7.6)默认开启以下保护: 1. **容器以非 root 用户运行**。Dockerfile 创建 `app` 用户(uid/gid 1000, 可通过 `APP_UID`/`APP_GID` 覆盖),`USER app`。即使服务代码被 RCE, diff --git a/tests/unit/test_main_lifespan.py b/tests/unit/test_main_lifespan.py index e6ce072..7346c60 100644 --- a/tests/unit/test_main_lifespan.py +++ b/tests/unit/test_main_lifespan.py @@ -54,8 +54,8 @@ def test_rebuild_thread_alive_during_lifespan(app_client): assert thread.daemon, "cohort-rebuild thread must be a daemon thread" -def test_openapi_version_reports_075(app_client): - assert app_client.app.version == "0.7.5" +def test_openapi_version_reports_076(app_client): + assert app_client.app.version == "0.7.6" def test_rebuild_thread_survives_tick_exception(app_client, monkeypatch): From 439a5d269d292872c8035253a81c290fdc279690 Mon Sep 17 00:00:00 2001 From: Maple Gao Date: Thu, 7 May 2026 12:50:09 +0800 Subject: [PATCH 10/10] fix: remove NLTK runtime dependency --- .github/workflows/claude-code-review.yml | 1 + app/nltk/__init__.py | 1 + app/nltk/tokenize/__init__.py | 1 + app/nltk/tokenize/punkt.py | 89 +++++++++++++++++++ app/requirements.txt | 5 +- doc/changelog.en.md | 3 +- doc/changelog.zh.md | 3 +- .../unit/test_dependency_runtime_baseline.py | 4 +- tests/unit/test_whisperx_punkt_shim.py | 26 ++++++ 9 files changed, 128 insertions(+), 5 deletions(-) create mode 100644 app/nltk/__init__.py create mode 100644 app/nltk/tokenize/__init__.py create mode 100644 app/nltk/tokenize/punkt.py create mode 100644 tests/unit/test_whisperx_punkt_shim.py diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index 24b6f66..a47459c 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -52,5 +52,6 @@ jobs: synchronized English/Chinese documentation. Avoid formatting-only comments. claude_args: | --model ${{ env.CLAUDE_MODEL }} + --max-turns 30 env: ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }} diff --git a/app/nltk/__init__.py b/app/nltk/__init__.py new file mode 100644 index 0000000..c272975 --- /dev/null +++ b/app/nltk/__init__.py @@ -0,0 +1 @@ +"""Minimal NLTK compatibility surface required by WhisperX alignment.""" diff --git a/app/nltk/tokenize/__init__.py b/app/nltk/tokenize/__init__.py new file mode 100644 index 0000000..fadecc8 --- /dev/null +++ b/app/nltk/tokenize/__init__.py @@ -0,0 +1 @@ +"""Tokenization compatibility helpers for WhisperX.""" diff --git a/app/nltk/tokenize/punkt.py b/app/nltk/tokenize/punkt.py new file mode 100644 index 0000000..5c43679 --- /dev/null +++ b/app/nltk/tokenize/punkt.py @@ -0,0 +1,89 @@ +"""Small Punkt-compatible sentence span tokenizer for WhisperX. + +WhisperX 3.3.1 imports ``PunktParameters`` and ``PunktSentenceTokenizer`` only +to split an already bounded segment into sentence spans. Pulling the full NLTK +distribution into the runtime introduces unrelated data/license surface, so this +module implements the small API shape WhisperX uses. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from typing import Iterable + + +@dataclass +class PunktParameters: + """Subset of NLTK's PunktParameters used by WhisperX.""" + + abbrev_types: set[str] = field(default_factory=set) + + +class PunktSentenceTokenizer: + """Sentence span splitter compatible with WhisperX's use of NLTK Punkt.""" + + _TERMINATORS = {".", "!", "?", "。", "!", "?"} + + def __init__(self, params: PunktParameters | None = None) -> None: + self.params = params or PunktParameters() + + def span_tokenize(self, text: str) -> Iterable[tuple[int, int]]: + """Yield half-open sentence spans in ``text``. + + This intentionally implements conservative splitting: common + abbreviations configured by WhisperX are not treated as sentence + boundaries, and punctuation must be followed by whitespace or end of + string before a split is emitted. + """ + + start = 0 + index = 0 + length = len(text) + while index < length: + char = text[index] + if char not in self._TERMINATORS or self._is_abbreviation(text, index): + index += 1 + continue + + next_index = index + 1 + while next_index < length and text[next_index] in { + '"', + "'", + ")", + "]", + "}", + "”", + "’", + }: + next_index += 1 + + if ( + next_index < length + and char in {".", "!", "?"} + and not text[next_index].isspace() + ): + index += 1 + continue + + end = next_index + while end < length and text[end].isspace(): + end += 1 + + yield (start, next_index) + start = end + index = end + + if start < length: + yield (start, length) + elif length == 0: + return + + def _is_abbreviation(self, text: str, dot_index: int) -> bool: + if text[dot_index] != ".": + return False + prefix = text[:dot_index] + match = re.search(r"([A-Za-z]+)$", prefix) + if not match: + return False + return match.group(1).lower() in self.params.abbrev_types diff --git a/app/requirements.txt b/app/requirements.txt index 80cf15e..d3765ad 100755 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -42,8 +42,9 @@ sqlite-vec>=0.1.0 # Docker installs whisperx==3.3.1 with --no-deps after this file is resolved. # WhisperX 3.3.1 still hard-pins faster-whisper==1.1.0 / ctranslate2<4.5.0, # which would replace the cuDNN9-compatible ASR stack above and look for -# cuDNN8 runtime libraries in the CUDA 12.4 + cuDNN9 base image. -nltk>=3.9,<4.0 +# cuDNN8 runtime libraries in the CUDA 12.4 + cuDNN9 base image. Its alignment +# module imports only a tiny Punkt sentence-span API; app/nltk provides that +# compatibility shim so the image does not need the full NLTK distribution. transformers>=5.0.0,<5.1.0 # Voice enhancement / noise reduction (optional — loaded only when DENOISE_MODEL != "none") deepfilternet>=0.5.6 diff --git a/doc/changelog.en.md b/doc/changelog.en.md index dc40464..5bf3d11 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -17,7 +17,8 @@ transitive dependencies. Docker installs WhisperX with `--no-deps` so its old ASR transitive dependencies cannot replace the current `numpy<2` / SciPy 1.11.x / cuDNN9 runtime baseline or trigger cuDNN8 library lookups; - the `nltk` package needed by alignment is installed explicitly. + the small Punkt sentence-span API used by alignment is provided by an + internal compatibility shim instead of pulling the full NLTK distribution. ### Reliability diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index f33bc1a..c542919 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -15,7 +15,8 @@ `ctranslate2>=4.7.1,<5.0`,同时收紧 `pyannote.*` 与 `pandas` 传递依赖边界。 Docker 会以 `--no-deps` 安装 WhisperX,避免其旧 ASR 传递依赖覆盖当前 `numpy<2` / SciPy 1.11.x / cuDNN9 运行时基线并在运行时查找 cuDNN8 动态库; - alignment 所需的 `nltk` 由运行时依赖显式安装。 + alignment 用到的少量 Punkt sentence-span API 由内部兼容 shim 提供,不再拉入完整 + NLTK 分发包。 ### 可靠性 diff --git a/tests/unit/test_dependency_runtime_baseline.py b/tests/unit/test_dependency_runtime_baseline.py index 657ea96..9bdc753 100644 --- a/tests/unit/test_dependency_runtime_baseline.py +++ b/tests/unit/test_dependency_runtime_baseline.py @@ -30,8 +30,10 @@ def test_faster_whisper_runtime_stays_on_cudnn9_compatible_ctranslate2(): def test_docker_installs_whisperx_without_replacing_asr_runtime_stack(): lines = _requirements_lines() dockerfile = _dockerfile_text() + root = Path(__file__).resolve().parents[2] - assert "nltk>=3.9,<4.0" in lines + assert not any(line.startswith("nltk") for line in lines) + assert (root / "app" / "nltk" / "tokenize" / "punkt.py").exists() assert "whisperx==3.3.1" not in lines assert ( 'pip install --no-cache-dir -i "$PIP_INDEX_URL" --no-deps whisperx==3.3.1' diff --git a/tests/unit/test_whisperx_punkt_shim.py b/tests/unit/test_whisperx_punkt_shim.py new file mode 100644 index 0000000..a43e793 --- /dev/null +++ b/tests/unit/test_whisperx_punkt_shim.py @@ -0,0 +1,26 @@ +from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer + + +def test_punkt_shim_keeps_common_abbreviations_with_sentence() -> None: + params = PunktParameters() + params.abbrev_types = {"dr", "vs", "mr", "mrs", "prof"} + tokenizer = PunktSentenceTokenizer(params) + + text = "Dr. Maple joined. The meeting ended?" + + spans = list(tokenizer.span_tokenize(text)) + + assert [text[start:end] for start, end in spans] == [ + "Dr. Maple joined.", + "The meeting ended?", + ] + + +def test_punkt_shim_supports_cjk_terminators() -> None: + tokenizer = PunktSentenceTokenizer(PunktParameters()) + + text = "第一句。第二句!" + + spans = list(tokenizer.span_tokenize(text)) + + assert [text[start:end] for start, end in spans] == ["第一句。", "第二句!"]