From 0db96d706f11bb7d848dae95d9ca81439d6e66a8 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Thu, 11 Jun 2026 09:33:58 +0100 Subject: [PATCH 1/4] feat(catalog): add DeepSeek V4 + GLM-5 frontier families; bump mlx-vlm + vllm Release upstream polish. Deps (loose floor bumps, no code change): - mlx-vlm 0.6.0 -> 0.6.3 - vllm 0.22.0 -> 0.22.1 ([vllm] + [triattention] extras) Discover catalog -- two frontier sparse-MoE families (text-only, verified HF repos + real on-disk sizes): - DeepSeek V4: Flash (284B / ~13B active, 1M ctx, baked-in MTP head) + Pro (1.6T). mlx-community 4-bit Flash (154 GB) is the local-viable entry; official BF16 + 8-bit + Pro listed for awareness. - GLM-5 / GLM-5.1: GlmMoeDsa MoE (256 experts / 8 active, ~200K ctx). unsloth GGUF (Q4_K_M ~515 GB) + mlx-community MXFP4 + zai-org BF16. Both text-only (configs carry no vision_config) so capabilities omit vision -- no broken composer affordance. Tests + gate: - tests/test_catalog_text_families.py: parse + required-field + text-only + discover-payload checks. - E2E phase 0 "new model families" check asserts both surface in the live /api/workspace catalog with their full variant set. Validated: phase 0 PASS, 11 checks. Tracked follow-ups (not in this change): MTPLX installer already auto-updates to v1.0.1 (re-test FU-079 empty-output vs its new /v1 streaming); dflash-mlx v0.1.9 migration stays deferred (FU-057); llama-cpp-turboquant branch drifted (FU-065 commit-pin needs a verified test-build). 
--- backend_service/catalog/text_models.py | 187 +++++++++++++++++++++++++ pyproject.toml | 6 +- scripts/e2e_test_suite.py | 20 +++ tests/test_catalog_text_families.py | 62 ++++++++ 4 files changed, 272 insertions(+), 3 deletions(-) create mode 100644 tests/test_catalog_text_families.py diff --git a/backend_service/catalog/text_models.py b/backend_service/catalog/text_models.py index 5fbb153..ed77948 100644 --- a/backend_service/catalog/text_models.py +++ b/backend_service/catalog/text_models.py @@ -881,6 +881,193 @@ "Co-developed with NVIDIA for efficient local deployment.", ], }, + { + # Frontier sparse-MoE family (DeepseekV4ForCausalLM, 256 routed experts + # / 6 active, 1M context via YaRN, baked-in MTP head -> speculative + # decoding). Text-only. Listed for discovery awareness — even the + # "small" Flash variant is 154 GB at 4-bit, so these target top-end + # desktops / workstations, not laptops. + "id": "deepseek-v4", + "name": "DeepSeek V4", + "provider": "DeepSeek", + "headline": "Frontier MoE reasoning + agentic coding; the Flash variant is the local-viable one.", + "summary": "DeepSeek V4 — Flash (284B / ~13B active) for top-end desktops, Pro (1.6T) for the frontier.", + "description": ( + "DeepSeek V4 is a sparse Mixture-of-Experts family (256 routed experts, ~6 active per token) " + "with 1M-token context via YaRN and a baked-in MTP head for speculative decoding. V4-Flash " + "activates ~13B of 284B total parameters; V4-Pro is the 1.6T flagship. Text-only, MIT-licensed." 
+ ), + "updatedLabel": "Released 2026", + "popularityLabel": "Frontier family", + "likesLabel": "DeepSeek official", + "badges": ["Reasoning", "Coding", "Agents", "Long context"], + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "defaultVariantId": "mlx-community/DeepSeek-V4-Flash-4bit", + "variants": [ + { + "id": "mlx-community/DeepSeek-V4-Flash-4bit", + "name": "DeepSeek V4 Flash MLX 4-bit", + "repo": "mlx-community/DeepSeek-V4-Flash-4bit", + "link": "https://huggingface.co/mlx-community/DeepSeek-V4-Flash-4bit", + "paramsB": 284.0, + "sizeGb": 154.0, + "format": "MLX", + "quantization": "4-bit", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "MoE 284B / ~13B active. 4-bit MLX needs ~160 GB unified memory (M3/M4 Ultra). MTP head enables speculative decoding.", + "contextWindow": "1M", + "launchMode": "direct", + "backend": "mlx", + "releaseDate": "2026-04", + }, + { + "id": "mlx-community/DeepSeek-V4-Flash-8bit", + "name": "DeepSeek V4 Flash MLX 8-bit", + "repo": "mlx-community/DeepSeek-V4-Flash-8bit", + "link": "https://huggingface.co/mlx-community/DeepSeek-V4-Flash-8bit", + "paramsB": 284.0, + "sizeGb": 284.0, + "format": "MLX", + "quantization": "8-bit", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "8-bit MLX conversion — higher fidelity, ~290 GB unified memory.", + "contextWindow": "1M", + "launchMode": "direct", + "backend": "mlx", + "releaseDate": "2026-04", + }, + { + "id": "deepseek-ai/DeepSeek-V4-Flash", + "name": "DeepSeek V4 Flash (BF16)", + "repo": "deepseek-ai/DeepSeek-V4-Flash", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash", + "paramsB": 284.0, + "sizeGb": 568.0, + "format": "Transformers", + "quantization": "BF16", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "Official BF16 weights — convert to MLX/GGUF locally or run on a multi-GPU box.", + "contextWindow": "1M", + "launchMode": "convert", + "backend": "mlx", + 
"releaseDate": "2026-04", + }, + { + "id": "deepseek-ai/DeepSeek-V4-Pro", + "name": "DeepSeek V4 Pro (frontier)", + "repo": "deepseek-ai/DeepSeek-V4-Pro", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro", + "paramsB": 1600.0, + "sizeGb": 3200.0, + "format": "Transformers", + "quantization": "BF16", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "1.6T flagship (~49B active). Frontier / awareness — needs a GPU cluster; not a local launch path.", + "contextWindow": "1M", + "launchMode": "convert", + "backend": "mlx", + "releaseDate": "2026-04", + }, + ], + "readme": [ + "DeepSeek V4 is a sparse-MoE family with 1M-token context and baked-in MTP heads for speculative decoding.", + "V4-Flash (284B / ~13B active) is the local-viable variant: the mlx-community 4-bit conversion is ~154 GB and runs on M3/M4 Ultra-class unified memory.", + "V4-Pro (1.6T) is listed for awareness; it targets multi-GPU clusters rather than a single desktop.", + ], + }, + { + # Frontier sparse-MoE family (GlmMoeDsa arch, 256 routed experts / 8 + # active, ~200K context). Text-only. Z.ai / Tsinghua. Listed for + # discovery awareness — even 4-bit GGUF is ~515 GB, so this is a + # cluster / very-high-end-workstation family, not a laptop one. + "id": "glm-5", + "name": "GLM-5", + "provider": "Z.ai", + "headline": "Z.ai / Tsinghua frontier MoE — agentic coding rivaling closed frontier models.", + "summary": "GLM-5 / GLM-5.1 sparse MoE (256 experts), ~200K context. Frontier-scale — top-end hardware only.", + "description": ( + "GLM-5 is a large sparse Mixture-of-Experts model (GlmMoeDsa architecture, 256 routed experts, " + "8 active per token) with ~200K context. GLM-5.1 is the refined release. Strong agentic coding " + "and reasoning. Text-only, open weights — frontier-scale, so even a 4-bit GGUF is ~500 GB." 
+ ), + "updatedLabel": "Released 2026", + "popularityLabel": "Frontier family", + "likesLabel": "Z.ai official", + "badges": ["Coding", "Reasoning", "Agents", "Long context"], + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "defaultVariantId": "unsloth/GLM-5.1-GGUF", + "variants": [ + { + "id": "unsloth/GLM-5.1-GGUF", + "name": "GLM-5.1 GGUF", + "repo": "unsloth/GLM-5.1-GGUF", + "link": "https://huggingface.co/unsloth/GLM-5.1-GGUF", + "paramsB": 735.0, + "sizeGb": 515.0, + "format": "GGUF", + "quantization": "Q4_K_M", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "Q4_K_M ~515 GB; the same repo ships smaller UD-IQ2 quants down to ~250 GB. Frontier-scale llama.cpp run.", + "contextWindow": "200K", + "launchMode": "direct", + "backend": "llama.cpp", + "releaseDate": "2026-05", + }, + { + "id": "mlx-community/GLM-5.1-MXFP4-Q8", + "name": "GLM-5.1 MLX MXFP4", + "repo": "mlx-community/GLM-5.1-MXFP4-Q8", + "link": "https://huggingface.co/mlx-community/GLM-5.1-MXFP4-Q8", + "paramsB": 735.0, + "sizeGb": 449.0, + "format": "MLX", + "quantization": "MXFP4", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "MXFP4 MoE quant for Apple Silicon — ~450 GB unified memory.", + "contextWindow": "200K", + "launchMode": "direct", + "backend": "mlx", + "releaseDate": "2026-05", + }, + { + "id": "zai-org/GLM-5.1", + "name": "GLM-5.1 (BF16)", + "repo": "zai-org/GLM-5.1", + "link": "https://huggingface.co/zai-org/GLM-5.1", + "paramsB": 735.0, + "sizeGb": 1507.0, + "format": "Transformers", + "quantization": "BF16", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "Official BF16 weights — convert / quantize locally or run on a multi-GPU box.", + "contextWindow": "200K", + "launchMode": "convert", + "backend": "mlx", + "releaseDate": "2026-05", + }, + { + "id": "zai-org/GLM-5", + "name": "GLM-5 (BF16)", + "repo": "zai-org/GLM-5", + "link": "https://huggingface.co/zai-org/GLM-5", + "paramsB": 
735.0, + "sizeGb": 1507.0, + "format": "Transformers", + "quantization": "BF16", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "Initial GLM-5 release; GLM-5.1 is the refined follow-up — prefer it unless reproducing a baseline.", + "contextWindow": "200K", + "launchMode": "convert", + "backend": "mlx", + "releaseDate": "2026-04", + }, + ], + "readme": [ + "GLM-5 is Z.ai / Tsinghua's frontier sparse-MoE family (GlmMoeDsa, 256 experts / 8 active), strong on agentic coding.", + "GLM-5.1 is the refined release; unsloth + mlx-community publish GGUF and MXFP4 quants.", + "Frontier-scale: a 4-bit GGUF is ~515 GB, so this family targets clusters and very-high-end workstations.", + ], + }, ] diff --git a/pyproject.toml b/pyproject.toml index c999578..623203a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,13 +35,13 @@ mlx-lm = [ # AutoProcessor); without it ``mlx_vlm.load`` raises ImportError on # the Qwen2.5-VL family during processor build. mlx-vlm = [ - "mlx-vlm>=0.6.0", + "mlx-vlm>=0.6.3", "torchvision>=0.20", ] -triattention = ["triattention @ git+https://github.com/WeianMao/triattention.git@c3744ee6a50522a1559a577f85aef2b165a344f2", "vllm>=0.22.0"] +triattention = ["triattention @ git+https://github.com/WeianMao/triattention.git@c3744ee6a50522a1559a577f85aef2b165a344f2", "vllm>=0.22.1"] triattention-mlx = ["triattention @ git+https://github.com/WeianMao/triattention.git@c3744ee6a50522a1559a577f85aef2b165a344f2", "mlx-lm>=0.22.0"] turboquant = ["turboquant-mlx-full>=0.6.2"] -vllm = ["vllm>=0.22.0"] +vllm = ["vllm>=0.22.1"] dflash-mlx = ["dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@fada1eb2b75cd1c875ca6547b6518783fd3d2956"] dflash = ["dflash>=0.1.0"] desktop = [ diff --git a/scripts/e2e_test_suite.py b/scripts/e2e_test_suite.py index 7c03118..6a962c4 100755 --- a/scripts/e2e_test_suite.py +++ b/scripts/e2e_test_suite.py @@ -295,6 +295,25 @@ def _resolve_hf_guard(): ok = ("owner/name" in blob) or ("400" in blob) return 
("pass" if ok else "fail"), ("" if ok else f"unexpected: {err[:160]}"), {} + # New-feature gate for the frontier families added this release. Asserts + # they surface in the live Discover catalog (/api/workspace) with their + # full variant set — a shape check, no model load (these are 150 GB+). + def _new_model_families(): + rc, payload, err = _cli_json("call", "GET", "/api/workspace", timeout=15.0) + if rc != 0 or not isinstance(payload, dict): + return "fail", f"workspace fetch failed: {err[:160]}", {} + fams = {f.get("id"): f for f in (payload.get("featuredModels") or [])} + missing = [] + for fid in ("deepseek-v4", "glm-5"): + fam = fams.get(fid) + if fam is None: + missing.append(f"{fid}: absent") + elif len(fam.get("variants") or []) < 4: + missing.append(f"{fid}: only {len(fam.get('variants') or [])} variants") + if missing: + return "fail", "; ".join(missing)[:200], {"missing": missing} + return "pass", "", {"families": ["deepseek-v4", "glm-5"]} + for name, fn in [ ("health", _health), ("routes", _routes), ("gpu-status", _gpu), ("mtplx-status", _mtplx), ("inventory", _inventory), @@ -303,6 +322,7 @@ def _resolve_hf_guard(): ("ollama-compat (#3)", _ollama_compat), ("model import scan (#4)", _model_import_scan), ("run-from-hf guard (#5)", _resolve_hf_guard), + ("new model families (DeepSeek V4 / GLM-5)", _new_model_families), ]: phase.checks.append(_check(name, fn)) phase.status = "fail" if any(c.status == "fail" for c in phase.checks) else "pass" diff --git a/tests/test_catalog_text_families.py b/tests/test_catalog_text_families.py new file mode 100644 index 0000000..be3e445 --- /dev/null +++ b/tests/test_catalog_text_families.py @@ -0,0 +1,62 @@ +"""Catalog gate for the frontier text families added for the release +(DeepSeek V4, GLM-5). Asserts they parse, carry every field the discover +payload builder reads, and surface in the family payloads — so a malformed +entry can't ship a broken Discover tab. 
+""" + +import unittest + +from backend_service.catalog.text_models import MODEL_FAMILIES + +_REQUIRED_FAMILY_FIELDS = { + "id", "name", "provider", "headline", "summary", "description", + "updatedLabel", "popularityLabel", "likesLabel", "badges", "capabilities", + "defaultVariantId", "variants", "readme", +} +_REQUIRED_VARIANT_FIELDS = { + "id", "name", "repo", "link", "paramsB", "sizeGb", "format", + "quantization", "capabilities", "note", "contextWindow", "launchMode", "backend", +} + + +class NewTextFamiliesTests(unittest.TestCase): + def setUp(self): + self.by_id = {f["id"]: f for f in MODEL_FAMILIES} + + def test_deepseek_v4_and_glm5_present(self): + self.assertIn("deepseek-v4", self.by_id) + self.assertIn("glm-5", self.by_id) + + def test_new_families_have_required_shape(self): + for fid in ("deepseek-v4", "glm-5"): + fam = self.by_id[fid] + self.assertEqual(_REQUIRED_FAMILY_FIELDS - set(fam), set(), f"{fid} family fields") + self.assertTrue(fam["variants"], f"{fid} has variants") + variant_ids = [v["id"] for v in fam["variants"]] + self.assertIn(fam["defaultVariantId"], variant_ids, f"{fid} default variant valid") + for v in fam["variants"]: + self.assertEqual(_REQUIRED_VARIANT_FIELDS - set(v), set(), f"{fid}/{v['id']} variant fields") + self.assertEqual(v["link"], f"https://huggingface.co/{v['repo']}", f"{fid}/{v['id']} link") + self.assertIn(v["backend"], ("mlx", "llama.cpp", "vllm")) + self.assertIn(v["launchMode"], ("direct", "convert")) + + def test_new_families_are_text_only_no_vision(self): + # DeepSeek V4 + GLM-5 configs carry no vision_config — the catalog + # must not advertise vision (would render a broken composer affordance). 
+ for fid in ("deepseek-v4", "glm-5"): + fam = self.by_id[fid] + self.assertNotIn("vision", fam["capabilities"], f"{fid} family vision tag") + for v in fam["variants"]: + self.assertNotIn("vision", v["capabilities"], f"{fid}/{v['id']} vision tag") + + def test_new_families_surface_in_discover_payloads(self): + from backend_service.helpers.discovery import _model_family_payloads + + payloads = _model_family_payloads({"totalMemoryGb": 64, "availableMemoryGb": 32}, []) + ids = {p.get("id") for p in payloads} + self.assertIn("deepseek-v4", ids) + self.assertIn("glm-5", ids) + + +if __name__ == "__main__": + unittest.main() From c11aac02f54494d26a6535c83cc81cfaeb508203 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Thu, 11 Jun 2026 09:38:55 +0100 Subject: [PATCH 2/4] docs(follow-ups): record 2026-06-11 upstream scan in FU-065/067/079 - FU-065: turbo branch drifted 2cbfdc62 -> 73eb521d (reproducibility risk confirmed; pin still deferred pending a verified test-compile). - FU-079: MTPLX hit v1.0.0/v1.0.1 (installer auto-updates from 0.3.5); v1.0.0 added real /v1 token streaming -> re-test the empty-output against v1.0.1. - FU-067: dflash-mlx v0.1.9 now tagged; FU-057 migration stays deferred. --- CLAUDE.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 061c310..e3f30ec 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -182,7 +182,7 @@ no longer relevant. | ~~FU-062~~ | ~~Bump `turboquant-mlx-full` floor `>=0.3.0` → `>=0.4.0`~~ | **Shipped 2026-05-25 (v0.9.3).** | Upstream `turboquant-mlx-full` 0.4.1 on PyPI (installed was 0.3.0, FU-001 pin). v0.4.0 added **expert streaming** — pages router-selected MoE experts from disk per token, runs models whose weights exceed available RAM. Live-validated upstream against `Qwen3.6-35B-A3B` (35B sparse) on a 16 GB Mac mini in under 4 GB RAM, output bit-identical to fully-resident model. 
Compounds with our existing Hadamard rotation + Lloyd-Max codebook K/V compression. Floor bump only — no API changes required, runtime continues to call `TurboQuantKVCache` with the same signature. Pin lives in [pyproject.toml](pyproject.toml) `[turboquant]` extra. Apple Silicon only (CUDA users stay on the `llama-server-turbo` binary path via FU-001's parallel track). | | ~~FU-063~~ | ~~Bump `mlx-vlm` floor `>=0.4.0` → `>=0.5.0`~~ | **Shipped 2026-05-25 (v0.9.3).** | Upstream `mlx-vlm` 0.5.0 on PyPI (installed was 0.4.4). Minor bump, no API breakage at our call surface (`mlx_vlm.load` + `mlx_vlm.generate` from [mlx_worker_multimodal.py](backend_service/mlx_worker_multimodal.py)). Floor bump in [pyproject.toml](pyproject.toml) `[mlx-vlm]` extra; loose `>=` semantics mean existing 0.4.x installs are still satisfied locally, but fresh installs pick up the newer wheel which carries the upstream Qwen3.5-VL + GLM-4.5V fixes. | | ~~FU-064~~ | ~~Add `ggml-org/Qwen3.6-{27B,35B-A3B}-GGUF` non-MTP catalog rows~~ | **Shipped 2026-05-25 (v0.9.3).** | ggml-org published canonical Q8_0 non-MTP companion packs on 2026-05-22 alongside the MTP variants we wired in FU-047. Two new rows in [text_models.py](backend_service/catalog/text_models.py) `qwen-3-6` family: `ggml-org/Qwen3.6-27B-GGUF` (Q8_0, 29 GB, dense) + `ggml-org/Qwen3.6-35B-A3B-GGUF` (Q8_0, 37 GB, MoE). Catalog note steers users at the MTP siblings when they want spec-dec. No runtime changes — direct `llama.cpp` lane, same as the lmstudio-community Q4_K_M variants already shipping. | -| FU-065 | Pin `llama-cpp-turboquant` to a commit hash instead of branch HEAD | Trigger: any user-reported build divergence between two install runs, OR a release-build gate where reproducibility matters more than tracking upstream. 
| [scripts/build-llama-turbo.sh](scripts/build-llama-turbo.sh) + [scripts/update-llama-turbo.sh](scripts/update-llama-turbo.sh) currently clone `TheTom/llama-cpp-turboquant` at branch `feature/turboquant-kv-cache` (`LLAMA_TURBO_BRANCH` env var), then `git reset --hard origin/$TURBO_BRANCH`. Two installs at different times can ship different binaries — the same drift problem FU-033 fixed for `dflash-mlx`. Today's branch HEAD is `2cbfdc62a1a047b01377948dfdede8cb6a744866`. Plan: add `LLAMA_TURBO_COMMIT="${LLAMA_TURBO_COMMIT:-2cbfdc62...}"` to both scripts, `git checkout "$LLAMA_TURBO_COMMIT"` after fetch, surface the hash in `llama-server-turbo.version`, and add a sync-assert to `pre-build-check` that compares the build-script pin to a value in [pyproject.toml](pyproject.toml) or a dedicated `UPSTREAM_PINS.md`. Defer because (a) branch is single-purpose with low churn — author is the same TheTom we already trust for `turboquant_plus`; (b) we already have the v0.9.2 → v0.9.3 release with this code path working. | +| FU-065 | Pin `llama-cpp-turboquant` to a commit hash instead of branch HEAD | Trigger: any user-reported build divergence between two install runs, OR a release-build gate where reproducibility matters more than tracking upstream. | [scripts/build-llama-turbo.sh](scripts/build-llama-turbo.sh) + [scripts/update-llama-turbo.sh](scripts/update-llama-turbo.sh) currently clone `TheTom/llama-cpp-turboquant` at branch `feature/turboquant-kv-cache` (`LLAMA_TURBO_BRANCH` env var), then `git reset --hard origin/$TURBO_BRANCH`. Two installs at different times can ship different binaries — the same drift problem FU-033 fixed for `dflash-mlx`. Today's branch HEAD is `2cbfdc62a1a047b01377948dfdede8cb6a744866`. 
Plan: add `LLAMA_TURBO_COMMIT="${LLAMA_TURBO_COMMIT:-2cbfdc62...}"` to both scripts, `git checkout "$LLAMA_TURBO_COMMIT"` after fetch, surface the hash in `llama-server-turbo.version`, and add a sync-assert to `pre-build-check` that compares the build-script pin to a value in [pyproject.toml](pyproject.toml) or a dedicated `UPSTREAM_PINS.md`. Defer because (a) branch is single-purpose with low churn — author is the same TheTom we already trust for `turboquant_plus`; (b) we already have the v0.9.2 → v0.9.3 release with this code path working. **2026-06-11 release scan:** branch HEAD has drifted `2cbfdc62…` → `73eb521daebc85da7c91d37178940b99a5524cf6` — confirms the reproducibility risk this row tracks. Pin still deferred: pinning the *drifted* `73eb521d` is unsafe without a verified test-compile (could ship a broken turbo binary), and pinning back to the known-good `2cbfdc62` drops upstream work. When picked up, pin to a commit that's been build-tested on the M4 Max box. | | FU-066 | Audit `cache-strategy-matrix` runner against bumped `turboquant-mlx-full` 0.4.x | When FU-062's bump lands in CI or when a user reports a TurboQuant regression. | The runner's TurboQuant cell (`mlx-community/Qwen3-0.6B-4bit × cacheStrategy=turboquant cacheBits=3`) passed against 0.3.0 with output hash `b4337bc07457` (FU-051 evidence). 0.4.x's expert-streaming code path is a no-op for dense 0.6B but flips on for MoE models like `mlx-community/Qwen3.6-35B-A3B-4bit` — worth a one-time live capture of an MoE turboquant cell against the 0.4.x wheel to lock in a baseline hash. No code changes; just record the number once the bumped wheel is installed on the M4 Max box. | | ~~FU-072~~ | ~~Restore `vision` capability to Qwen3.5 + Qwen3.6 families (reverse FU-040)~~ | **Shipped 2026-05-28.** | FU-040 (2026-05-10) removed `vision` from Qwen3.6-27B + family, asserting the dense model was text-only with vision on "a separate `Qwen3.6-27B-VL` we don't ship."
Re-checking upstream on 2026-05-28: **every** Qwen3.5/3.6 `config.json` now ships `architectures: [Qwen3_5ForConditionalGeneration]` / `[Qwen3_5MoeForConditionalGeneration]` with `vision_config` + `image_token_id` + `vision_start/end_token_id` — the base models are natively multimodal. `mlx-vlm` ships `qwen3_5` + `qwen3_5_moe` model support, and the `ggml-org/*-GGUF` packs include an `mmproj-*.gguf` sibling (auto-wired by `llama_cpp_engine._resolve_mmproj_path` → `--mmproj`). The catalog was also internally inconsistent (Qwen3.5-9B tagged vision, Qwen3.5-4B not, same arch). Re-added `vision` across both families in [text_models.py](backend_service/catalog/text_models.py): qwen-3-6 family-level + all 11 variants; qwen-3-5 family-level + `Qwen3.5-4B` (vision+video, matching its 9B sibling) + `lmstudio-community/Qwen3.5-9B-GGUF`. **Safety net (why this can't resurrect the FU-040 broken-button bug):** the composer "Attach image" affordance ([ChatComposer.tsx:129](src/features/chat/ChatComposer.tsx)) reads the *runtime* `supportsVision`, which [catalog/capabilities.py](backend_service/catalog/capabilities.py) demotes to False for the MLX worker (carries no images today) and gates on actual `--mmproj` resolution for GGUF ([llama_cpp_engine.py:737](backend_service/inference/llama_cpp_engine.py) `visionEnabled=attempt_mmproj_path is not None`). So the catalog `vision` tag now drives only the variant-picker / discover badges (capability-in-principle), while the functional button stays runtime-accurate. `gemma-4` was already correctly vision-tagged (mlx-vlm `gemma4` support) — left untouched. Catalog parses + `test_capabilities` / `test_mmproj_vision` green. 
| | ~~FU-075~~ | ~~MLX spec-dec silently broken — stale `configure_full_attention_split` import~~ | **Shipped 2026-05-29.** | **Highest-impact bug this sweep.** Inspecting the matrix runtimeNotes (not just pass/fail) revealed the MLX DFlash / DDTree / MTPLX cells were *passing the weak non-empty-output check while NOT actually running spec-dec* — `actual_strategy: native`, note `dflash-mlx could not be imported (cannot import name 'configure_full_attention_split' from 'dflash_mlx.runtime')`. Root cause: dflash-mlx 0.1.5 moved the pre-0.1.5 top-level `configure_full_attention_split` onto the per-family `target_ops` adapter (the FU-006 migration that rewrote `ddtree.py` — but [mlx_worker_lifecycle.py:153](backend_service/mlx_worker_lifecycle.py) was missed). Python evaluates the whole `from … import a, b` line, so the failed `configure_full_attention_split` symbol killed the co-imported `load_draft_bundle` too → `_dflash_generator` never loaded → **every** MLX spec-dec path fell back to standard generation for all users. Fix: import `load_draft_bundle` + `resolve_target_ops` (both still top-level), resolve the adapter, and call `target_ops.configure_full_attention_split(...)` only for the `hybrid_gdn` family (it's a no-op for pure-attention Qwen3/3.5/3.6 — upstream only calls it there). Live-verified after fix: DFlash note "DFLASH speculative decoding active (draft: z-lab/Qwen3-4B-DFlash-b16)", DDTree "DDTree active (budget=16)". | @@ -190,14 +190,14 @@ no longer relevant. 
| ~~FU-077~~ | ~~MTPLX isolated venv had a truncated install (missing server deps)~~ | **Shipped 2026-05-29.** | After FU-076 routed correctly, `MtplxEngine` startup died: `ModuleNotFoundError: No module named 'numpy'` — and then `safetensors`, `uvicorn`, `fastapi`, `pydantic`, `mlx-lm`, `rich`… The `~/.chaosengine/mtplx-venv` was a *truncated* install (interrupted `pip install mtplx`), but the installer's verify only ran `import mtplx`, which succeeds because the server deps are imported lazily by `mtplx.server.openai` (not at package top level). Fixed the live venv with a full `pip install --upgrade mtplx` (0.3.5 → 0.3.7, pulled all deps). Hardened [scripts/install-mtplx.sh](scripts/install-mtplx.sh): the verify now imports `mtplx.server.openai` (the real server entrypoint) and auto-retries a full dependency install once before failing loudly, so a truncated install can't pass silently again. | | ~~FU-078~~ | ~~MtplxEngine handed MTPLX a bare repo id instead of the local snapshot path~~ | **Shipped 2026-05-29.** | Final MTPLX blocker: `mtplx quickstart` died with "model is not available locally. Run: mtplx pull Qwen/Qwen3.5-4B" — it resolves a model *id* against its own registry/cache, not the HF hub cache. [mtplx_engine.py](backend_service/inference/mtplx_engine.py) set `model_arg = path or runtime_target or model_ref`, and for raw HF-org repos `path` is None while `runtime_target` is the *repo id* (`Qwen/Qwen3.5-4B`), so MTPLX got an id it couldn't find. Fix: whenever the candidate isn't an existing local directory, resolve the already-downloaded HF snapshot dir via `snapshot_download(model_ref, local_files_only=True)` (no network) and pass that. Live-verified: MTPLX now **loads + engages** (note "MTPLX MTP speculative decoding active (draft tokens: 1, model: Qwen3.5-4B)", reports 17.8 tok/s) instead of failing to start. 
Also fixed the matrix runner's `0.0 tok/s` (read `done.assistant.metrics.tokS`, not a non-existent top-level `tokensPerSecond`) + captured `dflashAcceptanceRate`. **Verified-genuine after these fixes: DFlash (33.2 tok/s), DDTree (31.4 tok/s), GGUF-MTP (14.7 tok/s), turboquant MLX/GGUF, triattention, native** — all stream real output with real throughput. MTPLX still has one remaining issue → FU-079. | | ~~FU-080~~ | ~~Backend cold start dragged in torch via cache-strategy availability probes~~ | **Shipped 2026-05-29.** | `python -X importtime backend_service.app` measured **2.6 s**, of which **1.64 s was `diffusers.hooks`** (→ `torch` → `torch._dynamo` → `sympy`) — blowing the CLAUDE.md "< 2 s backend startup" target. Traced the chain: state init → system snapshot → `_get_cache_strategies()` → `registry.available()` instantiates every strategy and calls `is_available()`, and the 5 diffusion strategies (fbcache / taylorseer / magcache / pab / fastercache) answered availability by **actually importing `diffusers.hooks`** — pulling the whole torch stack onto the cold-start path on every launch. Fix: new [cache_compression/_diffusers_probe.py](cache_compression/_diffusers_probe.py) `diffusers_at_least(major, minor)` reads the installed version via `importlib.metadata.version` (metadata only — never executes `diffusers.__init__`, so no torch). Each `is_available()` now gates on the version (fbcache ≥0.36, the other four ≥0.38); the real `diffusers.hooks` import stays lazy inside each `apply_*` method (still raises a clean NotImplementedError on a broken install). Result: `diffusers` / `torch` / `mlx` are **no longer in `sys.modules` after `import backend_service.app`**, import time dropped **2.6 s → ~0.85 s**, and cold-start → first `/api/health` 200 is **2.34 s** (the native-backend MLX subprocess probe was already async — "detection still running" on first health, never blocked startup). 
Two subprocess-isolated regression guards in [tests/test_cache_strategies.py](tests/test_cache_strategies.py) (`StartupImportPurityTests`) assert neither `registry.available()` nor `import backend_service.app` pulls torch/diffusers, so this can't silently regress. All 5 diffusion strategies still report `available=True` against the installed diffusers 0.38. | -| FU-079 | MTPLX proxy doesn't surface incremental tokens to the chat stream (empty output) | Active — MTPLX-specific, lower priority (FU-048: MTPLX is ~flat-to-slower vs the alternatives, which all work). | After FU-075–078, the matrix MTPLX cell flipped from "fake pass via DFlash fallback" to **engine genuinely engaged but `FAIL — empty output`**: the loaded-model note confirms "MTPLX MTP active (draft tokens: 1)" and the done event carries a real `tokS` (17.8), but the streamed assistant text is empty (output SHA `e3b0c44298fc` = the empty-string hash). Confirmed the chat stream's incremental token field IS `{"token": "..."}` (DFlash/DDTree/GGUF-MTP/native all stream through it fine on the same `/api/chat/generate/stream` endpoint), so the gap is in `MtplxEngine`'s OpenAI-`/v1`-proxy → SSE adapter: it surfaces final metrics but not per-token deltas, leaving `full_text` empty for both the matrix runner AND the real Chat UI. Plan: inspect `MtplxEngine.generate` / its streaming proxy in [mtplx_engine.py](backend_service/inference/mtplx_engine.py), map the mtplx server's `/v1/chat/completions` SSE `choices[].delta.content` chunks onto our `{"token": ...}` event shape. Until fixed, MTPLX loads but produces no visible output — DFlash is the working MLX spec-dec lane for the same models (and faster per FU-048). | +| FU-079 | MTPLX proxy doesn't surface incremental tokens to the chat stream (empty output) | Active — MTPLX-specific, lower priority (FU-048: MTPLX is ~flat-to-slower vs the alternatives, which all work). 
| After FU-075–078, the matrix MTPLX cell flipped from "fake pass via DFlash fallback" to **engine genuinely engaged but `FAIL — empty output`**: the loaded-model note confirms "MTPLX MTP active (draft tokens: 1)" and the done event carries a real `tokS` (17.8), but the streamed assistant text is empty (output SHA `e3b0c44298fc` = the empty-string hash). Confirmed the chat stream's incremental token field IS `{"token": "..."}` (DFlash/DDTree/GGUF-MTP/native all stream through it fine on the same `/api/chat/generate/stream` endpoint), so the gap is in `MtplxEngine`'s OpenAI-`/v1`-proxy → SSE adapter: it surfaces final metrics but not per-token deltas, leaving `full_text` empty for both the matrix runner AND the real Chat UI. Plan: inspect `MtplxEngine.generate` / its streaming proxy in [mtplx_engine.py](backend_service/inference/mtplx_engine.py), map the mtplx server's `/v1/chat/completions` SSE `choices[].delta.content` chunks onto our `{"token": ...}` event shape. Until fixed, MTPLX loads but produces no visible output — DFlash is the working MLX spec-dec lane for the same models (and faster per FU-048). **2026-06-11 release scan:** MTPLX reached **v1.0.0 + v1.0.1** (PyPI; was 0.3.5 on this box). The installer ([scripts/install-mtplx.sh](scripts/install-mtplx.sh)) is unpinned (`pip install --upgrade mtplx`), so a fresh install now auto-pulls v1.0.1 — no code change needed. v1.0.0 release notes claim `/v1/completions` now "streams tokens as they are generated, with real finish reasons and usage", which **may resolve this empty-output** at the source. Still HTTP-server-only (the FU-048 in-process-API root persists). **Action: re-test FU-079 against v1.0.1 with a live MTPLX run** (reinstall the mtplx venv → load an MTP model → confirm the chat stream surfaces per-token `{"token": …}` deltas). If v1.0.0's streaming fixed it, this row closes with no adapter change. 
| | ~~FU-074~~ | ~~GGUF MTP speculative decoding had no UI toggle~~ | **Shipped 2026-05-28.** | FU-047 wired the GGUF MTP backend (`--spec-type draft-mtp`, gated on the `speculativeDecoding` request flag in [llama_cpp_engine.py:531](backend_service/inference/llama_cpp_engine.py)) + the `ggufMtpAvailable` capability flag, but never surfaced a UI control. The launch modal's only spec-dec toggles are DFlash (hidden for GGUF — "not supported with llama.cpp models") and MTPLX (Apple-Silicon MLX only), so a user loading `ggml-org/Qwen3.6-27B-MTP-GGUF` had **no way to enable** the lane — only the matrix runner could, by POSTing `speculativeDecoding=true` directly. The button audit (this turn) caught it. Added an `isMtpGgufRepo(repo)` helper in [runtimeSupport.ts](src/components/runtimeSupport.ts) (mirrors backend `is_mtp_gguf_repo`: MTP-flavoured name on a GGUF repo) + a "GGUF MTP" toggle in [RuntimeControls.tsx](src/components/RuntimeControls.tsx), shown only when `isGgufBackend && isMtpGgufRepo(selectedCanonicalRepo)` (FU-034 hide-when-not-applicable). It binds to the same `speculativeDecoding` flag the backend reads; no cache-strategy lock (GGUF KV cache is orthogonal to MTP draft decode, unlike MLX DFlash which forces native). Also patched the DFlash-availability reset effect (was clearing `speculativeDecoding` for any non-DFlash model — would have instantly un-ticked the GGUF-MTP box) to keep it on for `ggufMtpModelSupported`. Old binaries without `--spec-type` fall back to standard decode + a runtimeNote (backend FU-047 path) — acceptable since the bundled llama-server is current; a future refinement could additionally gate the toggle on the `ggufMtpAvailable` capability for old-binary boxes (needs the flag threaded through the ~8 RuntimeControls call sites). 8 new `isMtpGgufRepo` unit tests in [runtimeSupport.test.ts](src/components/__tests__/runtimeSupport.test.ts). Verified live: matrix `gguf MTP (Qwen3.6-27B)` cell PASS (sha 74a1eca8b3b4). 
| | ~~FU-073~~ | ~~Matrix MTPLX cell targeted a non-MTP VL model~~ | **Shipped 2026-05-28.** | `scripts/cache-strategy-matrix.py` `MID_MLX_MTPLX_CAPABLE` was `mlx-community/Qwen3.5-4B-bf16` — a VL conversion (ships `video_preprocessor_config.json`) that carries no MTP heads and is absent from both `MTP_MODEL_MAP` and `_MTP_ALIASES`, so the MTPLX cell could never have exercised MTP even with the model on disk (it'd fail the `has_mtp_heads_strict` tensor probe). Switched to the canonical `Qwen/Qwen3.5-4B`, which is a direct `MTP_MODEL_MAP` key (verified `mtp.layers.*` + `mtp.fc.weight` in its safetensors index), a catalog variant (so it passes the `library_refs` check), and downloaded to exercise the lane. Pairs with the FU-070 download-skip classifier so the cell reports honestly on boxes without the model. | | ~~FU-071~~ | ~~DDTree availability probe checks pre-0.1.5 symbol names~~ | **Shipped 2026-05-28.** | The cache-strategy matrix `ddtree spec-dec` cell skipped with *DDTree runtime not available* even though `dflash_mlx` 0.1.5.1 is installed and `backend_service/ddtree.py` works. Root cause: `dflash.is_ddtree_available()` ([dflash/__init__.py](dflash/__init__.py)) source-greps the installed `dflash_mlx.runtime` for three required symbols and the list was stale — it required `target_forward_with_hidden_states`, which dflash-mlx 0.1.5 **renamed** to the per-family adapter `target_ops.forward_with_hidden_capture` (the same FU-006 migration that rewrote our `ddtree.py` to call `resolve_target_ops(target_model)`). The probe was never updated alongside that rewrite, so it required a symbol that (a) no longer exists in any modern dflash-mlx build (`grep -c` = 0 in the installed `runtime.py`) and (b) our own code no longer uses. Confirmed the real contract our DDTree path imports: `resolve_target_ops` (ddtree.py adapter entry), `load_draft_bundle` (worker lifecycle), `stream_dflash_generate` (speculative). 
Updated `required_symbols` to those three; dropped the obsolete name + the unused `load_target_bundle`. `dflash.is_ddtree_available()` now returns `True` on this M4 Max box. 4 new `DDTreeAvailabilityProbeTests` in [tests/test_dflash.py](tests/test_dflash.py) mock the runtime source so a future rename can't silently regress the probe again. Note: when FU-057 bumps dflash-mlx to 0.1.7 (which removes `configure_full_attention_split` and reshapes `stream_dflash_generate`), this probe + the lifecycle import need re-checking in lockstep. | | ~~FU-070~~ | ~~Matrix runner: classify missing-download as SKIP, not FAIL~~ | **Shipped 2026-05-28.** | The full `scripts/cache-strategy-matrix.py` sweep on 2026-05-28 reported the `gguf MTP (Qwen3.6-27B)` cell as **FAIL** — `POST /api/models/load -> 500: Cannot load 'ggml-org/Qwen3.6-27B-MTP-GGUF': No .gguf, .safetensors, or pytorch weights found in HF cache entry.` Root cause: the repo had an empty `~/.cache/huggingface/hub/models--ggml-org--Qwen3.6-27B-MTP-GGUF/` dir (4.0 KB, only `refs/main`, dated May 16 — an interrupted pull), and the runner's `skip_reason` library check uses `caps.library_refs`, which is built from the **catalog** (every variant repo from `/api/workspace`), not from what's actually downloaded. So a catalogued-but-undownloaded model passes the library check and only errors at load — reported as a product FAIL when it's really a missing download (same false-positive class as FU-053). Fix: new pure helper `classify_load_skip(msg)` in [scripts/cache-strategy-matrix.py](scripts/cache-strategy-matrix.py) matches the backend's 'no weights found in HF cache entry' markers; `run_cell` now wraps the load call separately and converts that specific error into `skipped=True, skip_reason="weights not downloaded ()"` instead of a failure. Genuine load errors (OOM, etc.) still surface as fails. 
4 unit tests in [tests/test_cache_strategy_matrix_runner.py](tests/test_cache_strategy_matrix_runner.py) (`ClassifyLoadSkipTests`) pin the classification. The dflash/mtplx cells already skipped correctly because their target models (`mlx-community/Qwen3-4B-bf16` / `Qwen3.5-4B-bf16`) aren't catalog variants so they never entered `library_refs`. **To actually exercise the GGUF-MTP lane (FU-047/FU-052 trip-wire), download `ggml-org/Qwen3.6-27B-MTP-GGUF` first**, then re-run full. | | ~~FU-069~~ | ~~Bump `turboquant-mlx-full` floor `>=0.4.0` → `>=0.5.0`~~ | **Shipped 2026-05-28.** | Upstream `turboquant-mlx-full` 0.5.0 on PyPI (FU-062 had just floored at 0.4.0 on 2026-05-25). v0.5.0 builds on the v0.4.0 expert-streaming path (FU-062) with **parallel expert prefetch** — the missing MoE experts for each layer are read on a thread pool (`--prefetch-workers`, default `8`) so SSD latency hides behind compute. Upstream-reported **~1.9× faster decode** at a tight cache budget, still bit-identical output. `--prefetch-workers 1` restores the serial v0.4.0 behaviour. No API change at our call surface — runtime still constructs `TurboQuantKVCache` with the same signature; the new flag is converter/runtime-side. Floor bump only in [pyproject.toml](pyproject.toml) `[turboquant]` extra; loose `>=` so existing 0.4.x installs stay satisfied locally. Apple Silicon only. Folds in the spirit of FU-066 (the matrix MoE-turboquant baseline should be captured against 0.5.0 once the wheel is installed on the M4 Max box). | | ~~FU-068~~ | ~~MLX probe timeout 12 s → 20 s~~ | **Shipped 2026-05-25 (v0.9.3).** | E2E full-sweep Phase 1 surfaced three intermittent fails on a freshly-booted backend — `MLX native cache` / `MLX TurboQuant cache` / `fused attention flag` all returned `MLX backend requested but unavailable: ...mlx_worker probe timed out after 12.0 seconds`. 
Measured cold-start: `time .venv/bin/python -m backend_service.mlx_worker probe` = **12.43 s** on M4 Max / Python 3.11 against current `mlx 0.31.2` + `mlx-lm 0.31.3` + `mlx-vlm 0.4.4` — 0.4 s past the 12.0 s ceiling. The 12.0 s value was an arbitrary default from the v0.8.0 `capabilities.py` extract (commit `f91709e`), never tuned. Bumped to **20.0 s** in [backend_service/inference/capabilities.py](backend_service/inference/capabilities.py) `_probe_native_backends` — ~60% headroom over today's envelope. Phase 5 video gen + Phase 1 GGUF / DFlash / cache-preview already passed (proves MLX itself works once the probe lands), so this was a pure cold-boot probe timing issue, not a regression from the FU-062 / FU-063 floor bumps (which are loose `>=`, no installed package changed). | -| FU-067 | Watch dflash-mlx for v0.1.8+ migration guide (FU-057 is multi-hour, deferred) | Trigger: (a) upstream publishes v0.1.8 with a stability commitment + migration guide, OR (b) we hit a concrete user-visible bug on the orphan `fada1eb` pin, OR (c) a shipped catalog model needs a v0.1.6+ feature (adaptive verify / Gemma4 backend / Qwen3-Next GDN). | Dup of FU-057's trigger but resurfaced after the v0.9.3 upstream scan confirmed v0.1.7 is now on PyPI (`pip install dflash-mlx==0.1.7` resolves) and tagged at commit `210a0fc1`. Plan-of-record stays FU-057's six-step migration. Re-checking quarterly via `git ls-remote --tags` for `v0.1.8` / `v0.2.0` release tags — if upstream publishes a migration guide alongside, the cost drops dramatically. | +| FU-067 | Watch dflash-mlx for v0.1.8+ migration guide (FU-057 is multi-hour, deferred) | Trigger: (a) upstream publishes v0.1.8 with a stability commitment + migration guide, OR (b) we hit a concrete user-visible bug on the orphan `fada1eb` pin, OR (c) a shipped catalog model needs a v0.1.6+ feature (adaptive verify / Gemma4 backend / Qwen3-Next GDN). 
| Dup of FU-057's trigger but resurfaced after the v0.9.3 upstream scan confirmed v0.1.7 is now on PyPI (`pip install dflash-mlx==0.1.7` resolves) and tagged at commit `210a0fc1`. Plan-of-record stays FU-057's six-step migration. Re-checking quarterly via `git ls-remote --tags` for `v0.1.8` / `v0.2.0` release tags — if upstream publishes a migration guide alongside, the cost drops dramatically. **2026-06-11 release scan:** **v0.1.9** is now tagged (branch HEAD `7f884380`; tags `v0.1.5.1…v0.1.9`). Still no published migration guide, so FU-057's six-step rewrite stays the plan of record and remains deferred. Newest migration target is now v0.1.9 (was v0.1.7/v0.1.8). | | ~~FU-061~~ | ~~"Watching upstream" badge + disabled download for tracked-only image seeds~~ | **Shipped 2026-05-18.** | User-reported gap: downloaded `baidu/ERNIE-Image-Turbo` from Image Discover (it sits in `LATEST_IMAGE_TRACKED_SEEDS`), expected it in the Studio dropdown, didn't appear. Root cause: tracked seeds are discovery-only — Studio's dropdown is fed by `IMAGE_MODEL_FAMILIES` which requires explicit pipeline routing (flow-match flags, sampler registry, scheduler defaults). ERNIE-Image (+ Nucleus-Image, Z-Image, HiDream, GLM-Image, FLUX.2 family) has no diffusers-routable Studio variant yet. Fix path A picked over path B (full per-family pipeline wiring) — surgical UX disambiguation. **Backend:** new `_is_launchable_image_repo(repo_id)` helper in [backend_service/helpers/images.py](backend_service/helpers/images.py) returns True only when `repo_id` resolves to a curated `IMAGE_MODEL_FAMILIES` variant. Wired into both payload sites — `_tracked_latest_seed_payloads` (line 411) + the live-HF lane (line 622) — so every Discover row carries `trackedOnly: bool`. **Frontend:** new `trackedOnly?: boolean` field on `ImageModelVariant` ([src/types/image.ts](src/types/image.ts)). 
[ImageDiscoverTab.tsx](src/features/images/ImageDiscoverTab.tsx) chip row gains a "Watching upstream" badge + tooltip when `trackedOnly`. Action column branches first on `trackedOnly` → renders a disabled `IconActionButton` with tooltip "Watching upstream — Studio playback for this family isn't wired yet. Catalog entry is for awareness; download won't unlock Studio." instead of the Generate / Download / Resume CTAs. Backward-compat: existing curated families have `trackedOnly: undefined` → falsy → no UX change. **Tests:** new `TrackedOnlyFlagTests` in [tests/test_image_discover.py](tests/test_image_discover.py) — 5 cases covering `_is_launchable_image_repo` (FLUX.1-dev + SDXL = true; ERNIE-Image / Nucleus-Image = false; empty = false), `trackedOnly: True` on ERNIE seed payload, and the negative case where a tracked seed that IS in IMAGE_MODEL_FAMILIES must NOT carry the flag (forward-compat for catalog evolution). **Follow-up path B (deferred):** wire ERNIE-Image / Nucleus-Image / Z-Image / HiDream / GLM-Image / FLUX.2 family as real launchable families via per-family pipeline detection in `image_runtime`. Multi-hour per family, gated on diffusers' upstream support landing for each architecture. 
| --- From fa0bbd925cd564d9d6757b2a0adffb1191cf2d04 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:34:45 +0100 Subject: [PATCH 3/4] feat(catalog): add Gemma 4 (E2B + 31B) and MiniMax M2.7 families MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemma 4 (gemma-4 family): - E2B: 2B multimodal, 128K ctx — official QAT Q4_0 GGUF (~1.5 GB) + BF16 - 31B: 31B multimodal, 256K ctx — MLX 8-bit, unsloth Q4_K_M GGUF, official QAT GGUF, BF16 - Both carry vision capability (Gemma4ForConditionalGeneration + vision_config confirmed) MiniMax M2.7 (minimax-m2 family): - 256 routed experts / 8 active, 200K ctx, ~240B total params / ~480 GB BF16 - mlx-community MXFP4 (~120 GB), unsloth GGUF Q4_K_M (~130 GB), official BF16 Qwen3.7 skipped — no official Qwen/Qwen3.7-* repo exists on HF as of 2026-06-12. Tests: 7 catalog gate checks updated to cover all 4 frontier families (shape, vision vs text-only, context windows, discover payload presence). --- backend_service/catalog/text_models.py | 210 +++++++++++++++++++++++++ tests/test_catalog_text_families.py | 51 ++++-- 2 files changed, 248 insertions(+), 13 deletions(-) diff --git a/backend_service/catalog/text_models.py b/backend_service/catalog/text_models.py index ed77948..d27f5c1 100644 --- a/backend_service/catalog/text_models.py +++ b/backend_service/catalog/text_models.py @@ -1068,6 +1068,216 @@ "Frontier-scale: a 4-bit GGUF is ~515 GB, so this family targets clusters and very-high-end workstations.", ], }, + { + # Google Gemma 4 — multimodal (image+text) family. Gemma4ForConditionalGeneration + # architecture with vision_config baked in; all sizes accept image inputs. + # E2B = Embedded 2B (128K ctx, ~4 GB BF16) — edge/mobile target. + # 31B = full model (256K ctx, 62.5 GB BF16) — desktop / workstation target. 
+ # Both carry a baked-in mmproj; llama_cpp_engine wires --mmproj automatically + # when the repo has an mmproj shard (FU-072 pattern). + "id": "gemma-4", + "name": "Gemma 4", + "provider": "Google", + "headline": "Google's multimodal open model family — from edge-optimised 2B to capable 31B.", + "summary": "Gemma 4 E2B (2B, 128K) and 31B (256K) — both natively multimodal with vision_config.", + "description": ( + "Gemma 4 is Google's multimodal open-weight family (Gemma4ForConditionalGeneration). " + "The Embedded 2B (E2B) targets on-device and mobile deployment with 128K context; " + "the 31B is the full desktop/workstation variant with 256K context. " + "Both accept image + text inputs natively. Apache-2.0 licensed. " + "Google publishes QAT Q4_0 GGUFs; mlx-community and unsloth publish 4-bit and 8-bit quants." + ), + "updatedLabel": "Released 2025", + "popularityLabel": "Google official", + "likesLabel": "Google official", + "badges": ["Multimodal", "Vision", "Coding", "Long context"], + "capabilities": ["vision", "coding"], + "defaultVariantId": "mlx-community/gemma-4-31b-8bit", + "variants": [ + { + "id": "mlx-community/gemma-4-31b-8bit", + "name": "Gemma 4 31B MLX 8-bit", + "repo": "mlx-community/gemma-4-31b-8bit", + "link": "https://huggingface.co/mlx-community/gemma-4-31b-8bit", + "paramsB": 31.0, + "sizeGb": 32.0, + "format": "MLX", + "quantization": "8-bit", + "capabilities": ["vision", "coding"], + "note": "8-bit MLX quant — good balance of fidelity and VRAM. Needs ~34 GB unified memory.", + "contextWindow": "256K", + "launchMode": "direct", + "backend": "mlx", + "releaseDate": "2025-05", + }, + { + "id": "unsloth/gemma-4-31B-it-GGUF", + "name": "Gemma 4 31B GGUF (Q4_K_M)", + "repo": "unsloth/gemma-4-31B-it-GGUF", + "link": "https://huggingface.co/unsloth/gemma-4-31B-it-GGUF", + "paramsB": 31.0, + "sizeGb": 19.0, + "format": "GGUF", + "quantization": "Q4_K_M", + "capabilities": ["vision", "coding"], + "note": "Q4_K_M GGUF with mmproj shard for vision. 
Runs on 24 GB VRAM or Apple Silicon.", + "contextWindow": "256K", + "launchMode": "direct", + "backend": "llama.cpp", + "releaseDate": "2025-05", + }, + { + "id": "google/gemma-4-31B-it-qat-q4_0-gguf", + "name": "Gemma 4 31B Official QAT GGUF", + "repo": "google/gemma-4-31B-it-qat-q4_0-gguf", + "link": "https://huggingface.co/google/gemma-4-31B-it-qat-q4_0-gguf", + "paramsB": 31.0, + "sizeGb": 17.0, + "format": "GGUF", + "quantization": "Q4_0 (QAT)", + "capabilities": ["vision", "coding"], + "note": "Google's official QAT (Quantization-Aware Training) Q4_0 — higher fidelity than post-hoc Q4 at same size.", + "contextWindow": "256K", + "launchMode": "direct", + "backend": "llama.cpp", + "releaseDate": "2025-05", + }, + { + "id": "google/gemma-4-31B-it", + "name": "Gemma 4 31B (BF16)", + "repo": "google/gemma-4-31B-it", + "link": "https://huggingface.co/google/gemma-4-31B-it", + "paramsB": 31.0, + "sizeGb": 62.5, + "format": "Transformers", + "quantization": "BF16", + "capabilities": ["vision", "coding"], + "note": "Official BF16 weights — convert to MLX/GGUF locally or run on a 80 GB+ GPU.", + "contextWindow": "256K", + "launchMode": "convert", + "backend": "mlx", + "releaseDate": "2025-05", + }, + { + "id": "google/gemma-4-E2B-it-qat-q4_0-gguf", + "name": "Gemma 4 E2B Official QAT GGUF", + "repo": "google/gemma-4-E2B-it-qat-q4_0-gguf", + "link": "https://huggingface.co/google/gemma-4-E2B-it-qat-q4_0-gguf", + "paramsB": 2.0, + "sizeGb": 1.5, + "format": "GGUF", + "quantization": "Q4_0 (QAT)", + "capabilities": ["vision", "coding"], + "note": "Embedded 2B — edge/mobile optimised. QAT Q4_0 is ~1.5 GB; runs on CPU or any GPU. 
128K context.", + "contextWindow": "128K", + "launchMode": "direct", + "backend": "llama.cpp", + "releaseDate": "2025-05", + }, + { + "id": "google/gemma-4-E2B-it", + "name": "Gemma 4 E2B (BF16)", + "repo": "google/gemma-4-E2B-it", + "link": "https://huggingface.co/google/gemma-4-E2B-it", + "paramsB": 2.0, + "sizeGb": 4.0, + "format": "Transformers", + "quantization": "BF16", + "capabilities": ["vision", "coding"], + "note": "Official BF16 Embedded 2B — convert to GGUF/MLX. Small enough to run on any modern GPU.", + "contextWindow": "128K", + "launchMode": "convert", + "backend": "mlx", + "releaseDate": "2025-05", + }, + ], + "readme": [ + "Gemma 4 is Google's multimodal open-weight family — all sizes accept image + text inputs.", + "E2B (Embedded 2B, 128K context) targets edge and mobile deployment; the QAT Q4_0 GGUF is ~1.5 GB.", + "The 31B (256K context) is the full-capability variant: mlx-community's 8-bit quant at ~32 GB is the recommended desktop path.", + "Google ships official QAT GGUFs for both sizes — quantization-aware training gives better quality than post-hoc quant at the same file size.", + ], + }, + { + # MiniMax M2.7 — frontier-scale sparse MoE (MiniMaxM2ForCausalLM, 256 routed + # experts / 8 active, 200K context). BF16 total ~480 GB; text-only. + # Strong on long-context reasoning and character consistency. + # Community GGUF: unsloth/MiniMax-M2.7-GGUF. MLX: mlx-community/MiniMax-M2.7-4bit-mxfp4. + "id": "minimax-m2", + "name": "MiniMax M2", + "provider": "MiniMax", + "headline": "MiniMax frontier MoE — 200K context, strong character consistency and long-context reasoning.", + "summary": "MiniMax M2.7 — 256-expert sparse MoE, 200K context. Frontier-scale, top-end hardware only.", + "description": ( + "MiniMax M2.7 is MiniMax's frontier sparse Mixture-of-Experts model (MiniMaxM2ForCausalLM, " + "256 routed experts / 8 active per token) with 200K token context. 
" + "Compared with M2.5, M2.7 adds strengthened character consistency and emotional intelligence. " + "Text-only. Recommended inference params: temperature=1.0, top_p=0.95, top_k=40. " + "Frontier-scale: BF16 is ~480 GB; even a 4-bit GGUF is ~130 GB." + ), + "updatedLabel": "Released 2026", + "popularityLabel": "Frontier family", + "likesLabel": "MiniMax official", + "badges": ["Reasoning", "Long context", "Agents", "Coding"], + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "defaultVariantId": "mlx-community/MiniMax-M2.7-4bit-mxfp4", + "variants": [ + { + "id": "mlx-community/MiniMax-M2.7-4bit-mxfp4", + "name": "MiniMax M2.7 MLX MXFP4", + "repo": "mlx-community/MiniMax-M2.7-4bit-mxfp4", + "link": "https://huggingface.co/mlx-community/MiniMax-M2.7-4bit-mxfp4", + "paramsB": 240.0, + "sizeGb": 120.0, + "format": "MLX", + "quantization": "MXFP4", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "MoE ~240B / ~10B active. MXFP4 MLX quant — ~120 GB unified memory (M3/M4 Ultra-class).", + "contextWindow": "200K", + "launchMode": "direct", + "backend": "mlx", + "releaseDate": "2026-05", + }, + { + "id": "unsloth/MiniMax-M2.7-GGUF", + "name": "MiniMax M2.7 GGUF", + "repo": "unsloth/MiniMax-M2.7-GGUF", + "link": "https://huggingface.co/unsloth/MiniMax-M2.7-GGUF", + "paramsB": 240.0, + "sizeGb": 130.0, + "format": "GGUF", + "quantization": "Q4_K_M", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + "note": "Q4_K_M ~130 GB — needs a large-RAM workstation or multi-GPU box with NVLink.", + "contextWindow": "200K", + "launchMode": "direct", + "backend": "llama.cpp", + "releaseDate": "2026-05", + }, + { + "id": "MiniMaxAI/MiniMax-M2.7", + "name": "MiniMax M2.7 (BF16)", + "repo": "MiniMaxAI/MiniMax-M2.7", + "link": "https://huggingface.co/MiniMaxAI/MiniMax-M2.7", + "paramsB": 240.0, + "sizeGb": 481.0, + "format": "Transformers", + "quantization": "BF16", + "capabilities": ["reasoning", "coding", "agents", "tool-use"], + 
"note": "Official BF16 weights — convert to GGUF/MLX. Frontier-scale, ~480 GB.", + "contextWindow": "200K", + "launchMode": "convert", + "backend": "mlx", + "releaseDate": "2026-05", + }, + ], + "readme": [ + "MiniMax M2.7 is MiniMax's frontier sparse-MoE model (256 experts / 8 active), with 200K context.", + "M2.7 improves on M2.5 with stronger character consistency and long-context reasoning.", + "The mlx-community MXFP4 quant (~120 GB) is the Apple Silicon path; unsloth Q4_K_M GGUF (~130 GB) targets high-RAM Linux workstations.", + "Frontier-scale — even 4-bit quantization requires 120+ GB of memory.", + ], + }, ] diff --git a/tests/test_catalog_text_families.py b/tests/test_catalog_text_families.py index be3e445..633f5cb 100644 --- a/tests/test_catalog_text_families.py +++ b/tests/test_catalog_text_families.py @@ -1,7 +1,7 @@ """Catalog gate for the frontier text families added for the release -(DeepSeek V4, GLM-5). Asserts they parse, carry every field the discover -payload builder reads, and surface in the family payloads — so a malformed -entry can't ship a broken Discover tab. +(DeepSeek V4, GLM-5, Gemma 4, MiniMax M2). Asserts they parse, carry every +field the discover payload builder reads, and surface in the family payloads +— so a malformed entry can't ship a broken Discover tab. 
""" import unittest @@ -23,12 +23,14 @@ class NewTextFamiliesTests(unittest.TestCase): def setUp(self): self.by_id = {f["id"]: f for f in MODEL_FAMILIES} - def test_deepseek_v4_and_glm5_present(self): - self.assertIn("deepseek-v4", self.by_id) - self.assertIn("glm-5", self.by_id) + _ALL_NEW_FAMILIES = ("deepseek-v4", "glm-5", "gemma-4", "minimax-m2") + + def test_all_new_families_present(self): + for fid in self._ALL_NEW_FAMILIES: + self.assertIn(fid, self.by_id, f"{fid} missing from MODEL_FAMILIES") def test_new_families_have_required_shape(self): - for fid in ("deepseek-v4", "glm-5"): + for fid in self._ALL_NEW_FAMILIES: fam = self.by_id[fid] self.assertEqual(_REQUIRED_FAMILY_FIELDS - set(fam), set(), f"{fid} family fields") self.assertTrue(fam["variants"], f"{fid} has variants") @@ -40,22 +42,45 @@ def test_new_families_have_required_shape(self): self.assertIn(v["backend"], ("mlx", "llama.cpp", "vllm")) self.assertIn(v["launchMode"], ("direct", "convert")) - def test_new_families_are_text_only_no_vision(self): - # DeepSeek V4 + GLM-5 configs carry no vision_config — the catalog - # must not advertise vision (would render a broken composer affordance). - for fid in ("deepseek-v4", "glm-5"): + def test_text_only_families_have_no_vision(self): + # DeepSeek V4 / GLM-5 / MiniMax M2 carry no vision_config in their HF + # configs — must not advertise vision (broken composer affordance if so). + for fid in ("deepseek-v4", "glm-5", "minimax-m2"): fam = self.by_id[fid] self.assertNotIn("vision", fam["capabilities"], f"{fid} family vision tag") for v in fam["variants"]: self.assertNotIn("vision", v["capabilities"], f"{fid}/{v['id']} vision tag") + def test_gemma4_carries_vision_capability(self): + # All Gemma 4 sizes are multimodal (Gemma4ForConditionalGeneration + vision_config). 
+ fam = self.by_id["gemma-4"] + self.assertIn("vision", fam["capabilities"]) + for v in fam["variants"]: + self.assertIn("vision", v["capabilities"], f"gemma-4/{v['id']} missing vision tag") + + def test_gemma4_contexts(self): + # E2B = 128K, 31B = 256K — verify the catalog reflects the config.json values. + e2b_variants = [v for v in self.by_id["gemma-4"]["variants"] if "E2B" in v["repo"]] + b31_variants = [v for v in self.by_id["gemma-4"]["variants"] if "31B" in v["repo"] or "31b" in v["repo"]] + self.assertTrue(e2b_variants, "no E2B variants found") + self.assertTrue(b31_variants, "no 31B variants found") + for v in e2b_variants: + self.assertEqual(v["contextWindow"], "128K", f"{v['id']} E2B context wrong") + for v in b31_variants: + self.assertEqual(v["contextWindow"], "256K", f"{v['id']} 31B context wrong") + + def test_minimax_m27_context(self): + fam = self.by_id["minimax-m2"] + for v in fam["variants"]: + self.assertEqual(v["contextWindow"], "200K", f"minimax-m2/{v['id']} context wrong") + def test_new_families_surface_in_discover_payloads(self): from backend_service.helpers.discovery import _model_family_payloads payloads = _model_family_payloads({"totalMemoryGb": 64, "availableMemoryGb": 32}, []) ids = {p.get("id") for p in payloads} - self.assertIn("deepseek-v4", ids) - self.assertIn("glm-5", ids) + for fid in self._ALL_NEW_FAMILIES: + self.assertIn(fid, ids, f"{fid} missing from discover payloads") if __name__ == "__main__": From 0d4886403dbcf6047a1025fc00a7848789db5f09 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Mon, 15 Jun 2026 10:18:59 +0100 Subject: [PATCH 4/4] chore(deps): bump turboquant-mlx-full >=0.6.2->0.8.0, vllm >=0.22.1->0.23.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2026-06-15 upstream scan: - turboquant-mlx-full 0.8.0: adds Mamba/hybrid arch support + GPT-OSS-120B optimizations. Same TurboQuantKVCache call surface, backward compatible. 
Floor: 0.6.2 → 0.8.0. - vllm 0.23.0 released. Floor: 0.22.1 → 0.23.0 (both [vllm] and [triattention] extras). No action needed: - mlx-vlm 0.6.3: already at floor, unchanged. - mlx-lm 0.31.3: installed version, loose >=0.22.0 floor sufficient. - mlx 0.31.2: installed version, loose >=0.22.0 floor sufficient. - diffusers 0.38.0: at floor, no new release. - TriAttention: still at pinned c3744ee6 (v0.2.0), no upstream change. Deferred (tracker notes updated): - dflash-mlx: v0.1.10 now tagged; FU-057/067 migration still deferred. - llama-server-turbo branch: HEAD drifted to 7985f6b9; FU-065 deferred. - TurboQuant+: v0.3.2.3 latest, no PyPI wheel, FU-032 trigger not met. - MTPLX: now v1.0.4; FU-079 re-test still pending. --- CLAUDE.md | 10 +++++----- pyproject.toml | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index e3f30ec..f5767a1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -152,7 +152,7 @@ no longer relevant. | FU-029 | KVTC (NVIDIA ICLR 2026) KV cache strategy | **Deferred 2026-05-10 — CUDA-only upstream, awaiting MLX/Metal port + PyPI release.** | Targeting [OnlyTerp/kvtc](https://github.com/OnlyTerp/kvtc) (Apache 2.0). PCA + adaptive quantization + entropy coding — 8–32× compression vs the dropped ChaosEngine's 3.7×, peer-reviewed at ICLR 2026, beats TurboQuant by 37% at comparable quality on long-context. Upstream blockers: (a) CUDA-only — repo's roadmap mentions MLX/Metal as "planned" but not yet implemented, so the Apple Silicon dev box cannot validate end-to-end; (b) not on PyPI — distributed as a `src.*` repo intended for `git clone`; (c) integration shape is a HuggingFace `DynamicCache` wrapper (not a llama.cpp cache type), so the existing GGUF lane has no path. Re-evaluate when either upstream ships MLX support or a Windows/Linux+CUDA development box becomes available. Apple Silicon users continue on TurboQuant-MLX (also ICLR 2026, native today). 
| | ~~FU-030~~ | ~~Drop ChaosEngine + RotorQuant strategy slots~~ | **Shipped 2026-05-10.** | ChaosEngine (cryptopoly/ChaosEngine — 1 commit upstream, eclipsed by KVTC at ICLR 2026 with the same PCA approach but 8–32× compression vs 3.7×) and RotorQuant (shipped as a misleading alias for TurboQuant — same ``--cache-type-k turbo{N}`` flags + same Python module marker) both removed from the registry. Persisted user configs that still reference these ids coerce silently to ``turboquant`` via a new ``CacheStrategyRegistry.resolve_legacy_id`` helper + module-level ``_LEGACY_STRATEGY_ALIASES`` map ([cache_compression/__init__.py](cache_compression/__init__.py)). Mirror coercion in frontend ([src/components/runtimeSupport.ts](src/components/runtimeSupport.ts) ``LEGACY_STRATEGY_ALIASES`` + ``canonicalStrategyId``). Two-level llama.cpp fallback chain (was three-level: requested → ChaosEngine → native; now requested → native) in [backend_service/inference/llama_cpp_engine.py](backend_service/inference/llama_cpp_engine.py). Vendored ChaosEngine bundling stripped from [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) (3 helper functions removed: ``stageVendoredChaosEngine`` + ``ensureSetuptoolsForPep639`` + ``resolveChaosEngineVendor``). Pre-build probe asserts the legacy-id coercion works in CI. ``[rotorquant]`` extra removed from [pyproject.toml](pyproject.toml). ``CHAOSENGINE_VENDOR_PATH`` env var dropped. Cache strategy speed/quality maps in [helpers/cache.py](backend_service/helpers/cache.py) trimmed to remaining strategies. | | ~~FU-031~~ | ~~Extend `DRAFT_MODEL_MAP` for new z-lab DFlash drafters + pin TriAttention~~ | **Shipped 2026-05-10.** | z-lab published draft checkpoints for several new families since the last `DRAFT_MODEL_MAP` audit; the upstream `dflash-mlx` 0.1.5 release also added the Gemma4 backend (commit 05cc456). 
Added entries for `google/gemma-4-31B-it`, `google/gemma-4-26B-A4B-it`, `Qwen/Qwen3.5-122B-A10B`, `MiniMaxAI/MiniMax-M2.5`, `MiniMaxAI/MiniMax-M2.7`, `moonshotai/Kimi-K2.6` (all in [dflash/__init__.py](dflash/__init__.py)) plus `mlx-community/...` aliases for each so Apple Silicon quants resolve. New 7 unit tests in [tests/test_dflash.py](tests/test_dflash.py) pin the mappings. **Same commit also pinned TriAttention** to `c3744ee6a50522a1559a577f85aef2b165a344f2` in [pyproject.toml](pyproject.toml) — previously the `[triattention]` and `[triattention-mlx]` extras pulled `git+...git` HEAD, which made fresh installs non-reproducible whenever the upstream landed unreleased work. Pin matches the v0.2.0 release surface plus the AMD GPU port. | -| FU-032 | TurboQuant+ ([TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus)) Apple Silicon Metal kernels (**watch-closely**) | Re-evaluate when upstream tags v1.0 release or beats `turboquant-mlx-full` 0.3.0 on a public M-series benchmark | Same author as our `llama-cpp-turboquant` fork. Adds Walsh-Hadamard rotation (improvement over base TurboQuant's Hadamard-only path) + a sparse-V optimization on M5 Max that achieves 0.93x of q8_0 decode speed at long context while saving 50–64% of KV memory. Reported numbers: turbo3 4.6× compression at +1.06% PPL, turbo4 3.8× compression at +0.23% PPL — comparable to our existing `turboquant-mlx-full` pin but with newer kernels. 326 commits + community tested across M1/M2/M3/M5. **Not on PyPI** (development install via `git clone` + `pip install -e .[dev]`), so adopting it means a vendored or git+url install pattern like dflash-mlx — re-evaluate when upstream publishes a wheel or tags a v1.0. Apple Silicon stays on `turboquant-mlx-full` for now; the underlying llama-server-turbo binary already exposes turbo2/3/4 cache types. 
| +| FU-032 | TurboQuant+ ([TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus)) Apple Silicon Metal kernels (**watch-closely**) | Re-evaluate when upstream tags v1.0 release or beats `turboquant-mlx-full` 0.8.0 on a public M-series benchmark | Same author as our `llama-cpp-turboquant` fork. Adds Walsh-Hadamard rotation (improvement over base TurboQuant's Hadamard-only path) + a sparse-V optimization on M5 Max that achieves 0.93x of q8_0 decode speed at long context while saving 50–64% of KV memory. Reported numbers: turbo3 4.6× compression at +1.06% PPL, turbo4 3.8× compression at +0.23% PPL — comparable to our existing `turboquant-mlx-full` pin but with newer kernels. **Not on PyPI** (development install via `git clone` + `pip install -e .[dev]`), so adopting it means a vendored or git+url install pattern like dflash-mlx — re-evaluate when upstream publishes a wheel or tags a v1.0. Apple Silicon stays on `turboquant-mlx-full` for now. **2026-06-15 scan:** latest tags are v0.3.2.1–v0.3.2.3 (HEAD `7f601a13`). Still no PyPI wheel, still no v1.0 tag. FU-032 trigger not met; updated comparison baseline from 0.3.0 to 0.8.0 since our floor advanced. | | ~~FU-033~~ | ~~dflash-mlx pin sync assert in pre-build-check~~ | **Shipped 2026-05-10.** | Caught a real bug: [pyproject.toml](pyproject.toml) and [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) had drifted to different `dflash-mlx` commit hashes (the dev `.venv` ran 0.1.5.1 while `npm run stage:runtime` was bundling 0.1.4.1 into release builds). Both files manually synced to `fada1eb`; new probe in [scripts/pre-build-check.mjs](scripts/pre-build-check.mjs) and [scripts/pre-build-check.sh](scripts/pre-build-check.sh) regex-extracts the commit hash from both files and fails the build when they diverge. Same probe also took the chance to drop the orphan `vendor/ChaosEngine` staleness check from both runners — that vendored path was dropped in FU-030 and would never resolve again. 
| | ~~FU-041~~ | ~~Qwen3-Coder-Next-MLX-4bit was mis-canonicalised as Qwen3.6-27B-4bit~~ | **Shipped 2026-05-10.** | User-spotted mismatch: their local install at `/Users/dan/AI_Models/lmstudio-community/Qwen3-Coder-Next-MLX-4bit` was surfacing as canonical repo `mlx-community/Qwen3.6-27B-4bit` in the diagnostics snapshot, picking up the wrong catalog row and the wrong DFlash drafter. Inspecting the on-disk `config.json` confirmed the model is **Qwen3-Next** (architectures `Qwen3NextForCausalLM`, `model_type: "qwen3_next"`, sparse MoE with 512 experts, hidden_size 2048, ~3B active per token) — fundamentally different from the dense Qwen3.6-27B (`qwen3` arch, hidden_size 5120). Root cause: there was no catalog variant for the lmstudio-community community MLX 4-bit conversion of Coder-Next, so the fuzzy matcher in `src/utils/library.ts::libraryVariantMatchScore` settled for the closest "MLX + 4-bit + Qwen3" entry, which happened to be the unrelated `mlx-community/Qwen3.6-27B-4bit` row. Fix: (1) added an explicit `lmstudio-community/Qwen3-Coder-Next-MLX-4bit` variant to the `qwen3-coder-next` family in `backend_service/catalog/text_models.py` with the correct params (80B sparse, ~45 GB on disk, qwen3_next family capabilities). (2) Reverted the FU-038 DFlash aliases that wrongly pointed `mlx-community/Qwen3.6-27B-4bit / bf16 / 8bit` at `Qwen/Qwen3-Coder-Next` — those quants are the dense 27B Coder and have no drafter today. (3) Replaced them with the correct `lmstudio-community/Qwen3-Coder-Next-MLX-4bit` alias plus an `-Instruct` sibling for completeness. New regression tests in `tests/test_dflash.py` pin both the new alias resolution and that the dense 27B-4bit MUST NOT alias to the MoE drafter. 
| | ~~FU-040~~ | ~~Tool-call parser misses open-only `<tool_call>` + Qwen3.6-27B false-positive vision tag~~ | **Shipped 2026-05-10.** | Surfaced by a Coder-Next chat session: tool calls rendered as raw `{"name": "web_search", ...}` text in the assistant bubble with no execution, while in a separate turn the "Attach image" affordance appeared even though Qwen3.6-27B is text-only. Three fixes. (1) **Tool-call parser widened.** Old regex `<tool_call>\s*(\{.*?\})\s*</tool_call>` required a closing tag and only matched objects. Coder-Next emitted three real-world shapes in a single session: canonical (closed + object), open-only (no `</tool_call>`), and array-shaped (model hallucinated a list of pseudo-results). The new parser uses `json.JSONDecoder.raw_decode` on each `<tool_call>` opener so it consumes the next valid JSON value regardless of close tag, dispatches objects with a `name`, drops list payloads silently, and continues scanning so a later well-formed call in the same message still lands. 7 new unit tests in `tests/test_agent.py` pin all three shapes plus the OpenAI-style stringified-arguments path. (2) **`_strip_tool_call_xml` helper** removes the JSON region the parser consumed from `result.text` before the streaming layer hands it to the chat bubble — fixes the "raw XML next to the ToolCallCard" duplication. Applied in both `run_agent_loop` and `run_agent_loop_streaming`. 6 new unit tests pin the strip behaviour. (3) **Qwen3.6-27B + Qwen3.5 catalog cleanup.** Dense Qwen3.6-27B (Coder-Next branding), Qwen3.6-27B-FP8, mlx-community/Qwen3.6-27B-4bit, and the family-level Qwen3.6 + Qwen3.5 entries all carried the `vision` capability — a copy-paste bug from when the catalog was scaffolded. Vision lives on a separate `Qwen3.6-27B-VL` variant we do not yet ship; the stale tag was promoting `supportsVision: true` for every community quant, making `ChatComposer` render the "Attach image" affordance for a text-only model. Dropped the tag from all five entries. | @@ -182,22 +182,22 @@ no longer relevant.
| ~~FU-062~~ | ~~Bump `turboquant-mlx-full` floor `>=0.3.0` → `>=0.4.0`~~ | **Shipped 2026-05-25 (v0.9.3).** | Upstream `turboquant-mlx-full` 0.4.1 on PyPI (installed was 0.3.0, FU-001 pin). v0.4.0 added **expert streaming** — pages router-selected MoE experts from disk per token, runs models whose weights exceed available RAM. Live-validated upstream against `Qwen3.6-35B-A3B` (35B sparse) on a 16 GB Mac mini in under 4 GB RAM, output bit-identical to fully-resident model. Compounds with our existing Hadamard rotation + Lloyd-Max codebook K/V compression. Floor bump only — no API changes required, runtime continues to call `TurboQuantKVCache` with the same signature. Pin lives in [pyproject.toml](pyproject.toml) `[turboquant]` extra. Apple Silicon only (CUDA users stay on the `llama-server-turbo` binary path via FU-001's parallel track). | | ~~FU-063~~ | ~~Bump `mlx-vlm` floor `>=0.4.0` → `>=0.5.0`~~ | **Shipped 2026-05-25 (v0.9.3).** | Upstream `mlx-vlm` 0.5.0 on PyPI (installed was 0.4.4). Minor bump, no API breakage at our call surface (`mlx_vlm.load` + `mlx_vlm.generate` from [mlx_worker_multimodal.py](backend_service/mlx_worker_multimodal.py)). Floor bump in [pyproject.toml](pyproject.toml) `[mlx-vlm]` extra; loose `>=` semantics mean existing 0.4.x installs are still satisfied locally, but fresh installs pick up the newer wheel which carries the upstream Qwen3.5-VL + GLM-4.5V fixes. | | ~~FU-064~~ | ~~Add `ggml-org/Qwen3.6-{27B,35B-A3B}-GGUF` non-MTP catalog rows~~ | **Shipped 2026-05-25 (v0.9.3).** | ggml-org published canonical Q8_0 non-MTP companion packs on 2026-05-22 alongside the MTP variants we wired in FU-047. Two new rows in [text_models.py](backend_service/catalog/text_models.py) `qwen-3-6` family: `ggml-org/Qwen3.6-27B-GGUF` (Q8_0, 29 GB, dense) + `ggml-org/Qwen3.6-35B-A3B-GGUF` (Q8_0, 37 GB, MoE). Catalog note steers users at the MTP siblings when they want spec-dec. 
No runtime changes — direct `llama.cpp` lane, same as the lmstudio-community Q4_K_M variants already shipping. | -| FU-065 | Pin `llama-cpp-turboquant` to a commit hash instead of branch HEAD | Trigger: any user-reported build divergence between two install runs, OR a release-build gate where reproducibility matters more than tracking upstream. | [scripts/build-llama-turbo.sh](scripts/build-llama-turbo.sh) + [scripts/update-llama-turbo.sh](scripts/update-llama-turbo.sh) currently clone `TheTom/llama-cpp-turboquant` at branch `feature/turboquant-kv-cache` (`LLAMA_TURBO_BRANCH` env var), then `git reset --hard origin/$TURBO_BRANCH`. Two installs at different times can ship different binaries — the same drift problem FU-033 fixed for `dflash-mlx`. Today's branch HEAD is `2cbfdc62a1a047b01377948dfdede8cb6a744866`. Plan: add `LLAMA_TURBO_COMMIT="${LLAMA_TURBO_COMMIT:-2cbfdc62...}"` to both scripts, `git checkout "$LLAMA_TURBO_COMMIT"` after fetch, surface the hash in `llama-server-turbo.version`, and add a sync-assert to `pre-build-check` that compares the build-script pin to a value in [pyproject.toml](pyproject.toml) or a dedicated `UPSTREAM_PINS.md`. Defer because (a) branch is single-purpose with low churn — author is the same TheTom we already trust for `turboquant_plus`; (b) we already have the v0.9.2 → v0.9.3 release with this code path working. **2026-06-11 release scan:** branch HEAD has drifted `2cbfdc62…` → `73eb521daebc85da7c91d37178940b99a5524cf6` — confirms the reproducibility risk this row tracks. Pin still deferred: pinning the *drifted* `73eb521d` is unsafe without a verified test-compile (could ship a broken turbo binary), and reverting-pinning to the known-good `2cbfdc62` drops upstream work. When picked up, pin to a commit that's been build-tested on the M4 Max box. | -| FU-066 | Audit `cache-strategy-matrix` runner against bumped `turboquant-mlx-full` 0.4.x | When FU-062's bump lands in CI or when a user reports a TurboQuant regression. 
| The runner's TurboQuant cell (`mlx-community/Qwen3-0.6B-4bit × cacheStrategy=turboquant cacheBits=3`) passed against 0.3.0 with output hash `b4337bc07457` (FU-051 evidence). 0.4.x's expert-streaming code path is a no-op for dense 0.6B but flips on for MoE models like `mlx-community/Qwen3.6-35B-A3B-4bit` — worth a one-time live capture of an MoE turboquant cell against the 0.4.x wheel to lock in a baseline hash. No code changes; just record the number once the bumped wheel is installed on the M4 Max box. | +| FU-065 | Pin `llama-cpp-turboquant` to a commit hash instead of branch HEAD | Trigger: any user-reported build divergence between two install runs, OR a release-build gate where reproducibility matters more than tracking upstream. | [scripts/build-llama-turbo.sh](scripts/build-llama-turbo.sh) + [scripts/update-llama-turbo.sh](scripts/update-llama-turbo.sh) currently clone `TheTom/llama-cpp-turboquant` at branch `feature/turboquant-kv-cache` (`LLAMA_TURBO_BRANCH` env var), then `git reset --hard origin/$TURBO_BRANCH`. Two installs at different times can ship different binaries — the same drift problem FU-033 fixed for `dflash-mlx`. Today's branch HEAD is `2cbfdc62a1a047b01377948dfdede8cb6a744866`. Plan: add `LLAMA_TURBO_COMMIT="${LLAMA_TURBO_COMMIT:-2cbfdc62...}"` to both scripts, `git checkout "$LLAMA_TURBO_COMMIT"` after fetch, surface the hash in `llama-server-turbo.version`, and add a sync-assert to `pre-build-check` that compares the build-script pin to a value in [pyproject.toml](pyproject.toml) or a dedicated `UPSTREAM_PINS.md`. Defer because (a) branch is single-purpose with low churn — author is the same TheTom we already trust for `turboquant_plus`; (b) we already have the v0.9.2 → v0.9.3 release with this code path working. **2026-06-11 release scan:** branch HEAD has drifted `2cbfdc62…` → `73eb521daebc85da7c91d37178940b99a5524cf6` — confirms the reproducibility risk this row tracks. 
Pin still deferred: pinning the *drifted* `73eb521d` is unsafe without a verified test-compile (could ship a broken turbo binary), and re-pinning to the known-good `2cbfdc62` drops upstream work. When picked up, pin to a commit that's been build-tested on the M4 Max box. **2026-06-15 release scan:** branch HEAD drifted again → `7985f6b90bf19881ab7c7a8444954e91cae36056`. Reproducibility risk continues to accumulate. Still deferred pending test-compile. | +| FU-066 | Audit `cache-strategy-matrix` runner against bumped `turboquant-mlx-full` 0.8.x | When 0.8.0 floor is installed on the M4 Max box or when a user reports a TurboQuant regression. | The runner's TurboQuant cell (`mlx-community/Qwen3-0.6B-4bit × cacheStrategy=turboquant cacheBits=3`) passed against 0.3.0 with output hash `b4337bc07457` (FU-051 evidence). 0.4.x expert-streaming + 0.5.x parallel prefetch + 0.8.x Mamba/hybrid arch support are all no-ops for dense 0.6B but may affect MoE models. **2026-06-15:** floor bumped `>=0.6.2` → `>=0.8.0` in [pyproject.toml](pyproject.toml). Worth a one-time live capture of the TurboQuant cell against 0.8.0 once the wheel is installed locally. Bumped threshold from "0.4.x" to "0.8.x" to track the current floor. | | ~~FU-072~~ | ~~Restore `vision` capability to Qwen3.5 + Qwen3.6 families (reverse FU-040)~~ | **Shipped 2026-05-28.** | FU-040 (2026-05-10) removed `vision` from Qwen3.6-27B + family, asserting the dense model was text-only with vision on "a separate `Qwen3.6-27B-VL` we don't ship." Re-checking upstream on 2026-05-28: **every** Qwen3.5/3.6 `config.json` now ships `architectures: [Qwen3_5ForConditionalGeneration]` / `[Qwen3_5MoeForConditionalGeneration]` with `vision_config` + `image_token_id` + `vision_start/end_token_id` — the base models are natively multimodal.
`mlx-vlm` ships `qwen3_5` + `qwen3_5_moe` model support, and the `ggml-org/*-GGUF` packs include an `mmproj-*.gguf` sibling (auto-wired by `llama_cpp_engine._resolve_mmproj_path` → `--mmproj`). The catalog was also internally inconsistent (Qwen3.5-9B tagged vision, Qwen3.5-4B not, same arch). Re-added `vision` across both families in [text_models.py](backend_service/catalog/text_models.py): qwen-3-6 family-level + all 11 variants; qwen-3-5 family-level + `Qwen3.5-4B` (vision+video, matching its 9B sibling) + `lmstudio-community/Qwen3.5-9B-GGUF`. **Safety net (why this can't resurrect the FU-040 broken-button bug):** the composer "Attach image" affordance ([ChatComposer.tsx:129](src/features/chat/ChatComposer.tsx)) reads the *runtime* `supportsVision`, which [catalog/capabilities.py](backend_service/catalog/capabilities.py) demotes to False for the MLX worker (carries no images today) and gates on actual `--mmproj` resolution for GGUF ([llama_cpp_engine.py:737](backend_service/inference/llama_cpp_engine.py) `visionEnabled=attempt_mmproj_path is not None`). So the catalog `vision` tag now drives only the variant-picker / discover badges (capability-in-principle), while the functional button stays runtime-accurate. `gemma-4` was already correctly vision-tagged (mlx-vlm `gemma4` support) — left untouched. Catalog parses + `test_capabilities` / `test_mmproj_vision` green. | | ~~FU-075~~ | ~~MLX spec-dec silently broken — stale `configure_full_attention_split` import~~ | **Shipped 2026-05-29.** | **Highest-impact bug this sweep.** Inspecting the matrix runtimeNotes (not just pass/fail) revealed the MLX DFlash / DDTree / MTPLX cells were *passing the weak non-empty-output check while NOT actually running spec-dec* — `actual_strategy: native`, note `dflash-mlx could not be imported (cannot import name 'configure_full_attention_split' from 'dflash_mlx.runtime')`. 
Root cause: dflash-mlx 0.1.5 moved the pre-0.1.5 top-level `configure_full_attention_split` onto the per-family `target_ops` adapter (the FU-006 migration that rewrote `ddtree.py` — but [mlx_worker_lifecycle.py:153](backend_service/mlx_worker_lifecycle.py) was missed). Python evaluates the whole `from … import a, b` line, so the failed `configure_full_attention_split` symbol killed the co-imported `load_draft_bundle` too → `_dflash_generator` never loaded → **every** MLX spec-dec path fell back to standard generation for all users. Fix: import `load_draft_bundle` + `resolve_target_ops` (both still top-level), resolve the adapter, and call `target_ops.configure_full_attention_split(...)` only for the `hybrid_gdn` family (it's a no-op for pure-attention Qwen3/3.5/3.6 — upstream only calls it there). Live-verified after fix: DFlash note "DFLASH speculative decoding active (draft: z-lab/Qwen3-4B-DFlash-b16)", DDTree "DDTree active (budget=16)". | | ~~FU-076~~ | ~~MTP tensor probe missed top-level `mtp.` keys → MTPLX never selected~~ | **Shipped 2026-05-29.** | The matrix MTPLX cell routed to the DFlash path instead of `MtplxEngine`. `RuntimeController._select_engine` gates MTPLX on `has_mtp_heads_strict(repo, path)`, which calls `model_has_mtp_tensors(path)` → scans the safetensors index against `_MTP_TENSOR_HINTS = ('mtp_heads.', 'mtp_decoder.', 'mtp_emb.', 'model.mtp.', '.mtp.')`. Every hint assumes a *nested* key, but Qwen3.5 / Qwen3.6 ship the MTP head as **top-level** `mtp.layers.*` / `mtp.fc.weight` (no leading prefix) — so the probe returned False on a genuinely MTP-bearing model and MTPLX was skipped. Live-confirmed: `model_has_mtp_tensors` returned False on the real `Qwen/Qwen3.5-4B` snapshot. Fix in [_mtp.py](backend_service/inference/_mtp.py): also match `tensor_name.startswith("mtp.")`. New `test_safetensors_index_with_top_level_mtp_keys` in [tests/test_inference.py](tests/test_inference.py). 
| | ~~FU-077~~ | ~~MTPLX isolated venv had a truncated install (missing server deps)~~ | **Shipped 2026-05-29.** | After FU-076 routed correctly, `MtplxEngine` startup died: `ModuleNotFoundError: No module named 'numpy'` — and then `safetensors`, `uvicorn`, `fastapi`, `pydantic`, `mlx-lm`, `rich`… The `~/.chaosengine/mtplx-venv` was a *truncated* install (interrupted `pip install mtplx`), but the installer's verify only ran `import mtplx`, which succeeds because the server deps are imported lazily by `mtplx.server.openai` (not at package top level). Fixed the live venv with a full `pip install --upgrade mtplx` (0.3.5 → 0.3.7, pulled all deps). Hardened [scripts/install-mtplx.sh](scripts/install-mtplx.sh): the verify now imports `mtplx.server.openai` (the real server entrypoint) and auto-retries a full dependency install once before failing loudly, so a truncated install can't pass silently again. | | ~~FU-078~~ | ~~MtplxEngine handed MTPLX a bare repo id instead of the local snapshot path~~ | **Shipped 2026-05-29.** | Final MTPLX blocker: `mtplx quickstart` died with "model is not available locally. Run: mtplx pull Qwen/Qwen3.5-4B" — it resolves a model *id* against its own registry/cache, not the HF hub cache. [mtplx_engine.py](backend_service/inference/mtplx_engine.py) set `model_arg = path or runtime_target or model_ref`, and for raw HF-org repos `path` is None while `runtime_target` is the *repo id* (`Qwen/Qwen3.5-4B`), so MTPLX got an id it couldn't find. Fix: whenever the candidate isn't an existing local directory, resolve the already-downloaded HF snapshot dir via `snapshot_download(model_ref, local_files_only=True)` (no network) and pass that. Live-verified: MTPLX now **loads + engages** (note "MTPLX MTP speculative decoding active (draft tokens: 1, model: Qwen3.5-4B)", reports 17.8 tok/s) instead of failing to start. 
Also fixed the matrix runner's `0.0 tok/s` (read `done.assistant.metrics.tokS`, not a non-existent top-level `tokensPerSecond`) + captured `dflashAcceptanceRate`. **Verified-genuine after these fixes: DFlash (33.2 tok/s), DDTree (31.4 tok/s), GGUF-MTP (14.7 tok/s), turboquant MLX/GGUF, triattention, native** — all stream real output with real throughput. MTPLX still has one remaining issue → FU-079. | | ~~FU-080~~ | ~~Backend cold start dragged in torch via cache-strategy availability probes~~ | **Shipped 2026-05-29.** | `python -X importtime backend_service.app` measured **2.6 s**, of which **1.64 s was `diffusers.hooks`** (→ `torch` → `torch._dynamo` → `sympy`) — blowing the CLAUDE.md "< 2 s backend startup" target. Traced the chain: state init → system snapshot → `_get_cache_strategies()` → `registry.available()` instantiates every strategy and calls `is_available()`, and the 5 diffusion strategies (fbcache / taylorseer / magcache / pab / fastercache) answered availability by **actually importing `diffusers.hooks`** — pulling the whole torch stack onto the cold-start path on every launch. Fix: new [cache_compression/_diffusers_probe.py](cache_compression/_diffusers_probe.py) `diffusers_at_least(major, minor)` reads the installed version via `importlib.metadata.version` (metadata only — never executes `diffusers.__init__`, so no torch). Each `is_available()` now gates on the version (fbcache ≥0.36, the other four ≥0.38); the real `diffusers.hooks` import stays lazy inside each `apply_*` method (still raises a clean NotImplementedError on a broken install). Result: `diffusers` / `torch` / `mlx` are **no longer in `sys.modules` after `import backend_service.app`**, import time dropped **2.6 s → ~0.85 s**, and cold-start → first `/api/health` 200 is **2.34 s** (the native-backend MLX subprocess probe was already async — "detection still running" on first health, never blocked startup). 
Two subprocess-isolated regression guards in [tests/test_cache_strategies.py](tests/test_cache_strategies.py) (`StartupImportPurityTests`) assert neither `registry.available()` nor `import backend_service.app` pulls torch/diffusers, so this can't silently regress. All 5 diffusion strategies still report `available=True` against the installed diffusers 0.38. | -| FU-079 | MTPLX proxy doesn't surface incremental tokens to the chat stream (empty output) | Active — MTPLX-specific, lower priority (FU-048: MTPLX is ~flat-to-slower vs the alternatives, which all work). | After FU-075–078, the matrix MTPLX cell flipped from "fake pass via DFlash fallback" to **engine genuinely engaged but `FAIL — empty output`**: the loaded-model note confirms "MTPLX MTP active (draft tokens: 1)" and the done event carries a real `tokS` (17.8), but the streamed assistant text is empty (output SHA `e3b0c44298fc` = the empty-string hash). Confirmed the chat stream's incremental token field IS `{"token": "..."}` (DFlash/DDTree/GGUF-MTP/native all stream through it fine on the same `/api/chat/generate/stream` endpoint), so the gap is in `MtplxEngine`'s OpenAI-`/v1`-proxy → SSE adapter: it surfaces final metrics but not per-token deltas, leaving `full_text` empty for both the matrix runner AND the real Chat UI. Plan: inspect `MtplxEngine.generate` / its streaming proxy in [mtplx_engine.py](backend_service/inference/mtplx_engine.py), map the mtplx server's `/v1/chat/completions` SSE `choices[].delta.content` chunks onto our `{"token": ...}` event shape. Until fixed, MTPLX loads but produces no visible output — DFlash is the working MLX spec-dec lane for the same models (and faster per FU-048). **2026-06-11 release scan:** MTPLX reached **v1.0.0 + v1.0.1** (PyPI; was 0.3.5 on this box). The installer ([scripts/install-mtplx.sh](scripts/install-mtplx.sh)) is unpinned (`pip install --upgrade mtplx`), so a fresh install now auto-pulls v1.0.1 — no code change needed. 
v1.0.0 release notes claim `/v1/completions` now "streams tokens as they are generated, with real finish reasons and usage", which **may resolve this empty-output** at the source. Still HTTP-server-only (the FU-048 in-process-API root persists). **Action: re-test FU-079 against v1.0.1 with a live MTPLX run** (reinstall the mtplx venv → load an MTP model → confirm the chat stream surfaces per-token `{"token": …}` deltas). If v1.0.0's streaming fixed it, this row closes with no adapter change. | +| FU-079 | MTPLX proxy doesn't surface incremental tokens to the chat stream (empty output) | Active — MTPLX-specific, lower priority (FU-048: MTPLX is ~flat-to-slower vs the alternatives, which all work). | After FU-075–078, the matrix MTPLX cell flipped from "fake pass via DFlash fallback" to **engine genuinely engaged but `FAIL — empty output`**: the loaded-model note confirms "MTPLX MTP active (draft tokens: 1)" and the done event carries a real `tokS` (17.8), but the streamed assistant text is empty (output SHA `e3b0c44298fc` = the empty-string hash). Confirmed the chat stream's incremental token field IS `{"token": "..."}` (DFlash/DDTree/GGUF-MTP/native all stream through it fine on the same `/api/chat/generate/stream` endpoint), so the gap is in `MtplxEngine`'s OpenAI-`/v1`-proxy → SSE adapter: it surfaces final metrics but not per-token deltas, leaving `full_text` empty for both the matrix runner AND the real Chat UI. Plan: inspect `MtplxEngine.generate` / its streaming proxy in [mtplx_engine.py](backend_service/inference/mtplx_engine.py), map the mtplx server's `/v1/chat/completions` SSE `choices[].delta.content` chunks onto our `{"token": ...}` event shape. Until fixed, MTPLX loads but produces no visible output — DFlash is the working MLX spec-dec lane for the same models (and faster per FU-048). **2026-06-11 release scan:** MTPLX reached **v1.0.0 + v1.0.1** (PyPI; was 0.3.5 on this box). 
The installer ([scripts/install-mtplx.sh](scripts/install-mtplx.sh)) is unpinned (`pip install --upgrade mtplx`), so a fresh install now auto-pulls v1.0.1 — no code change needed. v1.0.0 release notes claim `/v1/completions` now "streams tokens as they are generated, with real finish reasons and usage", which **may resolve this empty-output bug** at the source. Still HTTP-server-only (the FU-048 in-process-API root persists). **Action: re-test FU-079 against v1.0.1 with a live MTPLX run** (reinstall the mtplx venv → load an MTP model → confirm the chat stream surfaces per-token `{"token": …}` deltas). If v1.0.0's streaming fixed it, this row closes with no adapter change. **2026-06-15 release scan:** MTPLX now at **v1.0.4** (was v1.0.1). Installer remains unpinned so fresh installs pick up 1.0.4 automatically. Re-test action unchanged apart from the target version (now v1.0.4) — prioritise validating before the next release. | | ~~FU-074~~ | ~~GGUF MTP speculative decoding had no UI toggle~~ | **Shipped 2026-05-28.** | FU-047 wired the GGUF MTP backend (`--spec-type draft-mtp`, gated on the `speculativeDecoding` request flag in [llama_cpp_engine.py:531](backend_service/inference/llama_cpp_engine.py)) + the `ggufMtpAvailable` capability flag, but never surfaced a UI control. The launch modal's only spec-dec toggles are DFlash (hidden for GGUF — "not supported with llama.cpp models") and MTPLX (Apple-Silicon MLX only), so a user loading `ggml-org/Qwen3.6-27B-MTP-GGUF` had **no way to enable** the lane — only the matrix runner could, by POSTing `speculativeDecoding=true` directly. The button audit (this turn) caught it. Added an `isMtpGgufRepo(repo)` helper in [runtimeSupport.ts](src/components/runtimeSupport.ts) (mirrors backend `is_mtp_gguf_repo`: MTP-flavoured name on a GGUF repo) + a "GGUF MTP" toggle in [RuntimeControls.tsx](src/components/RuntimeControls.tsx), shown only when `isGgufBackend && isMtpGgufRepo(selectedCanonicalRepo)` (FU-034 hide-when-not-applicable).
It binds to the same `speculativeDecoding` flag the backend reads; no cache-strategy lock (GGUF KV cache is orthogonal to MTP draft decode, unlike MLX DFlash which forces native). Also patched the DFlash-availability reset effect (was clearing `speculativeDecoding` for any non-DFlash model — would have instantly un-ticked the GGUF-MTP box) to keep it on for `ggufMtpModelSupported`. Old binaries without `--spec-type` fall back to standard decode + a runtimeNote (backend FU-047 path) — acceptable since the bundled llama-server is current; a future refinement could additionally gate the toggle on the `ggufMtpAvailable` capability for old-binary boxes (needs the flag threaded through the ~8 RuntimeControls call sites). 8 new `isMtpGgufRepo` unit tests in [runtimeSupport.test.ts](src/components/__tests__/runtimeSupport.test.ts). Verified live: matrix `gguf MTP (Qwen3.6-27B)` cell PASS (sha 74a1eca8b3b4). | | ~~FU-073~~ | ~~Matrix MTPLX cell targeted a non-MTP VL model~~ | **Shipped 2026-05-28.** | `scripts/cache-strategy-matrix.py` `MID_MLX_MTPLX_CAPABLE` was `mlx-community/Qwen3.5-4B-bf16` — a VL conversion (ships `video_preprocessor_config.json`) that carries no MTP heads and is absent from both `MTP_MODEL_MAP` and `_MTP_ALIASES`, so the MTPLX cell could never have exercised MTP even with the model on disk (it'd fail the `has_mtp_heads_strict` tensor probe). Switched to the canonical `Qwen/Qwen3.5-4B`, which is a direct `MTP_MODEL_MAP` key (verified `mtp.layers.*` + `mtp.fc.weight` in its safetensors index), a catalog variant (so it passes the `library_refs` check), and downloaded to exercise the lane. Pairs with the FU-070 download-skip classifier so the cell reports honestly on boxes without the model. 
| | ~~FU-071~~ | ~~DDTree availability probe checks pre-0.1.5 symbol names~~ | **Shipped 2026-05-28.** | The cache-strategy matrix `ddtree spec-dec` cell skipped with *DDTree runtime not available* even though `dflash_mlx` 0.1.5.1 is installed and `backend_service/ddtree.py` works. Root cause: `dflash.is_ddtree_available()` ([dflash/__init__.py](dflash/__init__.py)) source-greps the installed `dflash_mlx.runtime` for three required symbols and the list was stale — it required `target_forward_with_hidden_states`, which dflash-mlx 0.1.5 **renamed** to the per-family adapter `target_ops.forward_with_hidden_capture` (the same FU-006 migration that rewrote our `ddtree.py` to call `resolve_target_ops(target_model)`). The probe was never updated alongside that rewrite, so it required a symbol that (a) no longer exists in any modern dflash-mlx build (`grep -c` = 0 in the installed `runtime.py`) and (b) our own code no longer uses. Confirmed the real contract our DDTree path imports: `resolve_target_ops` (ddtree.py adapter entry), `load_draft_bundle` (worker lifecycle), `stream_dflash_generate` (speculative). Updated `required_symbols` to those three; dropped the obsolete name + the unused `load_target_bundle`. `dflash.is_ddtree_available()` now returns `True` on this M4 Max box. 4 new `DDTreeAvailabilityProbeTests` in [tests/test_dflash.py](tests/test_dflash.py) mock the runtime source so a future rename can't silently regress the probe again. Note: when FU-057 bumps dflash-mlx to 0.1.7 (which removes `configure_full_attention_split` and reshapes `stream_dflash_generate`), this probe + the lifecycle import need re-checking in lockstep. 
| | ~~FU-070~~ | ~~Matrix runner: classify missing-download as SKIP, not FAIL~~ | **Shipped 2026-05-28.** | The full `scripts/cache-strategy-matrix.py` sweep on 2026-05-28 reported the `gguf MTP (Qwen3.6-27B)` cell as **FAIL** — `POST /api/models/load -> 500: Cannot load 'ggml-org/Qwen3.6-27B-MTP-GGUF': No .gguf, .safetensors, or pytorch weights found in HF cache entry.` Root cause: the repo had an empty `~/.cache/huggingface/hub/models--ggml-org--Qwen3.6-27B-MTP-GGUF/` dir (4.0 KB, only `refs/main`, dated May 16 — an interrupted pull), and the runner's `skip_reason` library check uses `caps.library_refs`, which is built from the **catalog** (every variant repo from `/api/workspace`), not from what's actually downloaded. So a catalogued-but-undownloaded model passes the library check and only errors at load — reported as a product FAIL when it's really a missing download (same false-positive class as FU-053). Fix: new pure helper `classify_load_skip(msg)` in [scripts/cache-strategy-matrix.py](scripts/cache-strategy-matrix.py) matches the backend's 'no weights found in HF cache entry' markers; `run_cell` now wraps the load call separately and converts that specific error into `skipped=True, skip_reason="weights not downloaded ()"` instead of a failure. Genuine load errors (OOM, etc.) still surface as fails. 4 unit tests in [tests/test_cache_strategy_matrix_runner.py](tests/test_cache_strategy_matrix_runner.py) (`ClassifyLoadSkipTests`) pin the classification. The dflash/mtplx cells already skipped correctly because their target models (`mlx-community/Qwen3-4B-bf16` / `Qwen3.5-4B-bf16`) aren't catalog variants so they never entered `library_refs`. **To actually exercise the GGUF-MTP lane (FU-047/FU-052 trip-wire), download `ggml-org/Qwen3.6-27B-MTP-GGUF` first**, then re-run full. 
| | ~~FU-069~~ | ~~Bump `turboquant-mlx-full` floor `>=0.4.0` → `>=0.5.0`~~ | **Shipped 2026-05-28.** | Upstream `turboquant-mlx-full` 0.5.0 on PyPI (FU-062 had just floored at 0.4.0 on 2026-05-25). v0.5.0 builds on the v0.4.0 expert-streaming path (FU-062) with **parallel expert prefetch** — the missing MoE experts for each layer are read on a thread pool (`--prefetch-workers`, default `8`) so SSD latency hides behind compute. Upstream-reported **~1.9× faster decode** at a tight cache budget, still bit-identical output. `--prefetch-workers 1` restores the serial v0.4.0 behaviour. No API change at our call surface — runtime still constructs `TurboQuantKVCache` with the same signature; the new flag is converter/runtime-side. Floor bump only in [pyproject.toml](pyproject.toml) `[turboquant]` extra; loose `>=` so existing 0.4.x installs stay satisfied locally. Apple Silicon only. Folds in the spirit of FU-066 (the matrix MoE-turboquant baseline should be captured against 0.5.0 once the wheel is installed on the M4 Max box). | | ~~FU-068~~ | ~~MLX probe timeout 12 s → 20 s~~ | **Shipped 2026-05-25 (v0.9.3).** | E2E full-sweep Phase 1 surfaced three intermittent fails on a freshly-booted backend — `MLX native cache` / `MLX TurboQuant cache` / `fused attention flag` all returned `MLX backend requested but unavailable: ...mlx_worker probe timed out after 12.0 seconds`. Measured cold-start: `time .venv/bin/python -m backend_service.mlx_worker probe` = **12.43 s** on M4 Max / Python 3.11 against current `mlx 0.31.2` + `mlx-lm 0.31.3` + `mlx-vlm 0.4.4` — 0.4 s past the 12.0 s ceiling. The 12.0 s value was an arbitrary default from the v0.8.0 `capabilities.py` extract (commit `f91709e`), never tuned. Bumped to **20.0 s** in [backend_service/inference/capabilities.py](backend_service/inference/capabilities.py) `_probe_native_backends` — ~60% headroom over today's envelope. 
Phase 5 video gen + Phase 1 GGUF / DFlash / cache-preview already passed (proves MLX itself works once the probe lands), so this was a pure cold-boot probe timing issue, not a regression from the FU-062 / FU-063 floor bumps (which are loose `>=`, no installed package changed). | -| FU-067 | Watch dflash-mlx for v0.1.8+ migration guide (FU-057 is multi-hour, deferred) | Trigger: (a) upstream publishes v0.1.8 with a stability commitment + migration guide, OR (b) we hit a concrete user-visible bug on the orphan `fada1eb` pin, OR (c) a shipped catalog model needs a v0.1.6+ feature (adaptive verify / Gemma4 backend / Qwen3-Next GDN). | Dup of FU-057's trigger but resurfaced after the v0.9.3 upstream scan confirmed v0.1.7 is now on PyPI (`pip install dflash-mlx==0.1.7` resolves) and tagged at commit `210a0fc1`. Plan-of-record stays FU-057's six-step migration. Re-checking quarterly via `git ls-remote --tags` for `v0.1.8` / `v0.2.0` release tags — if upstream publishes a migration guide alongside, the cost drops dramatically. **2026-06-11 release scan:** **v0.1.9** is now tagged (branch HEAD `7f884380`; tags `v0.1.5.1…v0.1.9`). Still no published migration guide, so FU-057's six-step rewrite stays the plan of record and remains deferred. Newest migration target is now v0.1.9 (was v0.1.7/v0.1.8). | +| FU-067 | Watch dflash-mlx for v0.1.8+ migration guide (FU-057 is multi-hour, deferred) | Trigger: (a) upstream publishes v0.1.8 with a stability commitment + migration guide, OR (b) we hit a concrete user-visible bug on the orphan `fada1eb` pin, OR (c) a shipped catalog model needs a v0.1.6+ feature (adaptive verify / Gemma4 backend / Qwen3-Next GDN). | Dup of FU-057's trigger but resurfaced after the v0.9.3 upstream scan confirmed v0.1.7 is now on PyPI (`pip install dflash-mlx==0.1.7` resolves) and tagged at commit `210a0fc1`. Plan-of-record stays FU-057's six-step migration. 
Re-checking quarterly via `git ls-remote --tags` for `v0.1.8` / `v0.2.0` release tags — if upstream publishes a migration guide alongside, the cost drops dramatically. **2026-06-11 release scan:** **v0.1.9** is now tagged (branch HEAD `7f884380`; tags `v0.1.5.1…v0.1.9`). Still no published migration guide, so FU-057's six-step rewrite stays the plan of record and remains deferred. Newest migration target is now v0.1.9 (was v0.1.7/v0.1.8). **2026-06-15 release scan:** **v0.1.10** now tagged (branch HEAD `9ca00289`). One more release since last scan; migration target advances to v0.1.10. No migration guide published. FU-057 deferred. | | ~~FU-061~~ | ~~"Watching upstream" badge + disabled download for tracked-only image seeds~~ | **Shipped 2026-05-18.** | User-reported gap: downloaded `baidu/ERNIE-Image-Turbo` from Image Discover (it sits in `LATEST_IMAGE_TRACKED_SEEDS`), expected it in the Studio dropdown, didn't appear. Root cause: tracked seeds are discovery-only — Studio's dropdown is fed by `IMAGE_MODEL_FAMILIES` which requires explicit pipeline routing (flow-match flags, sampler registry, scheduler defaults). ERNIE-Image (+ Nucleus-Image, Z-Image, HiDream, GLM-Image, FLUX.2 family) has no diffusers-routable Studio variant yet. Fix path A picked over path B (full per-family pipeline wiring) — surgical UX disambiguation. **Backend:** new `_is_launchable_image_repo(repo_id)` helper in [backend_service/helpers/images.py](backend_service/helpers/images.py) returns True only when `repo_id` resolves to a curated `IMAGE_MODEL_FAMILIES` variant. Wired into both payload sites — `_tracked_latest_seed_payloads` (line 411) + the live-HF lane (line 622) — so every Discover row carries `trackedOnly: bool`. **Frontend:** new `trackedOnly?: boolean` field on `ImageModelVariant` ([src/types/image.ts](src/types/image.ts)). [ImageDiscoverTab.tsx](src/features/images/ImageDiscoverTab.tsx) chip row gains a "Watching upstream" badge + tooltip when `trackedOnly`. 
Action column branches first on `trackedOnly` → renders a disabled `IconActionButton` with tooltip "Watching upstream — Studio playback for this family isn't wired yet. Catalog entry is for awareness; download won't unlock Studio." instead of the Generate / Download / Resume CTAs. Backward-compat: existing curated families have `trackedOnly: undefined` → falsy → no UX change. **Tests:** new `TrackedOnlyFlagTests` in [tests/test_image_discover.py](tests/test_image_discover.py) — 5 cases covering `_is_launchable_image_repo` (FLUX.1-dev + SDXL = true; ERNIE-Image / Nucleus-Image = false; empty = false), `trackedOnly: True` on ERNIE seed payload, and the negative case where a tracked seed that IS in IMAGE_MODEL_FAMILIES must NOT carry the flag (forward-compat for catalog evolution). **Follow-up path B (deferred):** wire ERNIE-Image / Nucleus-Image / Z-Image / HiDream / GLM-Image / FLUX.2 family as real launchable families via per-family pipeline detection in `image_runtime`. Multi-hour per family, gated on diffusers' upstream support landing for each architecture. | --- diff --git a/pyproject.toml b/pyproject.toml index 623203a..e9b3257 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,10 +38,10 @@ mlx-vlm = [ "mlx-vlm>=0.6.3", "torchvision>=0.20", ] -triattention = ["triattention @ git+https://github.com/WeianMao/triattention.git@c3744ee6a50522a1559a577f85aef2b165a344f2", "vllm>=0.22.1"] +triattention = ["triattention @ git+https://github.com/WeianMao/triattention.git@c3744ee6a50522a1559a577f85aef2b165a344f2", "vllm>=0.23.0"] triattention-mlx = ["triattention @ git+https://github.com/WeianMao/triattention.git@c3744ee6a50522a1559a577f85aef2b165a344f2", "mlx-lm>=0.22.0"] -turboquant = ["turboquant-mlx-full>=0.6.2"] -vllm = ["vllm>=0.22.1"] +turboquant = ["turboquant-mlx-full>=0.8.0"] +vllm = ["vllm>=0.23.0"] dflash-mlx = ["dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@fada1eb2b75cd1c875ca6547b6518783fd3d2956"] dflash = ["dflash>=0.1.0"] desktop = [