From 1f5226b4882542545bec204d81f04171059566ee Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Mon, 4 May 2026 20:58:58 +0200
Subject: [PATCH 1/7] Implemented generic multimodal chat handler.

---
 llama_cpp/llama.py             | 12 +++++++++
 llama_cpp/llama_chat_format.py | 49 +++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 1241f81e2..848706a90 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -85,6 +85,7 @@ class Llama:
     def __init__(
         self,
         model_path: str,
+        clip_model_path: Optional[str] = None,
         *,
         # Model Params
         n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto",
@@ -608,6 +609,17 @@ def __init__(
 
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
+        
+        if clip_model_path is not None:
+            if self.chat_handler is not None and self.verbose:  # an explicit handler is about to be replaced
+                print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", file=sys.stderr, flush=True)
+
+            self.chat_handler = llama_chat_format.GenericMTMDChatHandler(
+                gguf_metadata = self.metadata,
+                clip_model_path = clip_model_path,
+                model_arch = None,
+                verbose = self.verbose
+            )
 
         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index a0d8d25db..468a73c07 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -2887,10 +2887,14 @@ def __init__(
             raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}")
 
         # Pre-compile Jinja template
+        if not hasattr(self, "chat_format") or self.chat_format is None:
+            self.chat_format = self.CHAT_FORMAT
+
+        self._chat_format_parser_tags = []
         self.chat_template = ImmutableSandboxedEnvironment(
             trim_blocks=True,
             lstrip_blocks=True,
-        ).from_string(self.CHAT_FORMAT)
+        ).from_string(self.chat_format)
 
         self._exit_stack = ExitStack()
 
@@ -3116,6 +3120,13 @@ def _process_mtmd_prompt(
             tool_choice=tool_choice,
             **getattr(self, 'extra_template_arguments', {})
         )
+        
+        for tag in self._chat_format_parser_tags:
+            if tag not in text:
+                continue
+
+            text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):]
+
         # Replace image_url by media_marker in text
         for item in media_items:
             text = text.replace(item["url"], media_marker)
@@ -3827,6 +3838,42 @@ def from_pretrained(
             **kwargs,
         )
 
+class GenericMTMDChatHandler(MTMDChatHandler):
+    def __init__(
+        self,
+        gguf_metadata: Dict[str, Any],
+        clip_model_path: str,
+        model_arch: Optional[str] = None,
+        verbose: bool = True,
+        **kwargs
+    ) -> None:
+        self.model_metadata = gguf_metadata
+
+        self.chat_format = self.model_metadata.get("tokenizer.chat_template", None)
+        self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch
+
+        if verbose:
+            print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)
+        
+        if self.arch is None:
+            if verbose:
+                print("Unknown model architecture. Will use general/most-common tags.")
+            
+            self.arch = "unknown"
+
+        if self.chat_format is None:
+            raise ValueError("Failed to get model chat template automatically.")
+        
+        super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs)
+        
+        if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]:
+            self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"]
+        elif self.arch in ["gemma4"]:
+            self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"]
+        elif self.arch in ["mistral3", "mistral4", "deepseek2"]:
+            self._chat_format_parser_tags += ["[IMG]"]
+        elif verbose:
+            print("Warning: Could not determine chat format parser tags.", flush = True)
 
 class Llava15ChatHandler(MTMDChatHandler):
     CHAT_FORMAT = (

From a8d19d3bbd18890693576b1f5ed6cd0b2d487eab Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Mon, 4 May 2026 21:19:20 +0200
Subject: [PATCH 2/7] Used text.replace()

---
 llama_cpp/llama_chat_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 468a73c07..ab5e438d3 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3125,7 +3125,7 @@ def _process_mtmd_prompt(
             if tag not in text:
                 continue
 
-            text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):]
+            text = text.replace(tag, media_marker)
 
         # Replace image_url by media_marker in text
         for item in media_items:

From 3e031d5de16d5bd81dd35ef2cc3b8e2d49fac063 Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Tue, 5 May 2026 17:46:08 +0200
Subject: [PATCH 3/7] Zero llama.input_ids before each generic MTMD chat call.

---
 llama_cpp/llama_chat_format.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index ab5e438d3..40491968a 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3874,6 +3874,18 @@ def __init__(
             self._chat_format_parser_tags += ["[IMG]"]
         elif verbose:
             print("Warning: Could not determine chat format parser tags.", flush = True)
+    
+    def __call__(self, **kwargs):
+        llama = kwargs['llama']
+
+        if hasattr(llama, 'input_ids'):
+            llama.input_ids.fill(0)
+
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing")
+
+        # Use parent implementation
+        return super().__call__(**kwargs)
 
 class Llava15ChatHandler(MTMDChatHandler):
     CHAT_FORMAT = (

From 389d0d97babca3edcf6fb74f476e306a21183b5f Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Tue, 5 May 2026 18:49:21 +0200
Subject: [PATCH 4/7] Implemented 'chat_handler_kwargs'.

---
 llama_cpp/llama.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 848706a90..6dab44602 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -152,6 +152,7 @@ def __init__(
         spm_infill: bool = False,
         verbose: bool = True,
         # Extra Params
+        chat_handler_kwargs: Dict[str, Any] = {},
         **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
@@ -618,7 +619,8 @@ def __init__(
                 gguf_metadata = self.metadata,
                 clip_model_path = clip_model_path,
                 model_arch = None,
-                verbose = self.verbose
+                verbose = self.verbose,
+                **chat_handler_kwargs
             )
 
         eos_token_id = self.token_eos()

From 9187910e35e6f4d063f33364a10812727a05e58d Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Sat, 16 May 2026 06:41:17 +0200
Subject: [PATCH 5/7] Detect media tags from the chat template, not the model arch.

---
 llama_cpp/llama.py             |  1 -
 llama_cpp/llama_chat_format.py | 33 +++++++++++----------------------
 2 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 6dab44602..7666b822a 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -618,7 +618,6 @@ def __init__(
             self.chat_handler = llama_chat_format.GenericMTMDChatHandler(
                 gguf_metadata = self.metadata,
                 clip_model_path = clip_model_path,
-                model_arch = None,
                 verbose = self.verbose,
                 **chat_handler_kwargs
             )
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 40491968a..0be38a19d 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3839,47 +3839,36 @@ def from_pretrained(
         )
 
 class GenericMTMDChatHandler(MTMDChatHandler):
+    KNOWN_MEDIA_TAGS = [
+        "<|image_pad|>",
+        "<|audio_pad|>",
+        "<|video_pad|>",
+        "<|image|>",
+        "<|audio|>",
+        "<|video|>",
+        "[IMG]"
+    ]
+
     def __init__(
         self,
         gguf_metadata: Dict[str, Any],
         clip_model_path: str,
-        model_arch: Optional[str] = None,
         verbose: bool = True,
         **kwargs
     ) -> None:
         self.model_metadata = gguf_metadata
-
         self.chat_format = self.model_metadata.get("tokenizer.chat_template", None)
-        self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch
 
         if verbose:
             print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)
-        
-        if self.arch is None:
-            if verbose:
-                print("Unknown model architecture. Will use general/most-common tags.")
-            
-            self.arch = "unknown"
 
         if self.chat_format is None:
             raise ValueError("Failed to get model chat template automatically.")
         
         super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs)
-        
-        if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]:
-            self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"]
-        elif self.arch in ["gemma4"]:
-            self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"]
-        elif self.arch in ["mistral3", "mistral4", "deepseek2"]:
-            self._chat_format_parser_tags += ["[IMG]"]
-        elif verbose:
-            print("Warning: Could not determine chat format parser tags.", flush = True)
     
     def __call__(self, **kwargs):
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
+        self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format]
 
         if self.verbose:
             print(f"{self.log_prefix} - Start processing")

From b48d57a2b4019bbd248c848eefa1442c9e7890cb Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Mon, 18 May 2026 21:43:37 +0800
Subject: [PATCH 6/7] Update Submodule vendor/llama.cpp 39cf5d6..6db1304

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 39cf5d619..6db130445 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee
+Subproject commit 6db130445d29b243ee2171efb8cd61b84a1c5322

From f309265b0df3ab2477682db3a959656dcb6d06e6 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Tue, 19 May 2026 19:36:28 +0800
Subject: [PATCH 7/7] build(ci+cu131): bundle LLVM OpenMP runtime for Windows
 CPU backends

- Add a PowerShell step to the Windows CI workflow to locate and copy
  `libomp140.x86_64.dll` from the Visual Studio redistributables.
- Place the runtime DLL into the `llama_cpp\lib` package directory.

This ensures that the dynamically loaded `ggml-cpu-*.dll` variants
(which are built with LLVM OpenMP on Windows) have their required
dependencies packaged in the wheel. Without this,
`ggml_backend_load_all_from_path()` can silently fail to load the CPU
backends at runtime on end-user machines.

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 .github/workflows/build-wheels-cu131-win.yml | 25 ++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml
index 14bea65d1..5f77003a5 100644
--- a/.github/workflows/build-wheels-cu131-win.yml
+++ b/.github/workflows/build-wheels-cu131-win.yml
@@ -67,6 +67,31 @@ jobs:
           echo LIB=%LIB%>>%GITHUB_ENV%
           echo LIBPATH=%LIBPATH%>>%GITHUB_ENV%
 
+      - name: Copy LLVM OpenMP runtime
+        shell: pwsh
+        run: |
+          # GGML CPU all-variant backends are built with LLVM OpenMP on Windows.
+          # The dynamically loaded ggml-cpu-*.dll files depend on this runtime.
+          # If it is missing from the wheel, ggml_backend_load_all_from_path()
+          # may fail to load CPU backend DLLs at runtime.
+          $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib"
+          New-Item -ItemType Directory -Force $packageLibDir | Out-Null
+
+          $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" `
+            -Recurse `
+            -Filter "libomp140.x86_64.dll" `
+            -ErrorAction SilentlyContinue |
+            Where-Object { $_.FullName -match "OpenMP\.LLVM" } |
+            Select-Object -First 1
+
+          if (!$omp) {
+            Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables."
+            exit 1
+          }
+
+          Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force
+          Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)"
+
       - name: Build wheel
         run: |
           $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '')