From 1f5226b4882542545bec204d81f04171059566ee Mon Sep 17 00:00:00 2001 From: Alcoft Date: Mon, 4 May 2026 20:58:58 +0200 Subject: [PATCH 1/7] Implemented generic multimodal chat handler. --- llama_cpp/llama.py | 12 +++++++++ llama_cpp/llama_chat_format.py | 49 +++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1241f81e2..848706a90 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -85,6 +85,7 @@ class Llama: def __init__( self, model_path: str, + clip_model_path: Optional[str] = None, *, # Model Params n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", @@ -608,6 +609,17 @@ def __init__( if self.verbose: print(f"Model metadata: {self.metadata}", file=sys.stderr) + + if clip_model_path is not None: + if self.chat_handler is not None and self.verbose: + print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True) + + self.chat_handler = llama_chat_format.GenericMTMDChatHandler( + gguf_metadata = self.metadata, + clip_model_path = clip_model_path, + model_arch = None, + verbose = self.verbose + ) eos_token_id = self.token_eos() bos_token_id = self.token_bos() diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a0d8d25db..468a73c07 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2887,10 +2887,14 @@ def __init__( raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") # Pre-compile Jinja template + if not hasattr(self, "chat_format") or self.chat_format is None: + self.chat_format = self.CHAT_FORMAT + + self._chat_format_parser_tags = [] self.chat_template = ImmutableSandboxedEnvironment( trim_blocks=True, lstrip_blocks=True, - ).from_string(self.CHAT_FORMAT) + ).from_string(self.chat_format) self._exit_stack = ExitStack() @@ -3116,6 +3120,13 @@ def _process_mtmd_prompt( tool_choice=tool_choice, **getattr(self, 'extra_template_arguments', {}) ) + + for tag in self._chat_format_parser_tags: + if tag not in text: + continue + + text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):] + # Replace image_url by media_marker in text for item in media_items: text = text.replace(item["url"], media_marker) @@ -3827,6 +3838,42 @@ def from_pretrained( **kwargs, ) +class GenericMTMDChatHandler(MTMDChatHandler): + def __init__( + self, + gguf_metadata: Dict[str, Any], + clip_model_path: str, + model_arch: Optional[str] = None, + verbose: bool = True, + **kwargs + ) -> None: + self.model_metadata = gguf_metadata + + self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) + self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch + + if verbose: + print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) + + if self.arch is None: + if verbose: + print("Unknown model architecture. Will use general/most-common tags.") + + self.arch = "unknown" + + if self.chat_format is None: + raise ValueError("Failed to get model chat template automatically.") + + super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) + + if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]: + self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"] + elif self.arch in ["gemma4"]: + self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"] + elif self.arch in ["mistral3", "mistral4", "deepseek2"]: + self._chat_format_parser_tags += ["[IMG]"] + elif verbose: + print("Warning: Could not determine chat format parser tags.", flush = True) class Llava15ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From a8d19d3bbd18890693576b1f5ed6cd0b2d487eab Mon Sep 17 00:00:00 2001 From: Alcoft Date: Mon, 4 May 2026 21:19:20 +0200 Subject: [PATCH 2/7] Used text.replace() --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 468a73c07..ab5e438d3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3125,7 +3125,7 @@ def _process_mtmd_prompt( if tag not in text: continue - text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):] + text = text.replace(tag, media_marker) # Replace image_url by media_marker in text for item in media_items: From 3e031d5de16d5bd81dd35ef2cc3b8e2d49fac063 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Tue, 5 May 2026 17:46:08 +0200 Subject: [PATCH 3/7] Fixed some bugs. --- llama_cpp/llama_chat_format.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ab5e438d3..40491968a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3874,6 +3874,18 @@ def __init__( self._chat_format_parser_tags += ["[IMG]"] elif verbose: print("Warning: Could not determine chat format parser tags.", flush = True) + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) class Llava15ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From 389d0d97babca3edcf6fb74f476e306a21183b5f Mon Sep 17 00:00:00 2001 From: Alcoft Date: Tue, 5 May 2026 18:49:21 +0200 Subject: [PATCH 4/7] Implemented 'chat_handler_kwargs'. --- llama_cpp/llama.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 848706a90..6dab44602 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -152,6 +152,7 @@ def __init__( spm_infill: bool = False, verbose: bool = True, # Extra Params + chat_handler_kwargs: Dict[str, Any] = {}, **kwargs, # type: ignore ): """Load a llama.cpp model from `model_path`. @@ -618,7 +619,8 @@ def __init__( gguf_metadata = self.metadata, clip_model_path = clip_model_path, model_arch = None, - verbose = self.verbose + verbose = self.verbose, + **chat_handler_kwargs ) eos_token_id = self.token_eos() From 9187910e35e6f4d063f33364a10812727a05e58d Mon Sep 17 00:00:00 2001 From: Alcoft Date: Sat, 16 May 2026 06:41:17 +0200 Subject: [PATCH 5/7] fix --- llama_cpp/llama.py | 1 - llama_cpp/llama_chat_format.py | 33 +++++++++++---------------------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6dab44602..7666b822a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -618,7 +618,6 @@ def __init__( self.chat_handler = llama_chat_format.GenericMTMDChatHandler( gguf_metadata = self.metadata, clip_model_path = clip_model_path, - model_arch = None, verbose = self.verbose, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 40491968a..0be38a19d 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3839,47 +3839,36 @@ def from_pretrained( ) class GenericMTMDChatHandler(MTMDChatHandler): + KNOWN_MEDIA_TAGS = [ + "<|image_pad|>", + "<|audio_pad|>", + "<|video_pad|>", + "<|image|>", + "<|audio|>", + "<|video|>", + "[IMG]" + ] + def __init__( self, gguf_metadata: Dict[str, Any], clip_model_path: str, - model_arch: Optional[str] = None, verbose: bool = True, **kwargs ) -> None: self.model_metadata = gguf_metadata - self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) - self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch if verbose: print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) - - if self.arch is None: - if verbose: - print("Unknown model architecture. Will use general/most-common tags.") - - self.arch = "unknown" if self.chat_format is None: raise ValueError("Failed to get model chat template automatically.") super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) - - if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]: - self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"] - elif self.arch in ["gemma4"]: - self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"] - elif self.arch in ["mistral3", "mistral4", "deepseek2"]: - self._chat_format_parser_tags += ["[IMG]"] - elif verbose: - print("Warning: Could not determine chat format parser tags.", flush = True) def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) + self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] if self.verbose: print(f"{self.log_prefix} - Start processing") From b48d57a2b4019bbd248c848eefa1442c9e7890cb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 18 May 2026 21:43:37 +0800 Subject: [PATCH 6/7] Update Submodule vendor/llama.cpp 39cf5d6..6db1304 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 39cf5d619..6db130445 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee +Subproject commit 6db130445d29b243ee2171efb8cd61b84a1c5322 From f309265b0df3ab2477682db3a959656dcb6d06e6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 19:36:28 +0800 Subject: [PATCH 7/7] build(ci+cu131): bundle LLVM OpenMP runtime for Windows CPU backends - Add a PowerShell step to the Windows CI workflow to locate and copy `libomp140.x86_64.dll` from the Visual Studio redistributables. - Place the runtime DLL into the `llama_cpp\lib` package directory. This ensures that the dynamically loaded `ggml-cpu-*.dll` variants (which are built with LLVM OpenMP on Windows) have their required dependencies packaged in the wheel. Without this, `ggml_backend_load_all_from_path()` can silently fail to load the CPU backends at runtime on end-user machines. Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu131-win.yml | 25 ++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml index 14bea65d1..5f77003a5 100644 --- a/.github/workflows/build-wheels-cu131-win.yml +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -67,6 +67,31 @@ jobs: echo LIB=%LIB%>>%GITHUB_ENV% echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + - name: Copy LLVM OpenMP runtime + shell: pwsh + run: | + # GGML CPU all-variant backends are built with LLVM OpenMP on Windows. + # The dynamically loaded ggml-cpu-*.dll files depend on this runtime. + # If it is missing from the wheel, ggml_backend_load_all_from_path() + # may fail to load CPU backend DLLs at runtime. + $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib" + New-Item -ItemType Directory -Force $packageLibDir | Out-Null + + $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" ` + -Recurse ` + -Filter "libomp140.x86_64.dll" ` + -ErrorAction SilentlyContinue | + Where-Object { $_.FullName -match "OpenMP\.LLVM" } | + Select-Object -First 1 + + if (!$omp) { + Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables." + exit 1 + } + + Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force + Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)" + - name: Build wheel run: | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '')