Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ help:
@echo " llava-cpu - Build Llava runner with CPU backend"
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend"
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner + OpenAI serving worker with CUDA backend"
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend"
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner + OpenAI serving worker (CUDA)"
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
Expand Down Expand Up @@ -444,11 +444,13 @@ qwen3_5_moe-cuda:
gemma4_31b-cuda:
@echo "==> Building and installing ExecuTorch with CUDA..."
cmake --workflow --preset llm-release-cuda
@echo "==> Building Gemma 4 31B runner with CUDA..."
@echo "==> Building Gemma 4 31B runner + serving worker with CUDA..."
cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
@echo " Serving worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
@echo " Launch: see examples/models/gemma4_31b/README.md (Serving)"

gemma4_31b-mlx:
@echo "==> Building and installing ExecuTorch with MLX..."
Expand Down
28 changes: 26 additions & 2 deletions examples/models/gemma4_31b/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

set(_common_include_directories ${EXECUTORCH_ROOT}/..)
set(_json_include
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
)

# gflags
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
Expand Down Expand Up @@ -58,9 +61,13 @@ endif()
# Tokenizer (HuggingFace tokenizer.json)
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(gemma4_31b_runner main.cpp)
# Runner executable. When the CUDA backend is enabled the runner additionally
# compiles gemma4_31b_engine.cpp; every other configuration builds from
# main.cpp alone.
if(EXECUTORCH_BUILD_CUDA)
add_executable(gemma4_31b_runner main.cpp gemma4_31b_engine.cpp)
else()
add_executable(gemma4_31b_runner main.cpp)
endif()
target_include_directories(
gemma4_31b_runner PUBLIC ${_common_include_directories}
gemma4_31b_runner PUBLIC ${_common_include_directories} ${_json_include}
)
target_link_libraries(gemma4_31b_runner PUBLIC ${link_libraries})

Expand All @@ -71,6 +78,23 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
endif()
endif()

# Serving worker executable. It is only defined when the CUDA backend is
# enabled, and is built from gemma4_31b_worker.cpp plus gemma4_31b_engine.cpp.
if(EXECUTORCH_BUILD_CUDA)
add_executable(
gemma4_31b_worker gemma4_31b_worker.cpp gemma4_31b_engine.cpp
)
# Same include roots as the runner: the common include directories plus the
# single-header JSON include path held in _json_include.
target_include_directories(
gemma4_31b_worker PUBLIC ${_common_include_directories} ${_json_include}
)
target_link_libraries(gemma4_31b_worker PUBLIC ${link_libraries})

# Size-reduction link options are skipped for Debug builds.
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
# Project helper (presumably from tools/cmake/Utils.cmake, included above --
# TODO confirm); by its name it enables linker section garbage collection.
target_link_options_gc_sections(gemma4_31b_worker)
# Pass -s to the linker (strips symbols on GNU-style linkers); not applied
# on Apple or MSVC toolchains.
if(NOT APPLE AND NOT MSVC)
target_link_options(gemma4_31b_worker PRIVATE "LINKER:-s")
endif()
endif()
endif()

if(TARGET mlxdelegate)
executorch_target_copy_mlx_metallib(gemma4_31b_runner)
endif()
8 changes: 4 additions & 4 deletions examples/models/gemma4_31b/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
},
{
"name": "gemma4-31b-cuda",
"displayName": "Gemma 4 31B runner (CUDA)",
"displayName": "Gemma 4 31B runner + serving worker (CUDA)",
"inherits": ["gemma4-31b-base"],
"cacheVariables": {
"EXECUTORCH_BUILD_CUDA": "ON"
Expand All @@ -39,9 +39,9 @@
"buildPresets": [
{
"name": "gemma4-31b-cuda",
"displayName": "Build Gemma 4 31B runner (CUDA)",
"displayName": "Build Gemma 4 31B runner + serving worker (CUDA)",
"configurePreset": "gemma4-31b-cuda",
"targets": ["gemma4_31b_runner"]
"targets": ["gemma4_31b_runner", "gemma4_31b_worker"]
},
{
"name": "gemma4-31b-mlx",
Expand All @@ -53,7 +53,7 @@
"workflowPresets": [
{
"name": "gemma4-31b-cuda",
"displayName": "Configure and build Gemma 4 31B runner (CUDA)",
"displayName": "Configure and build Gemma 4 31B runner + serving worker (CUDA)",
"steps": [
{
"type": "configure",
Expand Down
33 changes: 30 additions & 3 deletions examples/models/gemma4_31b/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,12 @@ model produces sensible text.
## Build the runner

```bash
make gemma4_31b-cuda # Linux — CUDA backend
make gemma4_31b-mlx # macOS — MLX backend (Apple Silicon)
make gemma4_31b-cuda # Linux — CUDA runner + serving worker
make gemma4_31b-mlx # macOS — MLX runner (serving later)
```

The binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`.
The CUDA build also produces
`cmake-out/examples/models/gemma4_31b/gemma4_31b_worker`.

## Run the .pte

Expand All @@ -162,3 +163,29 @@ Pass `--raw_prompt` to skip template wrapping for pre-formatted input.

For benchmarking, add `--cuda_graph` to capture the decode method in a CUDA
graph (decode is fully static — `T=1`).

## Serving

The CUDA OpenAI-compatible server is a Python control plane plus a C++ model worker.
The worker owns the ExecuTorch model and speaks the shared JSONL protocol used by
the generic LLM server.

```bash
LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
python -m executorch.examples.models.gemma4_31b.serve \
--model-path ./gemma4_31b_exports/model.pte \
--data-path ./gemma4_31b_exports/aoti_cuda_blob.ptd \
--tokenizer-path ./gemma4_31b_int4/tokenizer.json \
--hf-tokenizer ./gemma4_31b_int4 \
--model-id gemma4-31b \
--max-sessions 1
```

The launcher defaults to the Hermes `<tool_call>{...}</tool_call>` parser. Use
`--tool-parser qwen` or `--tool-parser none` if the model/template you are
testing emits a different tool-call format.

Named sessions and warm resume require a worker capacity greater than one. CUDA
exports that include the `get_mutable_buffer_metadata` metadata entry can use
per-session mutable-buffer rebinding and therefore support `--max-sessions > 1`;
older exports without it fail closed to a single scratch session. MLX serving is
intentionally left for a later change.
9 changes: 9 additions & 0 deletions examples/models/gemma4_31b/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"""

import argparse
import json
import os

import torch
Expand Down Expand Up @@ -135,6 +136,11 @@ def _pack_for_backend(model: nn.Module, path: str, backend: str) -> None:
# Export + lower


def _mutable_buffer_metadata(model: nn.Module) -> str:
mutable = [name for name, _ in model.named_buffers() if ".kv_cache." in name]
return json.dumps({"version": 1, "mutable_buffers": mutable})


def export_and_lower(
model: Gemma4_31B,
config: Gemma4_31BConfig,
Expand Down Expand Up @@ -181,6 +187,7 @@ def _export_cuda(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) -
import executorch.backends.cuda.quantize_op_dispatch # noqa: F401

materialize_runtime_buffers(model, dtype=torch.bfloat16)
mutable_buffer_metadata = _mutable_buffer_metadata(model)

# Int4Tensor weights are used directly — no format conversion.
# F.linear dispatches to executorch_cuda::int4_plain_mm (CUDA shim).
Expand Down Expand Up @@ -248,6 +255,8 @@ def _export_cuda(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) -
"get_vocab_size": config.vocab_size,
"get_n_layers": config.num_hidden_layers,
"get_max_prefill_chunk": max_prefill,
"get_min_prefill_chunk": 5,
"get_mutable_buffer_metadata": mutable_buffer_metadata,
"use_kv_cache": True,
"use_sdpa_with_kv_cache": False,
"enable_dynamic_shape": True,
Expand Down
Loading
Loading