mtmd : remove libllava, remove clip-quantize-cli (⚠️ breaking change) #13460

Merged · 2 commits · May 13, 2025
35 changes: 0 additions & 35 deletions tools/mtmd/CMakeLists.txt
@@ -1,29 +1,3 @@
# llava (legacy)

add_library(llava OBJECT
llava.cpp
llava.h
clip.cpp
clip.h
)

target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})

target_include_directories(llava PUBLIC .)
target_include_directories(llava PUBLIC ../..)
target_include_directories(llava PUBLIC ../../common)

target_compile_features(llava PRIVATE cxx_std_17)

add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
if (BUILD_SHARED_LIBS)
set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS llava_shared LIBRARY)
endif()

# mtmd

add_library(mtmd OBJECT
@@ -53,12 +27,10 @@ if (BUILD_SHARED_LIBS)
endif()

if (NOT MSVC)
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
endif()

if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO)
add_dependencies(mtmd BUILD_INFO)
endif()

@@ -73,10 +45,3 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-llava-clip-quantize-cli)
add_executable(${TARGET} clip-quantize-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
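
With the llava targets and `clip-quantize-cli` removed, `llama-mtmd-cli` is the only tool still built from this file. A minimal build sketch, assuming the standard CMake workflow from the repository root and that the CMake target name matches the `OUTPUT_NAME` shown above:

```sh
# Configure and build only the multimodal CLI (sketch; target name assumed from OUTPUT_NAME above).
cmake -B build
cmake --build build --target llama-mtmd-cli

# Note: the llama-llava-clip-quantize-cli binary is no longer produced after this change.
```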
44 changes: 0 additions & 44 deletions tools/mtmd/README-quantize.md

This file was deleted.

7 changes: 4 additions & 3 deletions tools/mtmd/README.md
@@ -41,8 +41,8 @@ Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advanta

Multimodal projector (`mmproj`) files are specific to each model architecture.

For the following models, you can use `convert_hf_to_gguf.py`with `--mmproj` flag to get the `mmproj` file:
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support
For the following models, you can use `convert_hf_to_gguf.py` with `--mmproj` flag to get the `mmproj` file:
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) ; See the guide [here](../../docs/multimodal/gemma3.md) - Note: 1B variant does not have vision support
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
@@ -52,11 +52,12 @@ For the following models, you can use `convert_hf_to_gguf.py`with `--mmproj` fla

For older models, please refer to the relevant guide for instructions on how to obtain or create them:

NOTE: conversion scripts are located under `tools/mtmd/legacy-models`

- [LLaVA](../../docs/multimodal/llava.md)
- [MobileVLM](../../docs/multimodal/MobileVLM.md)
- [GLM-Edge](../../docs/multimodal/glmedge.md)
- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md)
- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md)
- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
- [IBM Granite Vision](../../docs/multimodal/granitevision.md)
- [Google Gemma 3](../../docs/multimodal/gemma3.md)
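
As an illustration of the README text above (not part of this diff), obtaining an `mmproj` file for a supported model could look like the following; the checkpoint path and output filename are placeholders, and flags other than `--mmproj` follow the script's usual usage:

```sh
# Hypothetical example: convert a local Hugging Face checkpoint of a supported model.
# --mmproj is described above; the path and output name are illustrative.
python convert_hf_to_gguf.py /path/to/model --mmproj --outfile mmproj-model.gguf
```

The resulting `mmproj` GGUF is then passed to `llama-mtmd-cli` alongside the main model file.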
53 changes: 0 additions & 53 deletions tools/mtmd/android/adb_run.sh

This file was deleted.

8 changes: 0 additions & 8 deletions tools/mtmd/android/build_64.sh

This file was deleted.

59 changes: 0 additions & 59 deletions tools/mtmd/clip-quantize-cli.cpp

This file was deleted.

135 changes: 0 additions & 135 deletions tools/mtmd/clip.cpp
@@ -3576,141 +3576,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
return true;
}

bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
assert(itype < GGML_TYPE_COUNT);
ggml_type type = static_cast<ggml_type>(itype);

auto * ctx_clip = clip_init(fname_inp, clip_context_params{
/* use_gpu */ false,
/* verbosity */ GGML_LOG_LEVEL_ERROR,
});

const auto & ctx_src = ctx_clip->ctx_gguf.get();
const auto & ctx_data = ctx_clip->ctx_data.get();

auto * ctx_out = gguf_init_empty();
gguf_set_kv(ctx_out, ctx_src);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", itype);

auto fout = std::ofstream(fname_out, std::ios::binary);

const int n_tensors = gguf_get_n_tensors(ctx_src);

for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx_src, i);
ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
gguf_add_tensor(ctx_out, cur);
}

const size_t meta_size = gguf_get_meta_size(ctx_out);
for (size_t i = 0; i < meta_size; ++i) {
fout.put(0);
}

// regexes of tensor names to be quantized
const std::vector<std::string> k_names = {
".*weight",
};

std::vector<uint8_t> work(512);
std::vector<float> conv_buf(512);
size_t total_size_org = 0;
size_t total_size_new = 0;

for (int i = 0; i < n_tensors; ++i) {
const std::string name = gguf_get_tensor_name(ctx_src, i);
ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());

enum ggml_type new_type;
void * new_data;
size_t new_size;

bool quantize = false;
for (const auto & s : k_names) {
if (std::regex_match(name, std::regex(s))) {
quantize = true;
break;
}
}

// quantize only 2D tensors and bigger than block size
quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);

if (quantize) {
new_type = type;
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
// LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
}
const size_t n_elms = ggml_nelements(cur);
float * f32_data;

switch (cur->type) {
case GGML_TYPE_F32:
f32_data = (float *)cur->data;
break;
case GGML_TYPE_F16:
if (conv_buf.size() < n_elms) {
conv_buf.resize(n_elms);
}
for (size_t j = 0; j < n_elms; ++j) {
conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
}
f32_data = (float *)conv_buf.data();
break;
default:
LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__);
gguf_free(ctx_out);
return false;
}

if (work.size() < n_elms * 4) {
work.resize(n_elms * 4);
}
new_data = work.data();

new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
} else {
new_type = cur->type;
new_data = cur->data;
new_size = ggml_nbytes(cur);
}
const size_t orig_size = ggml_nbytes(cur);
total_size_org += orig_size;
total_size_new += new_size;
gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
fout.write((const char *)new_data, new_size);
size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
for (size_t j = 0; j < pad; ++j) {
fout.put(0);
}

LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
}

// go back to beginning of file and write the updated metadata
fout.seekp(0, std::ios::beg);
std::vector<uint8_t> meta(meta_size);
gguf_get_meta_data(ctx_out, meta.data());
fout.write((const char *)meta.data(), meta_size);

fout.close();

clip_free(ctx_clip);
gguf_free(ctx_out);

{
LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
}

return true;
}

int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
switch (ctx->proj_type) {
case PROJECTOR_TYPE_LDP:
File renamed without changes.