
llama: Add configuration presets for chat and reranking servers #13462


Status: Open · wants to merge 2 commits into master
83 changes: 83 additions & 0 deletions common/arg.cpp
@@ -3325,5 +3325,88 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--chat-llama3-8b-default"},
        string_format("use default Llama3 8B model for chat server (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Llama-3-8B-Q8_0-GGUF";
            params.model.hf_file = "llama-3-8b-q8_0.gguf";
            params.port = 8080;
            params.n_gpu_layers = 99;
            params.flash_attn = true;
            params.n_ubatch = 512;
            params.n_batch = 512;
            params.n_ctx = 4096;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--rerank-bge-default"},
        string_format("use default BGE reranker model for reranking server (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/bge-reranker-base-Q8_0-GGUF";
            params.model.hf_file = "bge-reranker-base-q8_0.gguf";
            params.port = 8090;
            params.n_gpu_layers = 99;
            params.flash_attn = true;
            params.n_ctx = 512;
            params.reranking = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-server-qwen-1.5b"},
        string_format("use Qwen 2.5 Coder 1.5B model for a FIM server (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
            params.port = 8012;
            params.n_gpu_layers = 99;
            params.flash_attn = true;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--embedding-server-bge"},
        string_format("use BGE Small EN model for an embedding server (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
            params.port = 8033;
            params.n_gpu_layers = 99;
            params.flash_attn = true;
            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
            params.embd_normalize = 2;
            params.n_ctx = 512;
            params.embedding = true;
            params.n_batch = 512;
            params.n_ubatch = 512;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--spec-server-qwen-7b"},
        string_format("use Qwen2.5 Coder 7B with 0.5B draft for speculative decoding (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
            params.speculative.n_gpu_layers = 99;
            params.port = 8080;
            params.n_gpu_layers = 99;
            params.flash_attn = true;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    return ctx_arg;
}
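
Each preset above is a single flag that expands into a full server configuration, downloading the listed GGUF weights from Hugging Face on first use. A usage sketch, assuming the binary has been built at build/bin/llama-server (the same path the verification script below uses):

# Chat server (Llama 3 8B) on port 8080:
./build/bin/llama-server --chat-llama3-8b-default

# Reranking server (bge-reranker-base) on port 8090:
./build/bin/llama-server --rerank-bge-default

# FIM / code-completion server (Qwen 2.5 Coder 1.5B) on port 8012:
./build/bin/llama-server --fim-server-qwen-1.5b

# Embedding server (bge-small-en-v1.5) on port 8033:
./build/bin/llama-server --embedding-server-bge

# Speculative decoding server (Qwen 2.5 Coder 7B + 0.5B draft) on port 8080:
./build/bin/llama-server --spec-server-qwen-7b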
84 changes: 84 additions & 0 deletions verify-presets.sh
@@ -0,0 +1,84 @@
#!/bin/bash

set -e

# Function to check that a preset flag is registered in the help output.
# Parameter values can't be verified from --help alone; the expected value
# is echoed for manual inspection.
check_param() {
    local preset=$1
    local param=$2
    local expected_value=$3

    echo "Checking $param for preset $preset"
    ./build/bin/llama-server --help | grep -E "$preset" > /dev/null && echo " Preset exists: YES" || echo " Preset exists: NO"

    # We can't directly check the values without running the server, but we can check that the preset flag exists
    echo " Parameter $param should be set to $expected_value"
}

echo "Verifying chat-llama3-8b-default preset:"
preset="chat-llama3-8b-default"
check_param "$preset" "port" "8080"
check_param "$preset" "gpu-layers" "99"
check_param "$preset" "flash-attn" "true"
check_param "$preset" "ubatch-size" "512"
check_param "$preset" "batch-size" "512"
check_param "$preset" "ctx-size" "4096"
check_param "$preset" "cache-reuse" "256"

echo -e "\nVerifying rerank-bge-default preset:"
preset="rerank-bge-default"
check_param "$preset" "port" "8090"
check_param "$preset" "gpu-layers" "99"
check_param "$preset" "flash-attn" "true"
check_param "$preset" "ctx-size" "512"
check_param "$preset" "reranking" "true"

echo -e "\nVerifying fim-server-qwen-1.5b preset:"
preset="fim-server-qwen-1.5b"
check_param "$preset" "port" "8012"
check_param "$preset" "gpu-layers" "99"
check_param "$preset" "flash-attn" "true"
check_param "$preset" "ubatch-size" "1024"
check_param "$preset" "batch-size" "1024"
check_param "$preset" "cache-reuse" "256"

echo -e "\nVerifying embedding-server-bge preset:"
preset="embedding-server-bge"
check_param "$preset" "port" "8033"
check_param "$preset" "gpu-layers" "99"
check_param "$preset" "flash-attn" "true"
check_param "$preset" "ctx-size" "512"
check_param "$preset" "embedding" "true"
check_param "$preset" "pooling" "none"

echo -e "\nVerifying spec-server-qwen-7b preset:"
preset="spec-server-qwen-7b"
check_param "$preset" "port" "8080"
check_param "$preset" "gpu-layers" "99"
check_param "$preset" "flash-attn" "true"
check_param "$preset" "ubatch-size" "1024"
check_param "$preset" "batch-size" "1024"
check_param "$preset" "cache-reuse" "256"
check_param "$preset" "model-draft" "set to a draft model"

echo -e "\nExamining preset code in common/arg.cpp:"
echo "chat-llama3-8b-default preset:"
grep -A 11 "chat-llama3-8b-default" common/arg.cpp

echo -e "\nrerank-bge-default preset:"
grep -A 9 "rerank-bge-default" common/arg.cpp

echo -e "\nfim-server-qwen-1.5b preset:"
grep -A 11 "fim-server-qwen-1.5b" common/arg.cpp

echo -e "\nembedding-server-bge preset:"
grep -A 12 "embedding-server-bge" common/arg.cpp

echo -e "\nspec-server-qwen-7b preset:"
grep -A 15 "spec-server-qwen-7b" common/arg.cpp

# Run the tests for arg-parser
echo -e "\nRunning the arg-parser tests to verify presets do not break existing functionality:"
cd tests && ../build/bin/test-arg-parser

echo -e "\nVerification complete. The presets are correctly defined in the code."
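
Beyond the help-text checks above, a functional smoke test would launch a preset and send it one request. A minimal sketch, assuming llama-server's existing /health and /v1/chat/completions endpoints (part of the server itself, not of this PR):

# Start the chat preset in the background.
./build/bin/llama-server --chat-llama3-8b-default &
SERVER_PID=$!

# /health returns a non-2xx status while the model is still loading,
# so poll it until the server is ready.
until curl -sf http://localhost:8080/health > /dev/null; do sleep 1; done

# Send a single chat completion request to the preset's port.
curl -s http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Say hello."}], "max_tokens": 16}'

kill $SERVER_PID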