Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ Task specific parameters for different tasks (text generation/image generation/e
| `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gptoss, gemma4] |
| `--tool_parser` | `string` | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, phi4, hermes3, mistral, qwen3coder, gptoss, devstral, lfm2, gemma4] |
| `--enable_tool_guided_generation` | `bool` | Enables enforcing the tool schema during generation. Requires a response parser to be set. Default: false. |
| `--cache_interval_multiplier` | `integer` | Multiplier for the KV cache block interval. Controls the granularity of cache allocation. Applicable only for models with linear attention. Default: 64. |

### Image generation
| option | Value format | Description |
Expand Down
4 changes: 2 additions & 2 deletions src/graph_export/graph_cli_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ void GraphCLIParser::createOptions() {
cxxopts::value<std::string>()->default_value("false"),
"ENABLE_TOOL_GUIDED_GENERATION")
("cache_interval_multiplier",
"Multiplier for the KV cache block interval. Controls the granularity of cache allocation. Default: unset.",
cxxopts::value<uint64_t>(),
"Multiplier for the KV cache block interval. Controls the granularity of cache allocation. Applicable only for models with linear attention. Default: 64.",
cxxopts::value<uint64_t>()->default_value("64"),
"CACHE_INTERVAL_MULTIPLIER");
Comment on lines 84 to 87

options->add_options("plugin config")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,7 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
properties->schedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse();
properties->schedulerConfig.max_num_seqs = nodeOptions.max_num_seqs();
properties->schedulerConfig.enable_prefix_caching = nodeOptions.enable_prefix_caching();
if (nodeOptions.has_cache_interval_multiplier()) {
properties->schedulerConfig.cache_interval_multiplier = nodeOptions.cache_interval_multiplier();
}
properties->schedulerConfig.cache_interval_multiplier = nodeOptions.cache_interval_multiplier();

if (nodeOptions.has_cache_eviction_config()) {
properties->schedulerConfig.cache_eviction_config = prepareCacheEvictionConfig(nodeOptions);
Expand Down
3 changes: 2 additions & 1 deletion src/llm/llm_calculator.proto
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ message LLMCalculatorOptions {

optional SparseAttentionConfig sparse_attention_config = 24;

optional uint64 cache_interval_multiplier = 25;
// Applicable only for models with linear attention.
optional uint64 cache_interval_multiplier = 25 [default = 64];
Comment on lines +139 to +140

enum ChatTemplateMode {
// Use GenAI's apply_chat_template (minja-based).
Expand Down
2 changes: 2 additions & 0 deletions src/test/llm/llmnode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4249,6 +4249,8 @@ void TestLLMNodeOptionsCheckDefault(std::string& modelsPath) {
ASSERT_EQ(properties->schedulerConfig.dynamic_split_fuse, true);
ASSERT_EQ(properties->schedulerConfig.max_num_seqs, 256);
ASSERT_EQ(properties->schedulerConfig.enable_prefix_caching, false);
ASSERT_TRUE(properties->schedulerConfig.cache_interval_multiplier.has_value());
ASSERT_EQ(properties->schedulerConfig.cache_interval_multiplier.value(), 64);
Comment on lines +4252 to +4253
ASSERT_EQ(properties->device, "CPU");
// CPU default properties (inference_num_threads, enable_cpu_pinning) are automatically
// added to pluginConfig for CPU device; verify no user-specified entries are present.
Expand Down