diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index f4bdfbf1a0d..734cf196fd1 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -310,11 +310,15 @@ class MethodConfig: torch.export tracing. Controls which graph path is captured, e.g. prefill vs decode, or for YOCO, where all layers run for decode but not prefill. When unset, uses the model's default input length. + phase: Optional inference phase tag ("prefill" or "decode"). When set, + the method name is recorded in llm_methods metadata so the runtime + can pick the correct method for each inference phase. """ method_name: str lora_config: Optional[LoraConfig] = None export_seq_len: Optional[int] = None + phase: Optional[str] = None @dataclass diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h index d7b36077757..b75ee6819b7 100644 --- a/extension/llm/runner/constants.h +++ b/extension/llm/runner/constants.h @@ -19,6 +19,10 @@ inline constexpr auto kVocabSize = "get_vocab_size"; inline constexpr auto kUseKVCache = "use_kv_cache"; inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; +// LLM multimethod phase metadata (e.g. YOCO prefill/decode) +inline constexpr auto kLlmMethodsPrefill = "llm_methods_prefill"; +inline constexpr auto kLlmMethodsDecode = "llm_methods_decode"; + // Multimodal method name conventions inline constexpr auto kVisionEncoderMethod = "vision_encoder"; inline constexpr auto kAudioEncoderMethod = "audio_encoder";