From ce726f6973a87b033306bc922c2315705c365014 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Wed, 8 Apr 2026 07:29:55 -0700 Subject: [PATCH] Add llm_methods metadata to multimethod PTE export Summary: Add phase-aware method metadata to PTE files during multimethod export. When MethodConfig entries have a `phase` tag ("prefill" or "decode"), the export pipeline writes `llm_methods_prefill` and `llm_methods_decode` constant methods into the PTE. This enables the runtime to discover which methods to call for each inference phase without hardcoding names. Changes: - MethodConfig: add optional `phase` field - _build_yoco_multimethod_config: tag methods with phase="prefill"/"decode" - _export_llm_backbone_multimethod: collect phase-tagged method names - _lower_and_save_multimethod: accept extra_metadata parameter - constants.h: add kLlmMethodsPrefill/kLlmMethodsDecode keys Differential Revision: D99689421 --- extension/llm/export/config/llm_config.py | 4 ++++ extension/llm/runner/constants.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index f4bdfbf1a0d..734cf196fd1 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -310,11 +310,15 @@ class MethodConfig: torch.export tracing. Controls which graph path is captured, e.g. prefill vs decode, or for YOCO, where all layers run for decode but not prefill. When unset, uses the model's default input length. + phase: Optional inference phase tag ("prefill" or "decode"). When set, + the method name is recorded in llm_methods metadata so the runtime + can pick the correct method for each inference phase. """ method_name: str lora_config: Optional[LoraConfig] = None export_seq_len: Optional[int] = None + phase: Optional[str] = None @dataclass diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h index d7b36077757..b75ee6819b7 100644 --- a/extension/llm/runner/constants.h +++ b/extension/llm/runner/constants.h @@ -19,6 +19,10 @@ inline constexpr auto kVocabSize = "get_vocab_size"; inline constexpr auto kUseKVCache = "use_kv_cache"; inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; +// LLM multimethod phase metadata (e.g. YOCO prefill/decode) +inline constexpr auto kLlmMethodsPrefill = "llm_methods_prefill"; +inline constexpr auto kLlmMethodsDecode = "llm_methods_decode"; + // Multimodal method name conventions inline constexpr auto kVisionEncoderMethod = "vision_encoder"; inline constexpr auto kAudioEncoderMethod = "audio_encoder";