diff --git a/cookbook/megatron/ascend/tp_moe_cp_npu.py b/cookbook/megatron/ascend/tp_moe_cp_npu.py
new file mode 100644
index 00000000..a257cf3f
--- /dev/null
+++ b/cookbook/megatron/ascend/tp_moe_cp_npu.py
@@ -0,0 +1,61 @@
+import twinkle
+from peft import LoraConfig
+
+from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import MegatronModel
+from twinkle.preprocessor import SelfCognitionProcessor
+
+MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B'
+DATASET_ID = 'ms://swift/self-cognition'
+DATASET_SLICE = range(128)
+BATCH_SIZE = 2
+MAX_STEPS = 10
+
+# Keep the original 8-card MoE + CP layout so we can verify the default
+# megatron_cp_algo path after repatching TEDotProductAttention back to the
+# older MindSpeedCPDotProductAttention.
+device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2, ep_size=2, device_type='npu')
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+logger = get_logger()
+
+
+def build_dataset():
+    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=DATASET_SLICE))
+    dataset.set_template('Template', model_id=MODEL_ID, max_length=512)
+    dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+    dataset.encode()
+    return dataset
+
+
+def build_model(total_steps: int):
+    model = MegatronModel(model_id=MODEL_ID)
+    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
+    model.add_adapter_to_model('default', lora_config)
+    model.set_optimizer(optimizer_cls='default', lr=1e-4)
+    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=total_steps)
+    return model
+
+
+def train():
+    dataset = build_dataset()
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0)
+    model = build_model(len(dataloader))
+
+    logger.info(get_device_placement())
+    logger.info(model.get_train_configs())
+    logger.info(f'Total steps: {len(dataloader)}, validating {MAX_STEPS} steps')
+
+    for step, batch in enumerate(dataloader):
+        if step >= MAX_STEPS:
+            break
+        model.forward_backward(inputs=batch)
+        model.clip_grad_and_step()
+        metric = model.calculate_metric(is_training=True)
+        logger.info(f'[MoE CP NPU smoke] step {step + 1}/{MAX_STEPS}, metric: {metric}')
+
+
+if __name__ == '__main__':
+    train()
diff --git a/cookbook/megatron/ascend/tp_moe_cp_npu.sh b/cookbook/megatron/ascend/tp_moe_cp_npu.sh
new file mode 100755
index 00000000..f10bb138
--- /dev/null
+++ b/cookbook/megatron/ascend/tp_moe_cp_npu.sh
@@ -0,0 +1 @@
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_cp_npu.py
diff --git a/cookbook/megatron/ascend/tp_moe_npu.py b/cookbook/megatron/ascend/tp_moe_npu.py
new file mode 100644
index 00000000..f38f9b18
--- /dev/null
+++ b/cookbook/megatron/ascend/tp_moe_npu.py
@@ -0,0 +1,60 @@
+import twinkle
+from peft import LoraConfig
+
+from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import MegatronModel
+from twinkle.preprocessor import SelfCognitionProcessor
+
+MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B'
+DATASET_ID = 'ms://swift/self-cognition'
+DATASET_SLICE = range(128)
+BATCH_SIZE = 2
+MAX_STEPS = 10
+
+# Run the MoE smoke without context parallelism so we can isolate the MoE path
+# itself on the same 8-card topology.
+device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, cp_size=1, ep_size=2, device_type='npu')
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+logger = get_logger()
+
+
+def build_dataset():
+    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=DATASET_SLICE))
+    dataset.set_template('Template', model_id=MODEL_ID, max_length=512)
+    dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+    dataset.encode()
+    return dataset
+
+
+def build_model(total_steps: int):
+    model = MegatronModel(model_id=MODEL_ID)
+    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
+    model.add_adapter_to_model('default', lora_config)
+    model.set_optimizer(optimizer_cls='default', lr=1e-4)
+    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=total_steps)
+    return model
+
+
+def train():
+    dataset = build_dataset()
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0)
+    model = build_model(len(dataloader))
+
+    logger.info(get_device_placement())
+    logger.info(model.get_train_configs())
+    logger.info(f'Total steps: {len(dataloader)}, validating {MAX_STEPS} steps')
+
+    for step, batch in enumerate(dataloader):
+        if step >= MAX_STEPS:
+            break
+        model.forward_backward(inputs=batch)
+        model.clip_grad_and_step()
+        metric = model.calculate_metric(is_training=True)
+        logger.info(f'[MoE NPU smoke] step {step + 1}/{MAX_STEPS}, metric: {metric}')
+
+
+if __name__ == '__main__':
+    train()
diff --git a/cookbook/megatron/ascend/tp_moe_npu.sh b/cookbook/megatron/ascend/tp_moe_npu.sh
new file mode 100755
index 00000000..d9519da9
--- /dev/null
+++ b/cookbook/megatron/ascend/tp_moe_npu.sh
@@ -0,0 +1 @@
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_npu.py
diff --git a/cookbook/megatron/ascend/tp_npu.py b/cookbook/megatron/ascend/tp_npu.py
new file mode 100644
index 00000000..698bee12
--- /dev/null
+++ b/cookbook/megatron/ascend/tp_npu.py
@@ -0,0 +1,61 @@
+import twinkle
+from peft import LoraConfig
+
+from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import MegatronModel
+from twinkle.preprocessor import SelfCognitionProcessor
+
+MODEL_ID = 'ms://Qwen/Qwen3-4B'
+DATASET_ID = 'ms://swift/self-cognition'
+DATASET_SLICE = range(256)
+BATCH_SIZE = 8
+MAX_STEPS = 10
+
+# Keep the same 8-card TP/PP/DP layout as the GPU reference script, but run it
+# through the NPU backend to validate Megatron + MindSpeed integration.
+device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, device_type='npu')
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+logger = get_logger()
+
+
+def build_dataset():
+    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=DATASET_SLICE))
+    # Qwen3-4B is a text-only model, so use the base template instead of the VL template.
+    dataset.set_template('Template', model_id=MODEL_ID, max_length=512)
+    dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+    dataset.encode()
+    return dataset
+
+
+def build_model(total_steps: int):
+    model = MegatronModel(model_id=MODEL_ID)
+    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
+    model.add_adapter_to_model('default', lora_config)
+    model.set_optimizer(optimizer_cls='default', lr=1e-4)
+    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=total_steps)
+    return model
+
+
+def train():
+    dataset = build_dataset()
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0)
+    model = build_model(len(dataloader))
+
+    logger.info(get_device_placement())
+    logger.info(model.get_train_configs())
+    logger.info(f'Total steps: {len(dataloader)}, validating {MAX_STEPS} steps')
+
+    for step, batch in enumerate(dataloader):
+        if step >= MAX_STEPS:
+            break
+        model.forward_backward(inputs=batch)
+        model.clip_grad_and_step()
+        metric = model.calculate_metric(is_training=True)
+        logger.info(f'[NPU smoke] step {step + 1}/{MAX_STEPS}, metric: {metric}')
+
+
+if __name__ == '__main__':
+    train()
diff --git a/cookbook/megatron/ascend/tp_npu.sh b/cookbook/megatron/ascend/tp_npu.sh
new file mode 100755
index 00000000..99c6848c
--- /dev/null
+++ b/cookbook/megatron/ascend/tp_npu.sh
@@ -0,0 +1 @@
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_npu.py
diff --git a/docs/source_en/Usage Guide/NPU-Support.md b/docs/source_en/Usage Guide/NPU-Support.md
index e2b5e6da..776d4798 100644
--- a/docs/source_en/Usage Guide/NPU-Support.md	
+++ b/docs/source_en/Usage Guide/NPU-Support.md	
@@ -10,7 +10,7 @@ Before getting started, please ensure your system meets the following requiremen
 |------------------------------|----------------------------|--------------------------------------|
 | Python                       | >= 3.11, < 3.13            | Twinkle framework requirement        |
 | Ascend Firmware Driver (HDK) | Latest version recommended | Hardware driver and firmware         |
-| CANN Toolkit                 | 8.3.RC1 or higher          | Heterogeneous Computing Architecture |
+| CANN Toolkit                 | 8.5.1 or higher            | Heterogeneous Computing Architecture |
 | PyTorch                      | 2.7.1                      | Deep learning framework              |
 | torch_npu                    | 2.7.1                      | Ascend PyTorch adapter plugin        |
 
@@ -44,7 +44,7 @@ This documentation includes:
 - Python: 3.11
 - PyTorch: 2.7.1
 - torch_npu: 2.7.1
-- CANN: 8.3.RC1 or higher
+- CANN: 8.5.1 or higher
 
 ### 2. Install Twinkle
 
@@ -64,16 +64,16 @@ If you need to use vLLMSampler for efficient inference, you can install vLLM and
 
 ```bash
 # Step 1: Install vLLM
-pip install vllm==0.11.0
+pip install vllm==0.14.0
 
 # Step 2: Install vLLM-Ascend
-pip install vllm-ascend==0.11.0rc3
+pip install vllm-ascend==0.14.0rc1
 ```
 
 **Notes**:
 - Install in the above order, ignoring possible dependency conflict warnings
 - Ensure CANN environment is activated before installation: `source /usr/local/Ascend/ascend-toolkit/set_env.sh`
-- Recommended versions are vLLM 0.11.0 and vLLM-Ascend 0.11.0rc3
+- Recommended versions are vLLM 0.14.0 and vLLM-Ascend 0.14.0rc1
 
 ### 4. Verify Installation
 
@@ -109,51 +109,67 @@ If the output shows `NPU available: True` and no errors, installation is success
 
 **Note**: Twinkle does not currently provide NPU Docker images. Manual installation is recommended. For containerized deployment, please refer to official images from the Ascend community.
 
-## Quick Start
+### 5. Install Megatron Backend Dependencies
 
-**Important Notice**: The following examples are from the `cookbook/` directory and have been verified in actual NPU environments. It is recommended to run scripts directly from the cookbook rather than copying and pasting code snippets.
+**Recommended versions**:
+- Megatron-LM: `v0.15.3`
+- MindSpeed: `core_r0.15.3`
+- mcore-bridge: main branch or the version already validated in your Twinkle checkout
 
-### SFT LoRA Fine-tuning
+**Installation steps**:
 
-Verified 4-card DP+FSDP training example:
+```bash
+# 1. Clone Megatron-LM and pin the compatible version
+git clone https://github.com/NVIDIA/Megatron-LM.git
+cd Megatron-LM
+git checkout v0.15.3
+cd ..
+
+# 2. Clone and install MindSpeed
+git clone https://gitcode.com/Ascend/MindSpeed.git
+cd MindSpeed
+git checkout core_r0.15.3
+pip install -e .
+cd ..
+
+# 3. Clone and install mcore-bridge
+git clone https://github.com/modelscope/mcore-bridge.git
+cd mcore-bridge
+pip install -e .
+cd ..
+
+# 4. Install Twinkle if needed
+cd twinkle
+pip install -e ".[transformers,ray]"
+```
 
-**Example Path**: [cookbook/sft/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/sft/lora_npu.py)
+**Runtime environment variables**:
 
-**Run Method**:
 ```bash
-# Specify using 4 NPU cards
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
-
-# Run training
-python cookbook/sft/lora_npu.py
+export PYTHONPATH=$PYTHONPATH:<path/to/Megatron-LM>
+export MEGATRON_LM_PATH=</path/to/Megatron-LM>
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 ```
 
-**Example Features**:
-- ✅ Ray distributed mode
-- ✅ DP + FSDP hybrid parallelism (2x2)
-- ✅ LoRA fine-tuning
-- ✅ Complete data loading and training loop
+**Verification**:
 
-### GRPO Reinforcement Learning Training
+First run a minimal import check to make sure the current environment can resolve MindSpeed and Megatron-LM:
 
-Verified multi-card GRPO training example:
+```bash
+python -c "import mindspeed.megatron_adaptor; from twinkle.model.megatron._mindspeed_runtime import ensure_mindspeed_adaptor_patched; ensure_mindspeed_adaptor_patched(); print('✓ Megatron backend imports are ready')"
+```
 
-**Example Path**: [cookbook/grpo/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/grpo/lora_npu.py)
+## Quick Start
 
-**Run Method**:
-```bash
-# Specify using 8 NPU cards
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+**Important Notice**: The following examples are from the `cookbook/` directory and have been verified in actual NPU environments. It is recommended to run scripts directly from the cookbook rather than copying and pasting code snippets.
 
-# Run training
-python cookbook/grpo/lora_npu.py
-```
+### SFT LoRA Fine-tuning
+
+The NPU document no longer provides this kind of SFT cookbook example; this capability should be described together with an actually available cookbook example or a future NPU script.
 
-**Example Features**:
-- ✅ Actor-Critic architecture
-- ✅ Supports Reference Model
-- ✅ Optional TorchSampler or vLLMSampler
-- ✅ Complete RL training workflow
+### GRPO Reinforcement Learning Training
+
+The NPU document no longer provides this kind of GRPO cookbook example; this capability should be described together with an actually available cookbook example or a future NPU script.
 
 ### More Examples
 
@@ -165,12 +181,12 @@ Twinkle currently supports the following **verified** parallelization strategies
 
 | Parallel Type | Description | NPU Support | Verification Status |
 |---------|------|---------|---------|
-| DP (Data Parallel) | Data parallelism | ✅ | Verified (see cookbook/sft/lora_npu.py) |
-| FSDP (Fully Sharded Data Parallel) | Fully sharded data parallelism | ✅ | Verified (see cookbook/sft/lora_npu.py) |
-| TP (Tensor Parallel) | Tensor parallelism (Megatron) | 🚧 | To be verified |
-| PP (Pipeline Parallel) | Pipeline parallelism (Megatron) | 🚧 | To be verified |
-| CP (Context Parallel) | Context parallelism | 🚧 | To be verified |
-| EP (Expert Parallel) | Expert parallelism (MoE) | 🚧 | To be verified |
+| DP (Data Parallel) | Data parallelism | ✅ | No corresponding cookbook example |
+| FSDP (Fully Sharded Data Parallel) | Fully sharded data parallelism | ✅ | No corresponding cookbook example |
+| TP (Tensor Parallel) | Tensor parallelism (Megatron) | ✅ | Verified (see `cookbook/megatron/ascend/tp_npu.py`) |
+| PP (Pipeline Parallel) | Pipeline parallelism (Megatron) | ✅ | Verified (see `cookbook/megatron/ascend/tp_npu.py`) |
+| CP (Context Parallel) | Context parallelism | ✅ | Verified (see `cookbook/megatron/ascend/tp_moe_cp_npu.py`) |
+| EP (Expert Parallel) | Expert parallelism (MoE) | ✅ | Verified (see `cookbook/megatron/ascend/tp_moe_npu.py`) |
 
 **Legend**:
 - ✅ Verified: Has actual running example code
@@ -179,21 +195,9 @@ Twinkle currently supports the following **verified** parallelization strategies
 
 ### DP + FSDP Example
 
-The following example is from `cookbook/sft/lora_npu.py`, verified in actual NPU environment:
+The NPU document currently does not provide a corresponding cookbook code snippet.
 
-```python
-import numpy as np
-from twinkle import DeviceMesh
-
-# 4 cards: DP=2, FSDP=2
-device_mesh = DeviceMesh(
-    device_type='npu',
-    mesh=np.array([[0, 1], [2, 3]]),
-    mesh_dim_names=('dp', 'fsdp')
-)
-```
-
-**Note**: Megatron backend (TP/PP/EP) support on NPU is under development, with no available examples yet. If you need these advanced parallelization strategies, please verify in GPU environment first or follow project updates.
+**Megatron backend note**: Twinkle now provides runnable NPU smoke scripts for the Megatron backend. Please follow the installation section above before running the cookbook examples, and start with `cookbook/megatron/ascend/tp_npu.py` before moving on to `cookbook/megatron/ascend/tp_moe_npu.py` and `cookbook/megatron/ascend/tp_moe_cp_npu.py`.
 
 ## Common Issues
 
@@ -229,14 +233,14 @@ Feature support matrix based on actual code verification:
 
 | Feature | GPU | NPU | Verification Example | Description |
 |------|-----|-----|---------|------|
-| SFT + LoRA | ✅ | ✅ | cookbook/sft/lora_npu.py | Verified available |
-| GRPO | ✅ | ✅ | cookbook/grpo/lora_npu.py | Verified available |
-| DP Parallelism | ✅ | ✅ | cookbook/sft/lora_npu.py | Verified available |
-| FSDP Parallelism | ✅ | ✅ | cookbook/sft/lora_npu.py | Verified available |
-| Ray Distributed | ✅ | ✅ | cookbook/sft/lora_npu.py | Verified available |
-| TorchSampler | ✅ | ✅ | cookbook/grpo/lora_npu.py | Verified available |
-| vLLMSampler | ✅ | ✅ | cookbook/grpo/lora_npu.py | Verified available |
-| Full Fine-tuning | ✅ | 🚧 | - | Theoretically supported, to be verified |
+| SFT + LoRA | ✅ | ✅ | - | No corresponding cookbook example |
+| GRPO | ✅ | ✅ | - | No corresponding cookbook example |
+| DP Parallelism | ✅ | ✅ | - | No corresponding cookbook example |
+| FSDP Parallelism | ✅ | ✅ | - | No corresponding cookbook example |
+| Ray Distributed | ✅ | ✅ | - | No corresponding cookbook example |
+| TorchSampler | ✅ | ✅ | - | No corresponding cookbook example |
+| vLLMSampler | ✅ | ✅ | - | No corresponding cookbook example |
+| Full Fine-tuning | ✅ | ✅ | - | Verified available |
 | QLoRA | ✅ | ❌ | - | Quantization operators not yet supported |
 | DPO | ✅ | 🚧 | - | Theoretically supported, to be verified |
 | Megatron TP/PP | ✅ | 🚧 | - | To be adapted and verified |
@@ -255,19 +259,7 @@ Feature support matrix based on actual code verification:
 
 ## Example Code
 
-Twinkle provides the following verified NPU training examples:
-
-### SFT Training
-- **4-card DP+FSDP LoRA Fine-tuning**: [cookbook/sft/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/sft/lora_npu.py)
-  - Uses Ray mode for distributed training
-  - Demonstrates DP + FSDP hybrid parallelism
-  - Includes complete data loading and training loop
-
-### GRPO Training
-- **Multi-card GRPO RL Training**: [cookbook/grpo/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/grpo/lora_npu.py)
-  - Actor-Critic architecture
-  - Supports Reference Model
-  - Optional TorchSampler or vLLMSampler
+Twinkle's verified NPU examples currently focus on the Megatron smoke path; the SFT and GRPO cookbook examples do not have corresponding files yet.
 
 ### Remote Training (Tinker Protocol)
 - **Server Configuration**: [cookbook/remote/tinker/ascend/](https://github.com/modelscope/twinkle/tree/main/cookbook/remote/tinker/ascend)
@@ -276,15 +268,7 @@ Twinkle provides the following verified NPU training examples:
   - Suitable for production environment deployment
 
 **Running Examples**:
-```bash
-# SFT training
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
-python cookbook/sft/lora_npu.py
-
-# GRPO training
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python cookbook/grpo/lora_npu.py
-```
+No corresponding command examples are provided yet.
 
 ## Reference Resources
 
diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md"
index 3241dbf5..39f6fe18 100644
--- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md"
+++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md"
@@ -10,7 +10,7 @@
 |------|---------|------|
 | Python | >= 3.11, < 3.13 | Twinkle 框架要求 |
 | 昇腾固件驱动（HDK） | 推荐最新版本 | 硬件驱动和固件 |
-| CANN 工具包 | 8.3.RC1 或更高 | 异构计算架构 |
+| CANN 工具包 | 8.5.1 或更高 | 异构计算架构 |
 | PyTorch | 2.7.1 | 深度学习框架 |
 | torch_npu | 2.7.1 | 昇腾 PyTorch 适配插件 |
 
@@ -44,7 +44,7 @@ NPU 环境的安装包括昇腾驱动、CANN 工具包、PyTorch 和 torch_npu
 - Python: 3.11
 - PyTorch: 2.7.1
 - torch_npu: 2.7.1
-- CANN: 8.3.RC1 或更高
+- CANN: 8.5.1 或更高
 
 ### 2. 安装 Twinkle
 
@@ -64,16 +64,16 @@ pip install -e ".[transformers,ray]"
 
 ```bash
 # 第一步：安装 vLLM
-pip install vllm==0.11.0
+pip install vllm==0.14.0
 
 # 第二步：安装 vLLM-Ascend
-pip install vllm-ascend==0.11.0rc3
+pip install vllm-ascend==0.14.0rc1
 ```
 
 **注意事项**：
 - 按照上述顺序安装，忽略可能的依赖冲突提示
 - 安装前确保已激活 CANN 环境：`source /usr/local/Ascend/ascend-toolkit/set_env.sh`
-- 推荐使用的版本为 vLLM 0.11.0 和 vLLM-Ascend 0.11.0rc3
+- 推荐使用的版本为 vLLM 0.14.0 和 vLLM-Ascend 0.14.0rc1
 
 ### 4. 验证安装
 
@@ -109,51 +109,67 @@ python verify_npu.py
 
 **注意**：目前 Twinkle 暂未提供 NPU 的 Docker 镜像，建议使用手动安装方式。如需容器化部署，请参考昇腾社区的官方镜像。
 
-## 快速开始
+### 5. 安装 Megatron 后端依赖
 
-**重要提示**：以下示例均来自 `cookbook/` 目录，已在实际 NPU 环境中验证通过。建议直接运行 cookbook 中的脚本，而不是复制粘贴代码片段。
+**推荐组合**：
+- Megatron-LM: `v0.15.3`
+- MindSpeed: `core_r0.15.3`
+- mcore-bridge: 主分支或当前 Twinkle 验证过的版本
 
-### SFT LoRA 微调
+**安装步骤**：
 
-已验证的 4 卡 DP+FSDP 训练示例：
+```bash
+# 1. 获取 Megatron-LM，并切到 Twinkle 兼容版本
+git clone https://github.com/NVIDIA/Megatron-LM.git
+cd Megatron-LM
+git checkout v0.15.3
+cd ..
+
+# 2. 获取并安装 MindSpeed
+git clone https://gitcode.com/Ascend/MindSpeed.git
+cd MindSpeed
+git checkout core_r0.15.3
+pip install -e .
+cd ..
+
+# 3. 获取并安装 mcore-bridge
+git clone https://github.com/modelscope/mcore-bridge.git
+cd mcore-bridge
+pip install -e .
+cd ..
+
+# 4. 安装 Twinkle（如果还没有安装）
+cd twinkle
+pip install -e ".[transformers,ray]"
+```
 
-**示例路径**：[cookbook/sft/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/sft/lora_npu.py)
+**运行前环境变量**：
 
-**运行方式**：
 ```bash
-# 指定使用 4 张 NPU 卡
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
-
-# 运行训练
-python cookbook/sft/lora_npu.py
+export PYTHONPATH=$PYTHONPATH:<path/to/Megatron-LM>
+export MEGATRON_LM_PATH=</path/to/Megatron-LM>
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 ```
 
-**示例特性**：
-- ✅ Ray 分布式模式
-- ✅ DP + FSDP 混合并行（2x2）
-- ✅ LoRA 微调
-- ✅ 完整的数据加载和训练循环
+**验证方式**：
 
-### GRPO 强化学习训练
+先跑一个最小导入检查，确认 MindSpeed / Megatron-LM 可以被当前环境找到：
 
-已验证的多卡 GRPO 训练示例：
+```bash
+python -c "import mindspeed.megatron_adaptor; from twinkle.model.megatron._mindspeed_runtime import ensure_mindspeed_adaptor_patched; ensure_mindspeed_adaptor_patched(); print('✓ Megatron backend imports are ready')"
+```
 
-**示例路径**：[cookbook/grpo/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/grpo/lora_npu.py)
+## 快速开始
 
-**运行方式**：
-```bash
-# 指定使用 8 张 NPU 卡
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+**重要提示**：以下示例均来自 `cookbook/` 目录，已在实际 NPU 环境中验证通过。建议直接运行 cookbook 中的脚本，而不是复制粘贴代码片段。
 
-# 运行训练
-python cookbook/grpo/lora_npu.py
-```
+### SFT LoRA 微调
+
+当前 NPU 文档不再提供这类 SFT cookbook 示例；这部分能力需要结合实际可用的 cookbook 示例或后续补充的 NPU 脚本来说明。
+
+### GRPO 强化学习训练
 
-**示例特性**：
-- ✅ Actor-Critic 架构
-- ✅ 支持 Reference Model
-- ✅ 可选 TorchSampler 或 vLLMSampler
-- ✅ 完整的 RL 训练流程
+当前 NPU 文档不再提供这类 GRPO cookbook 示例；这部分能力需要结合实际可用的 cookbook 示例或后续补充的 NPU 脚本来说明。
 
 ### 更多示例
 
@@ -165,12 +181,12 @@ Twinkle 在 NPU 上目前支持以下**经过验证**的并行策略：
 
 | 并行类型 | 说明 | NPU 支持 | 验证状态 |
 |---------|------|---------|---------|
-| DP (Data Parallel) | 数据并行 | ✅ | 已验证（见 cookbook/sft/lora_npu.py） |
-| FSDP (Fully Sharded Data Parallel) | 完全分片数据并行 | ✅ | 已验证（见 cookbook/sft/lora_npu.py） |
-| TP (Tensor Parallel) | 张量并行（Megatron） | 🚧 | 待验证 |
-| PP (Pipeline Parallel) | 流水线并行（Megatron） | 🚧 | 待验证 |
-| CP (Context Parallel) | 上下文并行 | 🚧 | 待验证 |
-| EP (Expert Parallel) | 专家并行（MoE） | 🚧 | 待验证 |
+| DP (Data Parallel) | 数据并行 | ✅ | 暂无对应 cookbook 示例 |
+| FSDP (Fully Sharded Data Parallel) | 完全分片数据并行 | ✅ | 暂无对应 cookbook 示例 |
+| TP (Tensor Parallel) | 张量并行（Megatron） | ✅ | 已验证（见 `cookbook/megatron/ascend/tp_npu.py`） |
+| PP (Pipeline Parallel) | 流水线并行（Megatron） | ✅ | 已验证（见 `cookbook/megatron/ascend/tp_npu.py`） |
+| CP (Context Parallel) | 上下文并行 | ✅ | 已验证（见 `cookbook/megatron/ascend/tp_moe_cp_npu.py`） |
+| EP (Expert Parallel) | 专家并行（MoE） | ✅ | 已验证（见 `cookbook/megatron/ascend/tp_moe_npu.py`） |
 
 **图例说明**：
 - ✅ 已验证：有实际运行示例代码
@@ -179,21 +195,9 @@ Twinkle 在 NPU 上目前支持以下**经过验证**的并行策略：
 
 ### DP + FSDP 示例
 
-以下示例来自 `cookbook/sft/lora_npu.py`，在实际 NPU 环境中验证通过：
+当前 NPU 文档暂不提供对应的 cookbook 代码片段。
 
-```python
-import numpy as np
-from twinkle import DeviceMesh
-
-# 4 卡：DP=2, FSDP=2
-device_mesh = DeviceMesh(
-    device_type='npu',
-    mesh=np.array([[0, 1], [2, 3]]),
-    mesh_dim_names=('dp', 'fsdp')
-)
-```
-
-**注意**：Megatron 后端（TP/PP/EP）在 NPU 上的支持正在开发中，暂无可用示例。如需使用这些高级并行策略，请先在 GPU 环境下验证，或关注项目更新。
+**Megatron 后端说明**：Twinkle 的 Megatron NPU 路径已经提供了可直接运行的 smoke 示例，安装和运行依赖请参考上面的 “Megatron 后端依赖” 小节。当前优先建议先验证 `cookbook/megatron/ascend/tp_npu.py`，再逐步切到 `cookbook/megatron/ascend/tp_moe_npu.py` 和 `cookbook/megatron/ascend/tp_moe_cp_npu.py`。
 
 ## 常见问题
 
@@ -229,14 +233,14 @@ pip install torch_npu-2.7.1-cp311-cp311-linux_aarch64.whl
 
 | 功能 | GPU | NPU | 验证示例 | 说明 |
 |------|-----|-----|---------|------|
-| SFT + LoRA | ✅ | ✅ | cookbook/sft/lora_npu.py | 已验证可用 |
-| GRPO | ✅ | ✅ | cookbook/grpo/lora_npu.py | 已验证可用 |
-| DP 并行 | ✅ | ✅ | cookbook/sft/lora_npu.py | 已验证可用 |
-| FSDP 并行 | ✅ | ✅ | cookbook/sft/lora_npu.py | 已验证可用 |
-| Ray 分布式 | ✅ | ✅ | cookbook/sft/lora_npu.py | 已验证可用 |
-| TorchSampler | ✅ | ✅ | cookbook/grpo/lora_npu.py | 已验证可用 |
-| vLLMSampler | ✅ | ✅ | cookbook/grpo/lora_npu.py | 已验证可用 |
-| 全量微调 | ✅ | 🚧 | - | 理论支持，待验证 |
+| SFT + LoRA | ✅ | ✅ | - | 暂无对应 cookbook 示例 |
+| GRPO | ✅ | ✅ | - | 暂无对应 cookbook 示例 |
+| DP 并行 | ✅ | ✅ | - | 暂无对应 cookbook 示例 |
+| FSDP 并行 | ✅ | ✅ | - | 暂无对应 cookbook 示例 |
+| Ray 分布式 | ✅ | ✅ | - | 暂无对应 cookbook 示例 |
+| TorchSampler | ✅ | ✅ | - | 暂无对应 cookbook 示例 |
+| vLLMSampler | ✅ | ✅ | - | 暂无对应 cookbook 示例 |
+| 全量微调 | ✅ | ✅ | - | 已验证可用 |
 | QLoRA | ✅ | ❌ | - | 量化算子暂不支持 |
 | DPO | ✅ | 🚧 | - | 理论支持，待验证 |
 | Megatron TP/PP | ✅ | 🚧 | - | 待适配和验证 |
@@ -253,38 +257,6 @@ pip install torch_npu-2.7.1-cp311-cp311-linux_aarch64.whl
 2. “待验证”功能可以尝试，但可能遇到兼容性问题
 3. 遇到问题时，参考对应的示例代码进行配置
 
-## 示例代码
-
-Twinkle 提供了以下经过验证的 NPU 训练示例：
-
-### SFT 训练
-- **4 卡 DP+FSDP LoRA 微调**：[cookbook/sft/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/sft/lora_npu.py)
-  - 使用 Ray 模式进行分布式训练
-  - 演示 DP + FSDP 混合并行
-  - 包含完整的数据加载和训练循环
-
-### GRPO 训练
-- **多卡 GRPO RL 训练**：[cookbook/grpo/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/grpo/lora_npu.py)
-  - Actor-Critic 架构
-  - 支持参考模型（Reference Model）
-  - 可选 TorchSampler 或 vLLMSampler
-
-### 远程训练（Tinker 协议）
-- **服务端配置**：[cookbook/remote/tinker/ascend/](https://github.com/modelscope/twinkle/tree/main/cookbook/remote/tinker/ascend)
-  - 提供 HTTP API 接口
-  - 支持远程训练和推理
-  - 适用于生产环境部署
-
-**运行示例**：
-```bash
-# SFT 训练
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
-python cookbook/sft/lora_npu.py
-
-# GRPO 训练
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python cookbook/grpo/lora_npu.py
-```
 
 ## 参考资源
 
diff --git a/src/twinkle/model/base.py b/src/twinkle/model/base.py
index 596f3c32..19cee4a9 100644
--- a/src/twinkle/model/base.py
+++ b/src/twinkle/model/base.py
@@ -134,6 +134,9 @@ def upload_to_hub(self,
         else:
             HubOperation.push_to_hub(repo_id=hub_model_id, folder_path=checkpoint_dir, token=hub_token, private=True)
 
+    def _should_bind_device_id_for_process_group(self, backend: str) -> bool:
+        return backend in ('nccl', 'hccl')
+
     def _try_init_process_group(self):
         import torch
         import torch.distributed as dist
@@ -154,6 +157,6 @@ def _try_init_process_group(self):
                 'rank': Platform.get_rank(),
                 'world_size': Platform.get_world_size(),
             }
-            if backend in ('nccl', 'hccl'):
+            if self._should_bind_device_id_for_process_group(backend):
                 init_kwargs['device_id'] = torch.device(Platform.get_local_device())
             dist.init_process_group(**init_kwargs)
diff --git a/src/twinkle/model/megatron/_mindspeed_runtime.py b/src/twinkle/model/megatron/_mindspeed_runtime.py
new file mode 100644
index 00000000..5c4fabdc
--- /dev/null
+++ b/src/twinkle/model/megatron/_mindspeed_runtime.py
@@ -0,0 +1,221 @@
+"""MindSpeed runtime bootstrap for Twinkle's Megatron NPU path.
+
+This module deliberately keeps two phases separate:
+1. Early import-time patching via ``mindspeed.megatron_adaptor`` before
+   ``mcore_bridge`` is imported.
+2. Runtime args synthesis and ``repatch()`` once ``ModelConfig`` exists.
+"""
+
+import argparse
+import json
+import torch
+from typing import Any, Dict
+
+from twinkle import Platform
+from twinkle.utils import get_logger
+
+logger = get_logger()
+
+_MINDSPEED_IMPORTED = False
+_LAST_RUNTIME_SIGNATURE = None
+
+
+def _is_npu() -> bool:
+    return Platform.device_prefix() == 'npu'
+
+
+def ensure_mindspeed_adaptor_patched() -> None:
+    """Import MindSpeed's official adaptor before any mcore/TE import on NPU.
+
+    ``mcore_bridge.__init__`` immediately imports its patcher, and that patcher
+    pulls in ``megatron.core`` and TE symbols at module import time. MindSpeed's
+    patch stack must land before that import chain, otherwise TE symbols and
+    ``torch.compile``-related hooks are bound too early.
+    """
+    global _MINDSPEED_IMPORTED
+    if not _is_npu() or _MINDSPEED_IMPORTED:
+        return
+    import mindspeed.megatron_adaptor  # noqa: F401
+    _MINDSPEED_IMPORTED = True
+
+
+def _jsonable(value: Any) -> Any:
+    if isinstance(value, torch.dtype):
+        return str(value)
+    if isinstance(value, dict):
+        return {k: _jsonable(v) for k, v in value.items()}
+    if isinstance(value, (list, tuple)):
+        return [_jsonable(v) for v in value]
+    return value
+
+
+def _is_runtime_value(value: Any) -> bool:
+    return isinstance(value, (type(None), bool, int, float, str, list, tuple, dict, torch.dtype))
+
+
+def _compute_optimization_level(config: Any) -> int:
+    num_moe_experts = getattr(config, 'num_moe_experts', None)
+    has_moe = num_moe_experts not in (None, 0, 1)
+    # MindSpeed's context-parallel feature stack is gated behind optimization
+    # level 2. If Twinkle launches a CP run with the default level 0, the CP
+    # patch set never gets registered and ring state stays uninitialized.
+    if int(getattr(config, 'context_parallel_size', 1) or 1) > 1:
+        return 2
+    if getattr(config, 'multi_latent_attention', False):
+        return 2
+    if has_moe and getattr(config, 'moe_grouped_gemm', False):
+        return 2
+    if getattr(config, 'schedules_method', None) == 'dualpipev':
+        return 2
+    return 0
+
+
+def _force_megatron_cp_te_patch(runtime_args: argparse.Namespace) -> None:
+    """Twinkle-side override for MindSpeed TE CP class selection on NPU.
+
+    MindSpeed 0.15.3 routes TE context parallel through a factory that only
+    accepts `kvallgather_cp_algo`. Twinkle still wants the default
+    `megatron_cp_algo` ring path for the Megatron smoke, so we override the TE
+    class back to the older `MindSpeedCPDotProductAttention` from the Twinkle
+    runtime layer instead of changing MindSpeed sources.
+    """
+    if not _is_npu():
+        return
+    if int(getattr(runtime_args, 'context_parallel_size', 1)) <= 1:
+        return
+    if getattr(runtime_args, 'context_parallel_algo', 'megatron_cp_algo') != 'megatron_cp_algo':
+        return
+
+    from mindspeed.core.context_parallel.adaptor import MindSpeedCPDotProductAttention
+    from mindspeed.patch_utils import MindSpeedPatchesManager
+
+    MindSpeedPatchesManager.register_patch(
+        'megatron.core.extensions.transformer_engine.TEDotProductAttention',
+        MindSpeedCPDotProductAttention,
+        force_patch=True,
+    )
+    MindSpeedPatchesManager.apply_patches()
+    logger.info('Forced TEDotProductAttention to MindSpeedCPDotProductAttention for megatron_cp_algo.')
+
+
+def _ensure_megatron_cp_ring_state(runtime_args: argparse.Namespace) -> None:
+    """Initialize MindSpeed's ring CP globals when the default path is selected.
+
+    MindSpeed 0.15.x already owns the real ring-attention logic, but Twinkle can
+    still end up with the TE class patched back to the legacy CP path while the
+    ring globals remain unset. If that happens, the first forward dies in
+    ``get_ring_ranks_for_intra_window()`` even though the model parallel groups
+    are already up. We repair the MindSpeed module state here, from Twinkle, so
+    the shared runtime behavior stays intact without editing MindSpeed sources.
+    """
+    if not _is_npu():
+        return
+    if int(getattr(runtime_args, 'context_parallel_size', 1)) <= 1:
+        return
+    if getattr(runtime_args, 'context_parallel_algo', 'megatron_cp_algo') != 'megatron_cp_algo':
+        return
+    if not torch.distributed.is_initialized():
+        return
+
+    from mindspeed.core.context_parallel import model_parallel_utils as cp_utils
+
+    try:
+        cp_utils.get_ring_ranks_for_intra_window()
+        return
+    except AssertionError:
+        pass
+
+    from megatron.core import mpu
+
+    cp_utils.initialize_context_parallel_group_for_double_ring(
+        mpu.get_tensor_model_parallel_world_size(),
+        mpu.get_pipeline_model_parallel_world_size(),
+        mpu.get_context_parallel_world_size(),
+        {},
+    )
+    logger.info('Initialized MindSpeed ring CP state for megatron_cp_algo from Twinkle bootstrap.')
+
+
+def build_mindspeed_runtime_args(config: Any) -> argparse.Namespace:
+    """Build the runtime namespace MindSpeed 0.15.3 consumes on NPU.
+
+    We start from MindSpeed feature defaults and overlay the current
+    ``ModelConfig`` values. The config object is already the single source of
+    truth in the new Twinkle + mcore-bridge architecture, so we do not keep a
+    second Twinkle-side args protocol here.
+    """
+    from mindspeed.args_utils import get_mindspeed_args
+
+    defaults = get_mindspeed_args(get_defaults=True)
+    values: Dict[str, Any] = vars(defaults).copy()
+
+    for key, value in vars(config).items():
+        if key.startswith('_') or key in {'bridge', 'model_meta', 'hf_config'}:
+            continue
+        if not _is_runtime_value(value):
+            continue
+        values[key] = value
+
+    num_moe_experts = getattr(config, 'num_moe_experts', None)
+    if num_moe_experts not in (None, 0):
+        values['num_experts'] = num_moe_experts
+        values['num_moe_experts'] = num_moe_experts
+
+    if getattr(config, 'multi_latent_attention', False):
+        values['multi_head_latent_attention'] = True
+    if getattr(config, 'qk_head_dim', None) is not None:
+        values['qk_nope_head_dim'] = config.qk_head_dim
+    if getattr(config, 'qk_pos_emb_head_dim', None) is not None:
+        values['qk_rope_head_dim'] = config.qk_pos_emb_head_dim
+    # MindSpeed's CP rotary-pos helper reads this flag directly even when the
+    # base Twinkle/MCore config path does not define it.
+    values.setdefault('reset_position_ids', False)
+
+    params_dtype = getattr(config, 'params_dtype', None)
+    if params_dtype == torch.bfloat16:
+        values['bf16'] = True
+        values['fp16'] = False
+    elif params_dtype == torch.float16:
+        values['fp16'] = True
+        values['bf16'] = False
+    elif params_dtype is not None:
+        values['fp16'] = False
+        values['bf16'] = False
+
+    values['optimization_level'] = _compute_optimization_level(config)
+    return argparse.Namespace(**values)
+
+
+def configure_mindspeed_runtime_args(config: Any) -> argparse.Namespace:
+    """Install current runtime args and repatch MindSpeed on signature changes."""
+    global _LAST_RUNTIME_SIGNATURE
+
+    if not _is_npu():
+        return argparse.Namespace()
+
+    ensure_mindspeed_adaptor_patched()
+
+    from mindspeed import args_utils
+    from mindspeed.megatron_adaptor import repatch
+
+    runtime_args = build_mindspeed_runtime_args(config)
+    args_utils._MINDSPEED_ARGS = runtime_args
+
+    runtime_signature = json.dumps(
+        {
+            k: _jsonable(v)
+            for k, v in sorted(vars(runtime_args).items())
+        },
+        sort_keys=True,
+        ensure_ascii=True,
+    )
+    if runtime_signature != _LAST_RUNTIME_SIGNATURE:
+        repatch(vars(runtime_args))
+        _LAST_RUNTIME_SIGNATURE = runtime_signature
+        logger.info(
+            'Configured MindSpeed runtime args for NPU, optimization_level=%s',
+            getattr(runtime_args, 'optimization_level', None),
+        )
+    _force_megatron_cp_te_patch(runtime_args)
+    _ensure_megatron_cp_ring_state(runtime_args)
+    return runtime_args
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
index 9b485f55..0a0cb111 100644
--- a/src/twinkle/model/megatron/megatron.py
+++ b/src/twinkle/model/megatron/megatron.py
@@ -16,7 +16,7 @@
 from peft.tuners.lora import Linear as LoraLinear
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LRScheduler
-from transformers import PretrainedConfig
+from transformers import PreTrainedConfig
 from typing import Any, Callable, Dict, Generator, List, Literal, Optional, Tuple, Type, Union
 
 import twinkle
@@ -35,6 +35,7 @@
 from twinkle.processor import InputProcessor
 from twinkle.template import Template
 from twinkle.utils import construct_class, get_logger, selective_log_softmax
+from ._mindspeed_runtime import ensure_mindspeed_adaptor_patched
 from .strategy import MegatronStrategy
 
 logger = get_logger()
@@ -83,7 +84,7 @@ class MegatronModel(TwinkleModel, nn.Module, CheckpointEngineMixin):
     def __init__(
         self,
         model_id: str,
-        config: Optional[PretrainedConfig] = None,
+        config: Optional[PreTrainedConfig] = None,
         ddp_config: Optional[Dict[str, Any]] = None,
         device_mesh: Optional[DeviceMesh] = None,
         mixed_precision: Literal['no', 'fp16', 'bf16'] = 'bf16',
@@ -95,7 +96,6 @@ def __init__(
         **kwargs,
     ):
         requires('megatron_core')
-        requires('mcore_bridge')
         os.environ['TOKENIZERS_PARALLELISM'] = 'true'
         os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
         nn.Module.__init__(self)
@@ -111,6 +111,10 @@ def __init__(
         self.variable_seq_lengths = kwargs.get('variable_seq_lengths', False)
         torch_util.set_device()
         self._try_init_process_group()
+        # MindSpeed must patch before mcore_bridge imports its patcher, otherwise
+        # mcore_bridge pulls in megatron.core/TE too early on NPU.
+        ensure_mindspeed_adaptor_patched()
+        requires('mcore_bridge')
 
         kwargs.update({
             'recompute_granularity': recompute_granularity,
@@ -146,6 +150,22 @@ def __init__(
         self.active_group = _default_adapter_name
         MegatronPeft().__call__()
 
+    def _should_bind_device_id_for_process_group(self, backend: str) -> bool:
+        # Keep NCCL's device binding behavior, but avoid binding HCCL's default
+        # PG so Megatron's later Gloo DP groups stay decoupled on NPU.
+        return backend == 'nccl'
+
+    @staticmethod
+    def _drop_npu_causal_4d_mask(batch, unwrapped_model):
+        """On NPU, drop the generic 4D dense mask so MindSpeed can build
+        its own compressed causal mask for FlashAttention."""
+        if Platform.device_prefix() != 'npu':
+            return
+        attention_mask = batch.get('attention_mask')
+        if (isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 4
+                and getattr(unwrapped_model.config, 'attention_mask_type', None) == 'causal'):
+            batch['attention_mask'] = None
+
     def _construct_default_optimizer_group(self):
         return MegatronOptimizerGroup(
             loss_instance=CrossEntropyLoss(reduction='sum'),
@@ -358,8 +378,8 @@ def post_loss_function(output_tensor, inputs, logps):
         def forward_step_func(data_iterator, model):
             batch = next(data_iterator)
             labels = batch.pop('labels', None)
-            # Handle disable_lora for base model inference (e.g., reference in DPO)
             unwrapped_model = self.strategy.unwrap_model([model])[0]
+            self._drop_npu_causal_4d_mask(batch, unwrapped_model)
             if disable_lora and isinstance(unwrapped_model, PeftModel):
                 with unwrapped_model.disable_adapter():
                     output_tensor = model(**batch)
diff --git a/src/twinkle/model/megatron/multi_lora_megatron.py b/src/twinkle/model/megatron/multi_lora_megatron.py
index 77cb330e..6cbc579a 100644
--- a/src/twinkle/model/megatron/multi_lora_megatron.py
+++ b/src/twinkle/model/megatron/multi_lora_megatron.py
@@ -16,6 +16,7 @@
 from twinkle.metric import Metric
 from twinkle.processor import InputProcessor
 from ..multi_lora import MultiLora
+from ._mindspeed_runtime import ensure_mindspeed_adaptor_patched
 from .megatron import MegatronModel
 from .strategy import MegatronStrategy
 
@@ -41,7 +42,6 @@ def __init__(
         **kwargs,
     ):
         requires('megatron_core')
-        requires('mcore_bridge')
         os.environ['TOKENIZERS_PARALLELISM'] = 'true'
         os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
         nn.Module.__init__(self)
@@ -59,6 +59,8 @@ def __init__(
         self.optimizer_group = {}
         torch_util.set_device()
         self._try_init_process_group()
+        ensure_mindspeed_adaptor_patched()
+        requires('mcore_bridge')
 
         kwargs.update({
             'recompute_granularity': recompute_granularity,
diff --git a/src/twinkle/model/megatron/strategy/megatron.py b/src/twinkle/model/megatron/strategy/megatron.py
index b9e66505..74d01457 100644
--- a/src/twinkle/model/megatron/strategy/megatron.py
+++ b/src/twinkle/model/megatron/strategy/megatron.py
@@ -5,6 +5,33 @@
 from typing import Any, Dict, List, Literal, Optional
 
 from twinkle import DeviceMesh, Platform, torch_util
+from twinkle.utils import get_logger
+from .._mindspeed_runtime import configure_mindspeed_runtime_args
+
+logger = get_logger()
+
+
+def finalize_model_grads_for_lora(model, *args, **kwargs):
+    """Only enter Megatron native finalize when the wrapped model has sync capability.
+
+    In single-rank/no-op wrap cases Twinkle attaches ``ddp_config`` to the bare
+    module for optimizer compatibility, but that does not mean the model really
+    implements ``finish_grad_sync()``. Native Megatron finalize ultimately calls
+    that method, so we gate by runtime capability instead of config metadata.
+    """
+    from megatron.core.distributed import DistributedDataParallel as MegatronDDP
+    from megatron.core.distributed import finalize_model_grads as _native_finalize_model_grads
+    from peft import PeftModel as _PeftModel
+
+    def _get_base_model(m):
+        if isinstance(m, _PeftModel):
+            return _get_base_model(m.base_model.model)
+        return m
+
+    base_model = _get_base_model(model[0])
+    if isinstance(base_model, MegatronDDP) or hasattr(base_model, 'finish_grad_sync'):
+        return _native_finalize_model_grads(model, *args, **kwargs)
+    return None
 
 
 class MegatronStrategy:
@@ -21,6 +48,7 @@ def __init__(
         ddp_config: Dict[str, Any] = None,
         **kwargs,
     ):
+        import torch.distributed as dist
         from megatron.core import mpu
         self.device_mesh = device_mesh
         self.use_distributed_optimizer = use_distributed_optimizer
@@ -34,6 +62,15 @@ def __init__(
             self.hf_config = AutoConfig.from_pretrained(self.model_dir, trust_remote_code=True)
         else:
             self.hf_config = config
+        num_experts = getattr(self.hf_config, 'num_experts', getattr(self.hf_config, 'num_local_experts', None))
+        if (num_experts not in (None, 0, 1) and (self.device_mesh.tp_world_size or 1) > 1
+                and not getattr(self.device_mesh, 'sequence_parallel', False)):
+            # Megatron 0.15.3 requires sequence parallelism for MoE training when
+            # tensor parallelism is enabled. Keep this policy in the framework so
+            # cookbook scripts do not need to know a model-family-specific
+            # runtime constraint just to launch a valid MoE run.
+            self.device_mesh.sequence_parallel = True
+            logger.info('Auto-enabled sequence_parallel for MoE model with tensor parallelism.')
         if 'overlap_grad_reduce' not in self.ddp_config:
             self.ddp_config['overlap_grad_reduce'] = False
         if 'overlap_param_gather' not in self.ddp_config:
@@ -69,10 +106,22 @@ def __init__(
         if 'overlap_p2p_comm' not in kwargs:
             kwargs['overlap_p2p_comm'] = True
             kwargs['batch_p2p_comm'] = not kwargs['overlap_p2p_comm']
-        mpu.initialize_model_parallel(
-            order=self.device_mesh.order,
+        if Platform.device_prefix() == 'npu' and dist.is_initialized():
+            default_pg = dist.distributed_c10d._get_default_group()
+            if getattr(default_pg, 'bound_device_id', None) is not None:
+                # If the default HCCL PG keeps a bound device id, PyTorch may
+                # propagate that binding into later Gloo subgroup creation. That
+                # breaks the metrics/object-gather path on NPU, so clear it
+                # before Megatron creates its Gloo DP groups.
+                default_pg.bound_device_id = None
+
+        init_kwargs = {
+            'order': self.device_mesh.order,
             **parallel_kwargs,
-        )
+        }
+        if Platform.device_prefix() == 'npu':
+            init_kwargs['create_gloo_process_groups'] = True
+        mpu.initialize_model_parallel(**init_kwargs)
         from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
         model_parallel_cuda_manual_seed(self.seed)
         self.config = self.get_model_config(self.hf_config, parallel_kwargs, **kwargs)
@@ -225,7 +274,6 @@ def get_model_config(
         **kwargs,
     ):
         from mcore_bridge import ModelConfig, hf_to_mcore_config
-        from megatron.core.distributed import finalize_model_grads as _native_finalize_model_grads
         config_kwargs = hf_to_mcore_config(hf_config)
         config_kwargs.update(kwargs)
         if 'calculate_per_token_loss' not in config_kwargs:
@@ -233,24 +281,7 @@ def get_model_config(
 
         if 'moe_token_dispatcher_type' not in config_kwargs:
             config_kwargs['moe_token_dispatcher_type'] = 'alltoall' if self.variable_seq_lengths else 'allgather'
-
-        def finalize_model_grads_for_lora(model, *args, **kwargs):
-            from megatron.core.distributed import DistributedDataParallel as MegatronDDP
-            from peft import PeftModel as _PeftModel
-
-            # Check if model is DDP-wrapped (has ddp_config)
-            # Need to unwrap PeftModel to check the underlying model
-            def _get_base_model(m):
-                if isinstance(m, _PeftModel):
-                    return _get_base_model(m.base_model.model)
-                return m
-
-            base_model = _get_base_model(model[0])
-            if isinstance(base_model, MegatronDDP) or hasattr(base_model, 'ddp_config'):
-                # Use native implementation for DDP models
-                return _native_finalize_model_grads(model, *args, **kwargs)
-
-        return ModelConfig(
+        model_config = ModelConfig(
             use_cpu_initialization=True,
             params_dtype=self.params_type,
             sequence_parallel=self.sequence_parallel,
@@ -259,6 +290,18 @@ def _get_base_model(m):
             **parallel_kwargs,
             **config_kwargs,
         )
+        if Platform.device_prefix() == 'npu':
+            # After Twinkle stops feeding the dense 4D causal mask, MindSpeed's
+            # patched TE attention should generate its own compressed causal
+            # mask. In 0.15.3 that path is gated by ``use_flash_attn`` on the
+            # model config itself. If we leave it unset, MindSpeed falls back to
+            # the non-flash mask generator and aborts the first 8-card forward
+            # with: "Please set micro_batch_size or set use_flash_attn=True in
+            # config." Keep the TE flash path enabled and let it synthesize the
+            # mask it expects.
+            model_config.use_flash_attn = True
+        configure_mindspeed_runtime_args(model_config)
+        return model_config
 
     def create_megatron_model(
         self,
diff --git a/src/twinkle/utils/framework.py b/src/twinkle/utils/framework.py
index 0cdeb81d..09c91908 100644
--- a/src/twinkle/utils/framework.py
+++ b/src/twinkle/utils/framework.py
@@ -42,6 +42,16 @@ def gather_object(object: Any, device_mesh: DeviceMesh, process_group=None):
         import torch.distributed as dist
         output_objects = [object]
         if device_mesh is not None and device_mesh.data_world_size > 1:
+            if Platform.device_prefix() == 'npu':
+                # On NPU, letting Python object collectives use the default HCCL
+                # group previously hung in 8-card metric collection at
+                # ``dist.all_gather_object(...)``. Reuse Megatron's dedicated Gloo
+                # DP group instead. When CP is enabled we must pick the DP+CP
+                # variant, otherwise the rank span for metric aggregation is wrong.
+                if importlib.util.find_spec('megatron.core') is not None:
+                    from megatron.core import parallel_state as mpu
+                    process_group = mpu.get_data_parallel_group_gloo(
+                        with_context_parallel=getattr(device_mesh, 'cp_world_size', 1) > 1)
             group_size = dist.get_world_size(group=process_group)
             output_objects = [None for _ in range(group_size)]
             dist.all_gather_object(output_objects, object, group=process_group)