From 504acae052337c5f2ccc65980e91a65e9a7c8698 Mon Sep 17 00:00:00 2001 From: addsubmuldiv Date: Sat, 11 Apr 2026 21:55:54 +0800 Subject: [PATCH 01/10] adapt npu megatron 0.15 --- .../model/megatron/_mindspeed_runtime.py | 221 ++++++++++++++++++ src/twinkle/model/megatron/megatron.py | 57 ++++- .../model/megatron/multi_lora_megatron.py | 5 +- .../model/megatron/strategy/megatron.py | 87 +++++-- src/twinkle/utils/framework.py | 9 + 5 files changed, 353 insertions(+), 26 deletions(-) create mode 100644 src/twinkle/model/megatron/_mindspeed_runtime.py diff --git a/src/twinkle/model/megatron/_mindspeed_runtime.py b/src/twinkle/model/megatron/_mindspeed_runtime.py new file mode 100644 index 00000000..5c4fabdc --- /dev/null +++ b/src/twinkle/model/megatron/_mindspeed_runtime.py @@ -0,0 +1,221 @@ +"""MindSpeed runtime bootstrap for Twinkle's Megatron NPU path. + +This module deliberately keeps two phases separate: +1. Early import-time patching via ``mindspeed.megatron_adaptor`` before + ``mcore_bridge`` is imported. +2. Runtime args synthesis and ``repatch()`` once ``ModelConfig`` exists. +""" + +import argparse +import json +import torch +from typing import Any, Dict + +from twinkle import Platform +from twinkle.utils import get_logger + +logger = get_logger() + +_MINDSPEED_IMPORTED = False +_LAST_RUNTIME_SIGNATURE = None + + +def _is_npu() -> bool: + return Platform.device_prefix() == 'npu' + + +def ensure_mindspeed_adaptor_patched() -> None: + """Import MindSpeed's official adaptor before any mcore/TE import on NPU. + + ``mcore_bridge.__init__`` immediately imports its patcher, and that patcher + pulls in ``megatron.core`` and TE symbols at module import time. MindSpeed's + patch stack must land before that import chain, otherwise TE symbols and + ``torch.compile``-related hooks are bound too early. 
+ """ + global _MINDSPEED_IMPORTED + if not _is_npu() or _MINDSPEED_IMPORTED: + return + import mindspeed.megatron_adaptor # noqa: F401 + _MINDSPEED_IMPORTED = True + + +def _jsonable(value: Any) -> Any: + if isinstance(value, torch.dtype): + return str(value) + if isinstance(value, dict): + return {k: _jsonable(v) for k, v in value.items()} + if isinstance(value, (list, tuple)): + return [_jsonable(v) for v in value] + return value + + +def _is_runtime_value(value: Any) -> bool: + return isinstance(value, (type(None), bool, int, float, str, list, tuple, dict, torch.dtype)) + + +def _compute_optimization_level(config: Any) -> int: + num_moe_experts = getattr(config, 'num_moe_experts', None) + has_moe = num_moe_experts not in (None, 0, 1) + # MindSpeed's context-parallel feature stack is gated behind optimization + # level 2. If Twinkle launches a CP run with the default level 0, the CP + # patch set never gets registered and ring state stays uninitialized. + if int(getattr(config, 'context_parallel_size', 1) or 1) > 1: + return 2 + if getattr(config, 'multi_latent_attention', False): + return 2 + if has_moe and getattr(config, 'moe_grouped_gemm', False): + return 2 + if getattr(config, 'schedules_method', None) == 'dualpipev': + return 2 + return 0 + + +def _force_megatron_cp_te_patch(runtime_args: argparse.Namespace) -> None: + """Twinkle-side override for MindSpeed TE CP class selection on NPU. + + MindSpeed 0.15.3 routes TE context parallel through a factory that only + accepts `kvallgather_cp_algo`. Twinkle still wants the default + `megatron_cp_algo` ring path for the Megatron smoke, so we override the TE + class back to the older `MindSpeedCPDotProductAttention` from the Twinkle + runtime layer instead of changing MindSpeed sources. 
+ """ + if not _is_npu(): + return + if int(getattr(runtime_args, 'context_parallel_size', 1)) <= 1: + return + if getattr(runtime_args, 'context_parallel_algo', 'megatron_cp_algo') != 'megatron_cp_algo': + return + + from mindspeed.core.context_parallel.adaptor import MindSpeedCPDotProductAttention + from mindspeed.patch_utils import MindSpeedPatchesManager + + MindSpeedPatchesManager.register_patch( + 'megatron.core.extensions.transformer_engine.TEDotProductAttention', + MindSpeedCPDotProductAttention, + force_patch=True, + ) + MindSpeedPatchesManager.apply_patches() + logger.info('Forced TEDotProductAttention to MindSpeedCPDotProductAttention for megatron_cp_algo.') + + +def _ensure_megatron_cp_ring_state(runtime_args: argparse.Namespace) -> None: + """Initialize MindSpeed's ring CP globals when the default path is selected. + + MindSpeed 0.15.x already owns the real ring-attention logic, but Twinkle can + still end up with the TE class patched back to the legacy CP path while the + ring globals remain unset. If that happens, the first forward dies in + ``get_ring_ranks_for_intra_window()`` even though the model parallel groups + are already up. We repair the MindSpeed module state here, from Twinkle, so + the shared runtime behavior stays intact without editing MindSpeed sources. 
+ """ + if not _is_npu(): + return + if int(getattr(runtime_args, 'context_parallel_size', 1)) <= 1: + return + if getattr(runtime_args, 'context_parallel_algo', 'megatron_cp_algo') != 'megatron_cp_algo': + return + if not torch.distributed.is_initialized(): + return + + from mindspeed.core.context_parallel import model_parallel_utils as cp_utils + + try: + cp_utils.get_ring_ranks_for_intra_window() + return + except AssertionError: + pass + + from megatron.core import mpu + + cp_utils.initialize_context_parallel_group_for_double_ring( + mpu.get_tensor_model_parallel_world_size(), + mpu.get_pipeline_model_parallel_world_size(), + mpu.get_context_parallel_world_size(), + {}, + ) + logger.info('Initialized MindSpeed ring CP state for megatron_cp_algo from Twinkle bootstrap.') + + +def build_mindspeed_runtime_args(config: Any) -> argparse.Namespace: + """Build the runtime namespace MindSpeed 0.15.3 consumes on NPU. + + We start from MindSpeed feature defaults and overlay the current + ``ModelConfig`` values. The config object is already the single source of + truth in the new Twinkle + mcore-bridge architecture, so we do not keep a + second Twinkle-side args protocol here. 
+ """ + from mindspeed.args_utils import get_mindspeed_args + + defaults = get_mindspeed_args(get_defaults=True) + values: Dict[str, Any] = vars(defaults).copy() + + for key, value in vars(config).items(): + if key.startswith('_') or key in {'bridge', 'model_meta', 'hf_config'}: + continue + if not _is_runtime_value(value): + continue + values[key] = value + + num_moe_experts = getattr(config, 'num_moe_experts', None) + if num_moe_experts not in (None, 0): + values['num_experts'] = num_moe_experts + values['num_moe_experts'] = num_moe_experts + + if getattr(config, 'multi_latent_attention', False): + values['multi_head_latent_attention'] = True + if getattr(config, 'qk_head_dim', None) is not None: + values['qk_nope_head_dim'] = config.qk_head_dim + if getattr(config, 'qk_pos_emb_head_dim', None) is not None: + values['qk_rope_head_dim'] = config.qk_pos_emb_head_dim + # MindSpeed's CP rotary-pos helper reads this flag directly even when the + # base Twinkle/MCore config path does not define it. 
+ values.setdefault('reset_position_ids', False) + + params_dtype = getattr(config, 'params_dtype', None) + if params_dtype == torch.bfloat16: + values['bf16'] = True + values['fp16'] = False + elif params_dtype == torch.float16: + values['fp16'] = True + values['bf16'] = False + elif params_dtype is not None: + values['fp16'] = False + values['bf16'] = False + + values['optimization_level'] = _compute_optimization_level(config) + return argparse.Namespace(**values) + + +def configure_mindspeed_runtime_args(config: Any) -> argparse.Namespace: + """Install current runtime args and repatch MindSpeed on signature changes.""" + global _LAST_RUNTIME_SIGNATURE + + if not _is_npu(): + return argparse.Namespace() + + ensure_mindspeed_adaptor_patched() + + from mindspeed import args_utils + from mindspeed.megatron_adaptor import repatch + + runtime_args = build_mindspeed_runtime_args(config) + args_utils._MINDSPEED_ARGS = runtime_args + + runtime_signature = json.dumps( + { + k: _jsonable(v) + for k, v in sorted(vars(runtime_args).items()) + }, + sort_keys=True, + ensure_ascii=True, + ) + if runtime_signature != _LAST_RUNTIME_SIGNATURE: + repatch(vars(runtime_args)) + _LAST_RUNTIME_SIGNATURE = runtime_signature + logger.info( + 'Configured MindSpeed runtime args for NPU, optimization_level=%s', + getattr(runtime_args, 'optimization_level', None), + ) + _force_megatron_cp_te_patch(runtime_args) + _ensure_megatron_cp_ring_state(runtime_args) + return runtime_args diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py index 9b485f55..d214d653 100644 --- a/src/twinkle/model/megatron/megatron.py +++ b/src/twinkle/model/megatron/megatron.py @@ -16,7 +16,7 @@ from peft.tuners.lora import Linear as LoraLinear from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler -from transformers import PretrainedConfig +from transformers import PreTrainedConfig from typing import Any, Callable, Dict, Generator, List, Literal, 
Optional, Tuple, Type, Union import twinkle @@ -35,6 +35,7 @@ from twinkle.processor import InputProcessor from twinkle.template import Template from twinkle.utils import construct_class, get_logger, selective_log_softmax +from ._mindspeed_runtime import ensure_mindspeed_adaptor_patched from .strategy import MegatronStrategy logger = get_logger() @@ -83,7 +84,7 @@ class MegatronModel(TwinkleModel, nn.Module, CheckpointEngineMixin): def __init__( self, model_id: str, - config: Optional[PretrainedConfig] = None, + config: Optional[PreTrainedConfig] = None, ddp_config: Optional[Dict[str, Any]] = None, device_mesh: Optional[DeviceMesh] = None, mixed_precision: Literal['no', 'fp16', 'bf16'] = 'bf16', @@ -95,7 +96,6 @@ def __init__( **kwargs, ): requires('megatron_core') - requires('mcore_bridge') os.environ['TOKENIZERS_PARALLELISM'] = 'true' os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' nn.Module.__init__(self) @@ -111,6 +111,11 @@ def __init__( self.variable_seq_lengths = kwargs.get('variable_seq_lengths', False) torch_util.set_device() self._try_init_process_group() + self._ensure_megatron_process_group() + # MindSpeed must patch before mcore_bridge imports its patcher, otherwise + # mcore_bridge pulls in megatron.core/TE too early on NPU. + ensure_mindspeed_adaptor_patched() + requires('mcore_bridge') kwargs.update({ 'recompute_granularity': recompute_granularity, @@ -146,6 +151,32 @@ def __init__( self.active_group = _default_adapter_name MegatronPeft().__call__() + def _ensure_megatron_process_group(self): + """Megatron still requires a default PG even for single-rank local smoke. + + TwinkleModel._try_init_process_group() intentionally skips world_size==1, + because most frameworks do not need a default process group there. + Megatron is different: initialize_model_parallel() still assumes a default + PG already exists, so local NPU/GPU smoke needs a 1-rank fallback PG here. 
+ """ + import torch.distributed as dist + + if not dist.is_initialized(): + from twinkle import find_free_port + + backend = Platform.device_backend() + init_kwargs = { + 'backend': backend, + 'init_method': f'tcp://127.0.0.1:{find_free_port()}', + 'rank': 0, + 'world_size': 1, + } + # Keep NCCL's device binding behavior, but avoid binding HCCL default PG + # here so the later Gloo sub-groups stay decoupled on NPU. + if backend == 'nccl': + init_kwargs['device_id'] = torch.device(Platform.get_local_device()) + dist.init_process_group(**init_kwargs) + def _construct_default_optimizer_group(self): return MegatronOptimizerGroup( loss_instance=CrossEntropyLoss(reduction='sum'), @@ -358,6 +389,26 @@ def post_loss_function(output_tensor, inputs, logps): def forward_step_func(data_iterator, model): batch = next(data_iterator) labels = batch.pop('labels', None) + # MindSpeed 0.15.3 patches TE attention to a flash-attention based + # NPU implementation. That path expects to generate its own + # compressed causal mask (for example [2048, 2048]) when + # ``attention_mask`` is ``None``. Twinkle's generic Megatron + # processor, however, always expands the 1D token mask into a 4D + # dense causal mask. On NPU this makes FlashAttention receive the + # wrong mask shape and the real 8-card run fails in + # ``aclnnFlashAttentionScore``. For decoder-only causal training + # with right padding, the 4D mask is redundant: a causal mask + # already prevents valid tokens from attending to the padded tail, + # and padded query positions are ignored by labels == -100. So on + # the NPU TE path, drop this dense mask and let MindSpeed build the + # compressed causal mask it requires. 
+ if Platform.device_prefix() == 'npu': + attention_mask = batch.get('attention_mask') + if isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 4: + unwrapped_model = self.strategy.unwrap_model([model])[0] + attention_mask_type = getattr(unwrapped_model.config, 'attention_mask_type', None) + if attention_mask_type == 'causal': + batch['attention_mask'] = None # Handle disable_lora for base model inference (e.g., reference in DPO) unwrapped_model = self.strategy.unwrap_model([model])[0] if disable_lora and isinstance(unwrapped_model, PeftModel): diff --git a/src/twinkle/model/megatron/multi_lora_megatron.py b/src/twinkle/model/megatron/multi_lora_megatron.py index 77cb330e..d05ac7a9 100644 --- a/src/twinkle/model/megatron/multi_lora_megatron.py +++ b/src/twinkle/model/megatron/multi_lora_megatron.py @@ -16,6 +16,7 @@ from twinkle.metric import Metric from twinkle.processor import InputProcessor from ..multi_lora import MultiLora +from ._mindspeed_runtime import ensure_mindspeed_adaptor_patched from .megatron import MegatronModel from .strategy import MegatronStrategy @@ -41,7 +42,6 @@ def __init__( **kwargs, ): requires('megatron_core') - requires('mcore_bridge') os.environ['TOKENIZERS_PARALLELISM'] = 'true' os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' nn.Module.__init__(self) @@ -59,6 +59,9 @@ def __init__( self.optimizer_group = {} torch_util.set_device() self._try_init_process_group() + self._ensure_megatron_process_group() + ensure_mindspeed_adaptor_patched() + requires('mcore_bridge') kwargs.update({ 'recompute_granularity': recompute_granularity, diff --git a/src/twinkle/model/megatron/strategy/megatron.py b/src/twinkle/model/megatron/strategy/megatron.py index b9e66505..74d01457 100644 --- a/src/twinkle/model/megatron/strategy/megatron.py +++ b/src/twinkle/model/megatron/strategy/megatron.py @@ -5,6 +5,33 @@ from typing import Any, Dict, List, Literal, Optional from twinkle import DeviceMesh, Platform, torch_util +from twinkle.utils import 
get_logger +from .._mindspeed_runtime import configure_mindspeed_runtime_args + +logger = get_logger() + + +def finalize_model_grads_for_lora(model, *args, **kwargs): + """Only enter Megatron native finalize when the wrapped model has sync capability. + + In single-rank/no-op wrap cases Twinkle attaches ``ddp_config`` to the bare + module for optimizer compatibility, but that does not mean the model really + implements ``finish_grad_sync()``. Native Megatron finalize ultimately calls + that method, so we gate by runtime capability instead of config metadata. + """ + from megatron.core.distributed import DistributedDataParallel as MegatronDDP + from megatron.core.distributed import finalize_model_grads as _native_finalize_model_grads + from peft import PeftModel as _PeftModel + + def _get_base_model(m): + if isinstance(m, _PeftModel): + return _get_base_model(m.base_model.model) + return m + + base_model = _get_base_model(model[0]) + if isinstance(base_model, MegatronDDP) or hasattr(base_model, 'finish_grad_sync'): + return _native_finalize_model_grads(model, *args, **kwargs) + return None class MegatronStrategy: @@ -21,6 +48,7 @@ def __init__( ddp_config: Dict[str, Any] = None, **kwargs, ): + import torch.distributed as dist from megatron.core import mpu self.device_mesh = device_mesh self.use_distributed_optimizer = use_distributed_optimizer @@ -34,6 +62,15 @@ def __init__( self.hf_config = AutoConfig.from_pretrained(self.model_dir, trust_remote_code=True) else: self.hf_config = config + num_experts = getattr(self.hf_config, 'num_experts', getattr(self.hf_config, 'num_local_experts', None)) + if (num_experts not in (None, 0, 1) and (self.device_mesh.tp_world_size or 1) > 1 + and not getattr(self.device_mesh, 'sequence_parallel', False)): + # Megatron 0.15.3 requires sequence parallelism for MoE training when + # tensor parallelism is enabled. 
Keep this policy in the framework so + # cookbook scripts do not need to know a model-family-specific + # runtime constraint just to launch a valid MoE run. + self.device_mesh.sequence_parallel = True + logger.info('Auto-enabled sequence_parallel for MoE model with tensor parallelism.') if 'overlap_grad_reduce' not in self.ddp_config: self.ddp_config['overlap_grad_reduce'] = False if 'overlap_param_gather' not in self.ddp_config: @@ -69,10 +106,22 @@ def __init__( if 'overlap_p2p_comm' not in kwargs: kwargs['overlap_p2p_comm'] = True kwargs['batch_p2p_comm'] = not kwargs['overlap_p2p_comm'] - mpu.initialize_model_parallel( - order=self.device_mesh.order, + if Platform.device_prefix() == 'npu' and dist.is_initialized(): + default_pg = dist.distributed_c10d._get_default_group() + if getattr(default_pg, 'bound_device_id', None) is not None: + # If the default HCCL PG keeps a bound device id, PyTorch may + # propagate that binding into later Gloo subgroup creation. That + # breaks the metrics/object-gather path on NPU, so clear it + # before Megatron creates its Gloo DP groups. 
+ default_pg.bound_device_id = None + + init_kwargs = { + 'order': self.device_mesh.order, **parallel_kwargs, - ) + } + if Platform.device_prefix() == 'npu': + init_kwargs['create_gloo_process_groups'] = True + mpu.initialize_model_parallel(**init_kwargs) from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed model_parallel_cuda_manual_seed(self.seed) self.config = self.get_model_config(self.hf_config, parallel_kwargs, **kwargs) @@ -225,7 +274,6 @@ def get_model_config( **kwargs, ): from mcore_bridge import ModelConfig, hf_to_mcore_config - from megatron.core.distributed import finalize_model_grads as _native_finalize_model_grads config_kwargs = hf_to_mcore_config(hf_config) config_kwargs.update(kwargs) if 'calculate_per_token_loss' not in config_kwargs: @@ -233,24 +281,7 @@ def get_model_config( if 'moe_token_dispatcher_type' not in config_kwargs: config_kwargs['moe_token_dispatcher_type'] = 'alltoall' if self.variable_seq_lengths else 'allgather' - - def finalize_model_grads_for_lora(model, *args, **kwargs): - from megatron.core.distributed import DistributedDataParallel as MegatronDDP - from peft import PeftModel as _PeftModel - - # Check if model is DDP-wrapped (has ddp_config) - # Need to unwrap PeftModel to check the underlying model - def _get_base_model(m): - if isinstance(m, _PeftModel): - return _get_base_model(m.base_model.model) - return m - - base_model = _get_base_model(model[0]) - if isinstance(base_model, MegatronDDP) or hasattr(base_model, 'ddp_config'): - # Use native implementation for DDP models - return _native_finalize_model_grads(model, *args, **kwargs) - - return ModelConfig( + model_config = ModelConfig( use_cpu_initialization=True, params_dtype=self.params_type, sequence_parallel=self.sequence_parallel, @@ -259,6 +290,18 @@ def _get_base_model(m): **parallel_kwargs, **config_kwargs, ) + if Platform.device_prefix() == 'npu': + # After Twinkle stops feeding the dense 4D causal mask, MindSpeed's + # patched TE 
attention should generate its own compressed causal + # mask. In 0.15.3 that path is gated by ``use_flash_attn`` on the + # model config itself. If we leave it unset, MindSpeed falls back to + # the non-flash mask generator and aborts the first 8-card forward + # with: "Please set micro_batch_size or set use_flash_attn=True in + # config." Keep the TE flash path enabled and let it synthesize the + # mask it expects. + model_config.use_flash_attn = True + configure_mindspeed_runtime_args(model_config) + return model_config def create_megatron_model( self, diff --git a/src/twinkle/utils/framework.py b/src/twinkle/utils/framework.py index 0cdeb81d..7f63d086 100644 --- a/src/twinkle/utils/framework.py +++ b/src/twinkle/utils/framework.py @@ -42,6 +42,15 @@ def gather_object(object: Any, device_mesh: DeviceMesh, process_group=None): import torch.distributed as dist output_objects = [object] if device_mesh is not None and device_mesh.data_world_size > 1: + if Platform.device_prefix() == 'npu': + # On NPU, letting Python object collectives use the default HCCL + # group previously hung in 8-card metric collection at + # ``dist.all_gather_object(...)``. Reuse Megatron's dedicated Gloo + # DP group instead. When CP is enabled we must pick the DP+CP + # variant, otherwise the rank span for metric aggregation is wrong. 
+ from megatron.core import parallel_state as mpu + process_group = mpu.get_data_parallel_group_gloo( + with_context_parallel=getattr(device_mesh, 'cp_world_size', 1) > 1) group_size = dist.get_world_size(group=process_group) output_objects = [None for _ in range(group_size)] dist.all_gather_object(output_objects, object, group=process_group) From b955af99a15bac5990696754405e7a8899f7c504 Mon Sep 17 00:00:00 2001 From: addsubmuldiv Date: Mon, 13 Apr 2026 10:57:16 +0800 Subject: [PATCH 02/10] fix --- src/twinkle/model/megatron/megatron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py index d214d653..d400af73 100644 --- a/src/twinkle/model/megatron/megatron.py +++ b/src/twinkle/model/megatron/megatron.py @@ -407,7 +407,7 @@ def forward_step_func(data_iterator, model): if isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 4: unwrapped_model = self.strategy.unwrap_model([model])[0] attention_mask_type = getattr(unwrapped_model.config, 'attention_mask_type', None) - if attention_mask_type == 'causal': + if attention_mask_type == 'causal' and self.device_mesh.cp_world_size > 1: batch['attention_mask'] = None # Handle disable_lora for base model inference (e.g., reference in DPO) unwrapped_model = self.strategy.unwrap_model([model])[0] From 1adf22307f86958c4879a441f1700266226bef68 Mon Sep 17 00:00:00 2001 From: addsubmuldiv Date: Mon, 13 Apr 2026 14:33:16 +0800 Subject: [PATCH 03/10] update doc of npu --- docs/source_en/Usage Guide/NPU-Support.md | 80 ++++++++++++++++--- ...PU\347\232\204\346\224\257\346\214\201.md" | 70 +++++++++++++--- 2 files changed, 130 insertions(+), 20 deletions(-) diff --git a/docs/source_en/Usage Guide/NPU-Support.md b/docs/source_en/Usage Guide/NPU-Support.md index e2b5e6da..81fa76dc 100644 --- a/docs/source_en/Usage Guide/NPU-Support.md +++ b/docs/source_en/Usage Guide/NPU-Support.md @@ -10,7 +10,7 @@ Before getting started, 
please ensure your system meets the following requiremen |------------------------------|----------------------------|--------------------------------------| | Python | >= 3.11, < 3.13 | Twinkle framework requirement | | Ascend Firmware Driver (HDK) | Latest version recommended | Hardware driver and firmware | -| CANN Toolkit | 8.3.RC1 or higher | Heterogeneous Computing Architecture | +| CANN Toolkit | 8.5.1 or higher | Heterogeneous Computing Architecture | | PyTorch | 2.7.1 | Deep learning framework | | torch_npu | 2.7.1 | Ascend PyTorch adapter plugin | @@ -44,7 +44,7 @@ This documentation includes: - Python: 3.11 - PyTorch: 2.7.1 - torch_npu: 2.7.1 -- CANN: 8.3.RC1 or higher +- CANN: 8.5.1 or higher ### 2. Install Twinkle @@ -64,16 +64,16 @@ If you need to use vLLMSampler for efficient inference, you can install vLLM and ```bash # Step 1: Install vLLM -pip install vllm==0.11.0 +pip install vllm==0.14.0 # Step 2: Install vLLM-Ascend -pip install vllm-ascend==0.11.0rc3 +pip install vllm-ascend==0.14.0rc1 ``` **Notes**: - Install in the above order, ignoring possible dependency conflict warnings - Ensure CANN environment is activated before installation: `source /usr/local/Ascend/ascend-toolkit/set_env.sh` -- Recommended versions are vLLM 0.11.0 and vLLM-Ascend 0.11.0rc3 +- Recommended versions are vLLM 0.14.0 and vLLM-Ascend 0.14.0rc1 ### 4. Verify Installation @@ -109,6 +109,66 @@ If the output shows `NPU available: True` and no errors, installation is success **Note**: Twinkle does not currently provide NPU Docker images. Manual installation is recommended. For containerized deployment, please refer to official images from the Ascend community. +### 5. Install Megatron Backend Dependencies + +**Recommended versions**: +- Megatron-LM: `v0.15.3` +- MindSpeed: `core_r0.15.3` +- mcore-bridge: main branch or the version already validated in your Twinkle checkout + +**Installation steps**: + +```bash +# 1. 
Clone Megatron-LM and pin the compatible version +git clone https://github.com/NVIDIA/Megatron-LM.git +cd Megatron-LM +git checkout v0.15.3 +cd .. + +# 2. Clone and install MindSpeed +git clone https://gitcode.com/Ascend/MindSpeed.git +cd MindSpeed +git checkout core_r0.15.3 +pip install -e . +cd .. + +# 3. Clone and install mcore-bridge +git clone https://github.com/modelscope/mcore-bridge.git +cd mcore-bridge +pip install -e . +cd .. + +# 4. Install Twinkle if needed +cd twinkle +pip install -e ".[transformers,ray]" +``` + +**Runtime environment variables**: + +```bash +export PYTHONPATH=$PYTHONPATH:/home/zyh/code1/Megatron-LM +export MEGATRON_LM_PATH=/home/zyh/code1/Megatron-LM +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +``` + +**Verification**: + +First run a minimal import check to make sure the current environment can resolve MindSpeed and Megatron-LM: + +```bash +python -c "import mindspeed.megatron_adaptor; from twinkle.model.megatron._mindspeed_runtime import ensure_mindspeed_adaptor_patched; ensure_mindspeed_adaptor_patched(); print('✓ Megatron backend imports are ready')" +``` + +Then run the cookbook smoke tests to verify the actual TP / MoE / CP training paths: + +```bash +torchrun --standalone --nproc_per_node=8 cookbook/megatron/tp_npu.py +torchrun --standalone --nproc_per_node=8 cookbook/megatron/tp_moe_npu.py +torchrun --standalone --nproc_per_node=8 cookbook/megatron/tp_moe_cp_npu.py +``` + +If you only want to validate the base Megatron TP path first, start with `tp_npu.py`. + ## Quick Start **Important Notice**: The following examples are from the `cookbook/` directory and have been verified in actual NPU environments. It is recommended to run scripts directly from the cookbook rather than copying and pasting code snippets. 
@@ -167,10 +227,10 @@ Twinkle currently supports the following **verified** parallelization strategies |---------|------|---------|---------| | DP (Data Parallel) | Data parallelism | ✅ | Verified (see cookbook/sft/lora_npu.py) | | FSDP (Fully Sharded Data Parallel) | Fully sharded data parallelism | ✅ | Verified (see cookbook/sft/lora_npu.py) | -| TP (Tensor Parallel) | Tensor parallelism (Megatron) | 🚧 | To be verified | -| PP (Pipeline Parallel) | Pipeline parallelism (Megatron) | 🚧 | To be verified | -| CP (Context Parallel) | Context parallelism | 🚧 | To be verified | -| EP (Expert Parallel) | Expert parallelism (MoE) | 🚧 | To be verified | +| TP (Tensor Parallel) | Tensor parallelism (Megatron) | ✅ | Verified (see cookbook/megatron/tp_npu.py) | +| PP (Pipeline Parallel) | Pipeline parallelism (Megatron) | ✅ | Verified (see cookbook/megatron/tp_npu.py) | +| CP (Context Parallel) | Context parallelism | ✅ | Verified (see cookbook/megatron/tp_moe_cp_npu.py) | +| EP (Expert Parallel) | Expert parallelism (MoE) | ✅ | Verified (see cookbook/megatron/tp_moe_npu.py) | **Legend**: - ✅ Verified: Has actual running example code @@ -193,7 +253,7 @@ device_mesh = DeviceMesh( ) ``` -**Note**: Megatron backend (TP/PP/EP) support on NPU is under development, with no available examples yet. If you need these advanced parallelization strategies, please verify in GPU environment first or follow project updates. +**Megatron backend note**: Twinkle now provides runnable NPU smoke scripts for the Megatron backend. Please follow the installation section above before running the cookbook examples, and start with `cookbook/megatron/tp_npu.py` before moving on to `tp_moe_npu.py` and `tp_moe_cp_npu.py`. 
## Common Issues diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" index 3241dbf5..d0a04bed 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" @@ -10,7 +10,7 @@ |------|---------|------| | Python | >= 3.11, < 3.13 | Twinkle 框架要求 | | 昇腾固件驱动(HDK) | 推荐最新版本 | 硬件驱动和固件 | -| CANN 工具包 | 8.3.RC1 或更高 | 异构计算架构 | +| CANN 工具包 | 8.5.1 或更高 | 异构计算架构 | | PyTorch | 2.7.1 | 深度学习框架 | | torch_npu | 2.7.1 | 昇腾 PyTorch 适配插件 | @@ -44,7 +44,7 @@ NPU 环境的安装包括昇腾驱动、CANN 工具包、PyTorch 和 torch_npu - Python: 3.11 - PyTorch: 2.7.1 - torch_npu: 2.7.1 -- CANN: 8.3.RC1 或更高 +- CANN: 8.5.1 或更高 ### 2. 安装 Twinkle @@ -64,16 +64,16 @@ pip install -e ".[transformers,ray]" ```bash # 第一步:安装 vLLM -pip install vllm==0.11.0 +pip install vllm==0.14.0 # 第二步:安装 vLLM-Ascend -pip install vllm-ascend==0.11.0rc3 +pip install vllm-ascend==0.14.0rc1 ``` **注意事项**: - 按照上述顺序安装,忽略可能的依赖冲突提示 - 安装前确保已激活 CANN 环境:`source /usr/local/Ascend/ascend-toolkit/set_env.sh` -- 推荐使用的版本为 vLLM 0.11.0 和 vLLM-Ascend 0.11.0rc3 +- 推荐使用的版本为 vLLM 0.14.0 和 vLLM-Ascend 0.14.0rc1 ### 4. 验证安装 @@ -109,6 +109,56 @@ python verify_npu.py **注意**:目前 Twinkle 暂未提供 NPU 的 Docker 镜像,建议使用手动安装方式。如需容器化部署,请参考昇腾社区的官方镜像。 +### 5. 安装 Megatron 后端依赖 + +**推荐组合**: +- Megatron-LM: `v0.15.3` +- MindSpeed: `core_r0.15.3` +- mcore-bridge: 主分支或当前 Twinkle 验证过的版本 + +**安装步骤**: + +```bash +# 1. 获取 Megatron-LM,并切到 Twinkle 兼容版本 +git clone https://github.com/NVIDIA/Megatron-LM.git +cd Megatron-LM +git checkout v0.15.3 +cd .. + +# 2. 获取并安装 MindSpeed +git clone https://gitcode.com/Ascend/MindSpeed.git +cd MindSpeed +git checkout core_r0.15.3 +pip install -e . +cd .. + +# 3. 
获取并安装 mcore-bridge
+git clone https://github.com/modelscope/mcore-bridge.git
+cd mcore-bridge
+pip install -e .
+cd ..
+
+# 4. 安装 Twinkle(如果还没有安装)
+cd twinkle
+pip install -e ".[transformers,ray]"
+```
+
+**运行前环境变量**:
+
+```bash
+export PYTHONPATH=$PYTHONPATH:/path/to/Megatron-LM
+export MEGATRON_LM_PATH=/path/to/Megatron-LM
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+```
+
+**验证方式**:
+
+先跑一个最小导入检查,确认 MindSpeed / Megatron-LM 可以被当前环境找到:
+
+```bash
+python -c "import mindspeed.megatron_adaptor; from twinkle.model.megatron._mindspeed_runtime import ensure_mindspeed_adaptor_patched; ensure_mindspeed_adaptor_patched(); print('✓ Megatron backend imports are ready')"
+```
+
 ## 快速开始
 
 **重要提示**:以下示例均来自 `cookbook/` 目录,已在实际 NPU 环境中验证通过。建议直接运行 cookbook 中的脚本,而不是复制粘贴代码片段。
@@ -167,10 +217,10 @@ Twinkle 在 NPU 上目前支持以下**经过验证**的并行策略:
 |---------|------|---------|---------|
 | DP (Data Parallel) | 数据并行 | ✅ | 已验证(见 cookbook/sft/lora_npu.py) |
 | FSDP (Fully Sharded Data Parallel) | 完全分片数据并行 | ✅ | 已验证(见 cookbook/sft/lora_npu.py) |
-| TP (Tensor Parallel) | 张量并行(Megatron) | 🚧 | 待验证 |
-| PP (Pipeline Parallel) | 流水线并行(Megatron) | 🚧 | 待验证 |
-| CP (Context Parallel) | 上下文并行 | 🚧 | 待验证 |
-| EP (Expert Parallel) | 专家并行(MoE) | 🚧 | 待验证 |
+| TP (Tensor Parallel) | 张量并行(Megatron) | ✅ | 已验证(见 cookbook/megatron/tp_npu.py) |
+| PP (Pipeline Parallel) | 流水线并行(Megatron) | ✅ | 已验证(见 cookbook/megatron/tp_npu.py) |
+| CP (Context Parallel) | 上下文并行 | ✅ | 已验证(见 cookbook/megatron/tp_moe_cp_npu.py) |
+| EP (Expert Parallel) | 专家并行(MoE) | ✅ | 已验证(见 cookbook/megatron/tp_moe_npu.py) |
 
 **图例说明**:
 - ✅ 已验证:有实际运行示例代码
@@ -193,7 +243,7 @@ device_mesh = DeviceMesh(
 )
 ```
 
-**注意**:Megatron 后端(TP/PP/EP)在 NPU 上的支持正在开发中,暂无可用示例。如需使用这些高级并行策略,请先在 GPU 环境下验证,或关注项目更新。
+**Megatron 后端说明**:Twinkle 的 Megatron NPU 路径已经提供了可直接运行的 smoke 示例,安装和运行依赖请参考上面的 “Megatron 后端依赖” 小节。当前优先建议从 `cookbook/megatron/tp_npu.py` 开始验证,再逐步切到 `tp_moe_npu.py` 和 `tp_moe_cp_npu.py`。
 
 ## 常见问题
 
From 2f9dd406b8c8d37beeb02e531c6fcaa259f744b3 Mon Sep 17 00:00:00 2001
From: addsubmuldiv <2362894675@qq.com>
Date: Mon, 13 Apr 2026 16:50:19 +0800 Subject: [PATCH 04/10] Update docs/source_en/Usage Guide/NPU-Support.md Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/source_en/Usage Guide/NPU-Support.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source_en/Usage Guide/NPU-Support.md b/docs/source_en/Usage Guide/NPU-Support.md index 81fa76dc..dc78981b 100644 --- a/docs/source_en/Usage Guide/NPU-Support.md +++ b/docs/source_en/Usage Guide/NPU-Support.md @@ -146,8 +146,8 @@ pip install -e ".[transformers,ray]" **Runtime environment variables**: ```bash -export PYTHONPATH=$PYTHONPATH:/home/zyh/code1/Megatron-LM -export MEGATRON_LM_PATH=/home/zyh/code1/Megatron-LM +export PYTHONPATH=$PYTHONPATH: +export MEGATRON_LM_PATH= export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ``` From b493a22b8bd5f1f800191c81d91bb7076ebacc7a Mon Sep 17 00:00:00 2001 From: addsubmuldiv Date: Mon, 13 Apr 2026 17:10:26 +0800 Subject: [PATCH 05/10] fix --- src/twinkle/model/megatron/megatron.py | 2 +- src/twinkle/utils/framework.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py index d400af73..d214d653 100644 --- a/src/twinkle/model/megatron/megatron.py +++ b/src/twinkle/model/megatron/megatron.py @@ -407,7 +407,7 @@ def forward_step_func(data_iterator, model): if isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 4: unwrapped_model = self.strategy.unwrap_model([model])[0] attention_mask_type = getattr(unwrapped_model.config, 'attention_mask_type', None) - if attention_mask_type == 'causal' and self.device_mesh.cp_world_size > 1: + if attention_mask_type == 'causal': batch['attention_mask'] = None # Handle disable_lora for base model inference (e.g., reference in DPO) unwrapped_model = self.strategy.unwrap_model([model])[0] diff --git a/src/twinkle/utils/framework.py 
b/src/twinkle/utils/framework.py index 7f63d086..e8573175 100644 --- a/src/twinkle/utils/framework.py +++ b/src/twinkle/utils/framework.py @@ -48,9 +48,12 @@ def gather_object(object: Any, device_mesh: DeviceMesh, process_group=None): # ``dist.all_gather_object(...)``. Reuse Megatron's dedicated Gloo # DP group instead. When CP is enabled we must pick the DP+CP # variant, otherwise the rank span for metric aggregation is wrong. - from megatron.core import parallel_state as mpu - process_group = mpu.get_data_parallel_group_gloo( - with_context_parallel=getattr(device_mesh, 'cp_world_size', 1) > 1) + try: + from megatron.core import parallel_state as mpu + process_group = mpu.get_data_parallel_group_gloo( + with_context_parallel=getattr(device_mesh, 'cp_world_size', 1) > 1) + except (ImportError, ModuleNotFoundError): + pass group_size = dist.get_world_size(group=process_group) output_objects = [None for _ in range(group_size)] dist.all_gather_object(output_objects, object, group=process_group) From 09ce01435aee1093a0fce069210b24224136db16 Mon Sep 17 00:00:00 2001 From: addsubmuldiv Date: Mon, 13 Apr 2026 19:18:39 +0800 Subject: [PATCH 06/10] update cookbook and doc --- cookbook/megatron/ascend/tp_moe_cp_npu.py | 61 +++++++++++++++++++ cookbook/megatron/ascend/tp_moe_cp_npu.sh | 1 + cookbook/megatron/ascend/tp_moe_npu.py | 60 ++++++++++++++++++ cookbook/megatron/ascend/tp_moe_npu.sh | 1 + cookbook/megatron/ascend/tp_npu.py | 61 +++++++++++++++++++ cookbook/megatron/ascend/tp_npu.sh | 1 + docs/source_en/Usage Guide/NPU-Support.md | 20 ++---- ...PU\347\232\204\346\224\257\346\214\201.md" | 10 +-- 8 files changed, 195 insertions(+), 20 deletions(-) create mode 100644 cookbook/megatron/ascend/tp_moe_cp_npu.py create mode 100755 cookbook/megatron/ascend/tp_moe_cp_npu.sh create mode 100644 cookbook/megatron/ascend/tp_moe_npu.py create mode 100755 cookbook/megatron/ascend/tp_moe_npu.sh create mode 100644 cookbook/megatron/ascend/tp_npu.py create mode 100755 
cookbook/megatron/ascend/tp_npu.sh diff --git a/cookbook/megatron/ascend/tp_moe_cp_npu.py b/cookbook/megatron/ascend/tp_moe_cp_npu.py new file mode 100644 index 00000000..a257cf3f --- /dev/null +++ b/cookbook/megatron/ascend/tp_moe_cp_npu.py @@ -0,0 +1,61 @@ +import twinkle +from peft import LoraConfig + +from twinkle import DeviceMesh, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import MegatronModel +from twinkle.preprocessor import SelfCognitionProcessor + +MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B' +DATASET_ID = 'ms://swift/self-cognition' +DATASET_SLICE = range(128) +BATCH_SIZE = 2 +MAX_STEPS = 10 + +# Keep the original 8-card MoE + CP layout so we can verify the default +# megatron_cp_algo path after repatching TEDotProductAttention back to the +# older MindSpeedCPDotProductAttention. +device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2, ep_size=2, device_type='npu') +twinkle.initialize(mode='local', global_device_mesh=device_mesh) + +logger = get_logger() + + +def build_dataset(): + dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=DATASET_SLICE)) + dataset.set_template('Template', model_id=MODEL_ID, max_length=512) + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + dataset.encode() + return dataset + + +def build_model(total_steps: int): + model = MegatronModel(model_id=MODEL_ID) + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + model.add_adapter_to_model('default', lora_config) + model.set_optimizer(optimizer_cls='default', lr=1e-4) + model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=total_steps) + return model + + +def train(): + dataset = build_dataset() + dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0) + model = build_model(len(dataloader)) + + logger.info(get_device_placement()) + 
logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}, validating {MAX_STEPS} steps') + + for step, batch in enumerate(dataloader): + if step >= MAX_STEPS: + break + model.forward_backward(inputs=batch) + model.clip_grad_and_step() + metric = model.calculate_metric(is_training=True) + logger.info(f'[MoE CP NPU smoke] step {step + 1}/{MAX_STEPS}, metric: {metric}') + + +if __name__ == '__main__': + train() diff --git a/cookbook/megatron/ascend/tp_moe_cp_npu.sh b/cookbook/megatron/ascend/tp_moe_cp_npu.sh new file mode 100755 index 00000000..f10bb138 --- /dev/null +++ b/cookbook/megatron/ascend/tp_moe_cp_npu.sh @@ -0,0 +1 @@ +ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_cp_npu.py diff --git a/cookbook/megatron/ascend/tp_moe_npu.py b/cookbook/megatron/ascend/tp_moe_npu.py new file mode 100644 index 00000000..f38f9b18 --- /dev/null +++ b/cookbook/megatron/ascend/tp_moe_npu.py @@ -0,0 +1,60 @@ +import twinkle +from peft import LoraConfig + +from twinkle import DeviceMesh, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import MegatronModel +from twinkle.preprocessor import SelfCognitionProcessor + +MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B' +DATASET_ID = 'ms://swift/self-cognition' +DATASET_SLICE = range(128) +BATCH_SIZE = 2 +MAX_STEPS = 10 + +# Run the MoE smoke without context parallelism so we can isolate the MoE path +# itself on the same 8-card topology. 
+device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, cp_size=1, ep_size=2, device_type='npu') +twinkle.initialize(mode='local', global_device_mesh=device_mesh) + +logger = get_logger() + + +def build_dataset(): + dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=DATASET_SLICE)) + dataset.set_template('Template', model_id=MODEL_ID, max_length=512) + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + dataset.encode() + return dataset + + +def build_model(total_steps: int): + model = MegatronModel(model_id=MODEL_ID) + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + model.add_adapter_to_model('default', lora_config) + model.set_optimizer(optimizer_cls='default', lr=1e-4) + model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=total_steps) + return model + + +def train(): + dataset = build_dataset() + dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0) + model = build_model(len(dataloader)) + + logger.info(get_device_placement()) + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}, validating {MAX_STEPS} steps') + + for step, batch in enumerate(dataloader): + if step >= MAX_STEPS: + break + model.forward_backward(inputs=batch) + model.clip_grad_and_step() + metric = model.calculate_metric(is_training=True) + logger.info(f'[MoE NPU smoke] step {step + 1}/{MAX_STEPS}, metric: {metric}') + + +if __name__ == '__main__': + train() diff --git a/cookbook/megatron/ascend/tp_moe_npu.sh b/cookbook/megatron/ascend/tp_moe_npu.sh new file mode 100755 index 00000000..d9519da9 --- /dev/null +++ b/cookbook/megatron/ascend/tp_moe_npu.sh @@ -0,0 +1 @@ +ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_npu.py diff --git a/cookbook/megatron/ascend/tp_npu.py b/cookbook/megatron/ascend/tp_npu.py new file mode 100644 index 00000000..698bee12 --- /dev/null +++ b/cookbook/megatron/ascend/tp_npu.py @@ 
-0,0 +1,61 @@ +import twinkle +from peft import LoraConfig + +from twinkle import DeviceMesh, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import MegatronModel +from twinkle.preprocessor import SelfCognitionProcessor + +MODEL_ID = 'ms://Qwen/Qwen3-4B' +DATASET_ID = 'ms://swift/self-cognition' +DATASET_SLICE = range(256) +BATCH_SIZE = 8 +MAX_STEPS = 10 + +# Keep the same 8-card TP/PP/DP layout as the GPU reference script, but run it +# through the NPU backend to validate Megatron + MindSpeed integration. +device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, device_type='npu') +twinkle.initialize(mode='local', global_device_mesh=device_mesh) + +logger = get_logger() + + +def build_dataset(): + dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=DATASET_SLICE)) + # Qwen3-4B is a text-only model, so use the base template instead of the VL template. + dataset.set_template('Template', model_id=MODEL_ID, max_length=512) + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + dataset.encode() + return dataset + + +def build_model(total_steps: int): + model = MegatronModel(model_id=MODEL_ID) + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + model.add_adapter_to_model('default', lora_config) + model.set_optimizer(optimizer_cls='default', lr=1e-4) + model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=total_steps) + return model + + +def train(): + dataset = build_dataset() + dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0) + model = build_model(len(dataloader)) + + logger.info(get_device_placement()) + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}, validating {MAX_STEPS} steps') + + for step, batch in enumerate(dataloader): + if step >= MAX_STEPS: + break + model.forward_backward(inputs=batch) + 
model.clip_grad_and_step() + metric = model.calculate_metric(is_training=True) + logger.info(f'[NPU smoke] step {step + 1}/{MAX_STEPS}, metric: {metric}') + + +if __name__ == '__main__': + train() diff --git a/cookbook/megatron/ascend/tp_npu.sh b/cookbook/megatron/ascend/tp_npu.sh new file mode 100755 index 00000000..99c6848c --- /dev/null +++ b/cookbook/megatron/ascend/tp_npu.sh @@ -0,0 +1 @@ +ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_npu.py diff --git a/docs/source_en/Usage Guide/NPU-Support.md b/docs/source_en/Usage Guide/NPU-Support.md index dc78981b..099d88b5 100644 --- a/docs/source_en/Usage Guide/NPU-Support.md +++ b/docs/source_en/Usage Guide/NPU-Support.md @@ -159,16 +159,6 @@ First run a minimal import check to make sure the current environment can resolv python -c "import mindspeed.megatron_adaptor; from twinkle.model.megatron._mindspeed_runtime import ensure_mindspeed_adaptor_patched; ensure_mindspeed_adaptor_patched(); print('✓ Megatron backend imports are ready')" ``` -Then run the cookbook smoke tests to verify the actual TP / MoE / CP training paths: - -```bash -torchrun --standalone --nproc_per_node=8 cookbook/megatron/tp_npu.py -torchrun --standalone --nproc_per_node=8 cookbook/megatron/tp_moe_npu.py -torchrun --standalone --nproc_per_node=8 cookbook/megatron/tp_moe_cp_npu.py -``` - -If you only want to validate the base Megatron TP path first, start with `tp_npu.py`. - ## Quick Start **Important Notice**: The following examples are from the `cookbook/` directory and have been verified in actual NPU environments. It is recommended to run scripts directly from the cookbook rather than copying and pasting code snippets. 
@@ -227,10 +217,10 @@ Twinkle currently supports the following **verified** parallelization strategies |---------|------|---------|---------| | DP (Data Parallel) | Data parallelism | ✅ | Verified (see cookbook/sft/lora_npu.py) | | FSDP (Fully Sharded Data Parallel) | Fully sharded data parallelism | ✅ | Verified (see cookbook/sft/lora_npu.py) | -| TP (Tensor Parallel) | Tensor parallelism (Megatron) | ✅ | Verified (see cookbook/megatron/tp_npu.py) | -| PP (Pipeline Parallel) | Pipeline parallelism (Megatron) | ✅ | Verified (see cookbook/megatron/tp_npu.py) | -| CP (Context Parallel) | Context parallelism | ✅ | Verified (see cookbook/megatron/tp_moe_cp_npu.py) | -| EP (Expert Parallel) | Expert parallelism (MoE) | ✅ | Verified (see cookbook/megatron/tp_moe_npu.py) | +| TP (Tensor Parallel) | Tensor parallelism (Megatron) | ✅ | Verified (Megatron NPU smoke example) | +| PP (Pipeline Parallel) | Pipeline parallelism (Megatron) | ✅ | Verified (Megatron NPU smoke example) | +| CP (Context Parallel) | Context parallelism | ✅ | Verified (Megatron NPU smoke example) | +| EP (Expert Parallel) | Expert parallelism (MoE) | ✅ | Verified (Megatron NPU smoke example) | **Legend**: - ✅ Verified: Has actual running example code @@ -253,7 +243,7 @@ device_mesh = DeviceMesh( ) ``` -**Megatron backend note**: Twinkle now provides runnable NPU smoke scripts for the Megatron backend. Please follow the installation section above before running the cookbook examples, and start with `cookbook/megatron/tp_npu.py` before moving on to `tp_moe_npu.py` and `tp_moe_cp_npu.py`. +**Megatron backend note**: Twinkle now provides runnable NPU smoke scripts for the Megatron backend. Please follow the installation section above before running the cookbook examples, and start with the base TP smoke script before moving on to the MoE and CP smoke scripts. 
## Common Issues diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" index d0a04bed..fc0c1348 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" @@ -217,10 +217,10 @@ Twinkle 在 NPU 上目前支持以下**经过验证**的并行策略: |---------|------|---------|---------| | DP (Data Parallel) | 数据并行 | ✅ | 已验证(见 cookbook/sft/lora_npu.py) | | FSDP (Fully Sharded Data Parallel) | 完全分片数据并行 | ✅ | 已验证(见 cookbook/sft/lora_npu.py) | -| TP (Tensor Parallel) | 张量并行(Megatron) | ✅ | 已验证(见 cookbook/megatron/tp_npu.py) | -| PP (Pipeline Parallel) | 流水线并行(Megatron) | ✅ | 已验证(见 cookbook/megatron/tp_npu.py) | -| CP (Context Parallel) | 上下文并行 | ✅ | 已验证(见 cookbook/megatron/tp_moe_cp_npu.py) | -| EP (Expert Parallel) | 专家并行(MoE) | ✅ | 已验证(见 cookbook/megatron/tp_moe_npu.py) | +| TP (Tensor Parallel) | 张量并行(Megatron) | ✅ | 已验证(Megatron NPU smoke 示例) | +| PP (Pipeline Parallel) | 流水线并行(Megatron) | ✅ | 已验证(Megatron NPU smoke 示例) | +| CP (Context Parallel) | 上下文并行 | ✅ | 已验证(Megatron NPU smoke 示例) | +| EP (Expert Parallel) | 专家并行(MoE) | ✅ | 已验证(Megatron NPU smoke 示例) | **图例说明**: - ✅ 已验证:有实际运行示例代码 @@ -243,7 +243,7 @@ device_mesh = DeviceMesh( ) ``` -**Megatron 后端说明**:Twinkle 的 Megatron NPU 路径已经提供了可直接运行的 smoke 示例,安装和运行依赖请参考上面的 “Megatron 后端依赖” 小节。当前优先建议从 `cookbook/megatron/tp_npu.py` 开始验证,再逐步切到 `tp_moe_npu.py` 和 `tp_moe_cp_npu.py`。 +**Megatron 后端说明**:Twinkle 的 Megatron NPU 路径已经提供了可直接运行的 smoke 示例,安装和运行依赖请参考上面的 “Megatron 后端依赖” 小节。当前优先建议先验证基础 TP smoke,再逐步切到 MoE 和 CP smoke。 ## 常见问题 From 917c4f06e82cd949ad50ed7bfe71bf0b6380252b Mon Sep 17 00:00:00 2001 From: addsubmuldiv Date: Mon, 13 Apr 2026 19:27:38 +0800 Subject: [PATCH 07/10] update --- docs/source_en/Usage 
Guide/NPU-Support.md | 106 +++------------- ...PU\347\232\204\346\224\257\346\214\201.md" | 114 +++--------------- 2 files changed, 38 insertions(+), 182 deletions(-) diff --git a/docs/source_en/Usage Guide/NPU-Support.md b/docs/source_en/Usage Guide/NPU-Support.md index 099d88b5..776d4798 100644 --- a/docs/source_en/Usage Guide/NPU-Support.md +++ b/docs/source_en/Usage Guide/NPU-Support.md @@ -165,45 +165,11 @@ python -c "import mindspeed.megatron_adaptor; from twinkle.model.megatron._minds ### SFT LoRA Fine-tuning -Verified 4-card DP+FSDP training example: - -**Example Path**: [cookbook/sft/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/sft/lora_npu.py) - -**Run Method**: -```bash -# Specify using 4 NPU cards -export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 - -# Run training -python cookbook/sft/lora_npu.py -``` - -**Example Features**: -- ✅ Ray distributed mode -- ✅ DP + FSDP hybrid parallelism (2x2) -- ✅ LoRA fine-tuning -- ✅ Complete data loading and training loop +The NPU document no longer provides this kind of SFT cookbook example; this capability should be described together with an actually available cookbook example or a future NPU script. ### GRPO Reinforcement Learning Training -Verified multi-card GRPO training example: - -**Example Path**: [cookbook/grpo/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/grpo/lora_npu.py) - -**Run Method**: -```bash -# Specify using 8 NPU cards -export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 - -# Run training -python cookbook/grpo/lora_npu.py -``` - -**Example Features**: -- ✅ Actor-Critic architecture -- ✅ Supports Reference Model -- ✅ Optional TorchSampler or vLLMSampler -- ✅ Complete RL training workflow +The NPU document no longer provides this kind of GRPO cookbook example; this capability should be described together with an actually available cookbook example or a future NPU script. 
### More Examples @@ -215,12 +181,12 @@ Twinkle currently supports the following **verified** parallelization strategies | Parallel Type | Description | NPU Support | Verification Status | |---------|------|---------|---------| -| DP (Data Parallel) | Data parallelism | ✅ | Verified (see cookbook/sft/lora_npu.py) | -| FSDP (Fully Sharded Data Parallel) | Fully sharded data parallelism | ✅ | Verified (see cookbook/sft/lora_npu.py) | -| TP (Tensor Parallel) | Tensor parallelism (Megatron) | ✅ | Verified (Megatron NPU smoke example) | -| PP (Pipeline Parallel) | Pipeline parallelism (Megatron) | ✅ | Verified (Megatron NPU smoke example) | -| CP (Context Parallel) | Context parallelism | ✅ | Verified (Megatron NPU smoke example) | -| EP (Expert Parallel) | Expert parallelism (MoE) | ✅ | Verified (Megatron NPU smoke example) | +| DP (Data Parallel) | Data parallelism | ✅ | No corresponding cookbook example | +| FSDP (Fully Sharded Data Parallel) | Fully sharded data parallelism | ✅ | No corresponding cookbook example | +| TP (Tensor Parallel) | Tensor parallelism (Megatron) | ✅ | Verified (see `cookbook/megatron/ascend/tp_npu.py`) | +| PP (Pipeline Parallel) | Pipeline parallelism (Megatron) | ✅ | Verified (see `cookbook/megatron/ascend/tp_npu.py`) | +| CP (Context Parallel) | Context parallelism | ✅ | Verified (see `cookbook/megatron/ascend/tp_moe_cp_npu.py`) | +| EP (Expert Parallel) | Expert parallelism (MoE) | ✅ | Verified (see `cookbook/megatron/ascend/tp_moe_npu.py`) | **Legend**: - ✅ Verified: Has actual running example code @@ -229,21 +195,9 @@ Twinkle currently supports the following **verified** parallelization strategies ### DP + FSDP Example -The following example is from `cookbook/sft/lora_npu.py`, verified in actual NPU environment: - -```python -import numpy as np -from twinkle import DeviceMesh - -# 4 cards: DP=2, FSDP=2 -device_mesh = DeviceMesh( - device_type='npu', - mesh=np.array([[0, 1], [2, 3]]), - mesh_dim_names=('dp', 'fsdp') -) -``` +The NPU 
document currently does not provide a corresponding cookbook code snippet. -**Megatron backend note**: Twinkle now provides runnable NPU smoke scripts for the Megatron backend. Please follow the installation section above before running the cookbook examples, and start with the base TP smoke script before moving on to the MoE and CP smoke scripts. +**Megatron backend note**: Twinkle now provides runnable NPU smoke scripts for the Megatron backend. Please follow the installation section above before running the cookbook examples, and start with `cookbook/megatron/ascend/tp_npu.py` before moving on to `cookbook/megatron/ascend/tp_moe_npu.py` and `cookbook/megatron/ascend/tp_moe_cp_npu.py`. ## Common Issues @@ -279,14 +233,14 @@ Feature support matrix based on actual code verification: | Feature | GPU | NPU | Verification Example | Description | |------|-----|-----|---------|------| -| SFT + LoRA | ✅ | ✅ | cookbook/sft/lora_npu.py | Verified available | -| GRPO | ✅ | ✅ | cookbook/grpo/lora_npu.py | Verified available | -| DP Parallelism | ✅ | ✅ | cookbook/sft/lora_npu.py | Verified available | -| FSDP Parallelism | ✅ | ✅ | cookbook/sft/lora_npu.py | Verified available | -| Ray Distributed | ✅ | ✅ | cookbook/sft/lora_npu.py | Verified available | -| TorchSampler | ✅ | ✅ | cookbook/grpo/lora_npu.py | Verified available | -| vLLMSampler | ✅ | ✅ | cookbook/grpo/lora_npu.py | Verified available | -| Full Fine-tuning | ✅ | 🚧 | - | Theoretically supported, to be verified | +| SFT + LoRA | ✅ | ✅ | - | No corresponding cookbook example | +| GRPO | ✅ | ✅ | - | No corresponding cookbook example | +| DP Parallelism | ✅ | ✅ | - | No corresponding cookbook example | +| FSDP Parallelism | ✅ | ✅ | - | No corresponding cookbook example | +| Ray Distributed | ✅ | ✅ | - | No corresponding cookbook example | +| TorchSampler | ✅ | ✅ | - | No corresponding cookbook example | +| vLLMSampler | ✅ | ✅ | - | No corresponding cookbook example | +| Full Fine-tuning | ✅ | ✅ | - | Verified 
available | | QLoRA | ✅ | ❌ | - | Quantization operators not yet supported | | DPO | ✅ | 🚧 | - | Theoretically supported, to be verified | | Megatron TP/PP | ✅ | 🚧 | - | To be adapted and verified | @@ -305,19 +259,7 @@ Feature support matrix based on actual code verification: ## Example Code -Twinkle provides the following verified NPU training examples: - -### SFT Training -- **4-card DP+FSDP LoRA Fine-tuning**: [cookbook/sft/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/sft/lora_npu.py) - - Uses Ray mode for distributed training - - Demonstrates DP + FSDP hybrid parallelism - - Includes complete data loading and training loop - -### GRPO Training -- **Multi-card GRPO RL Training**: [cookbook/grpo/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/grpo/lora_npu.py) - - Actor-Critic architecture - - Supports Reference Model - - Optional TorchSampler or vLLMSampler +Twinkle's verified NPU examples currently focus on the Megatron smoke path; the SFT and GRPO cookbook examples do not have corresponding files yet. ### Remote Training (Tinker Protocol) - **Server Configuration**: [cookbook/remote/tinker/ascend/](https://github.com/modelscope/twinkle/tree/main/cookbook/remote/tinker/ascend) @@ -326,15 +268,7 @@ Twinkle provides the following verified NPU training examples: - Suitable for production environment deployment **Running Examples**: -```bash -# SFT training -export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 -python cookbook/sft/lora_npu.py - -# GRPO training -export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -python cookbook/grpo/lora_npu.py -``` +No corresponding command examples are provided yet. 
## Reference Resources diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" index fc0c1348..39f6fe18 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" @@ -165,45 +165,11 @@ python -c "import mindspeed.megatron_adaptor; from twinkle.model.megatron._minds ### SFT LoRA 微调 -已验证的 4 卡 DP+FSDP 训练示例: - -**示例路径**:[cookbook/sft/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/sft/lora_npu.py) - -**运行方式**: -```bash -# 指定使用 4 张 NPU 卡 -export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 - -# 运行训练 -python cookbook/sft/lora_npu.py -``` - -**示例特性**: -- ✅ Ray 分布式模式 -- ✅ DP + FSDP 混合并行(2x2) -- ✅ LoRA 微调 -- ✅ 完整的数据加载和训练循环 +当前 NPU 文档不再提供这类 SFT cookbook 示例;这部分能力需要结合实际可用的 cookbook 示例或后续补充的 NPU 脚本来说明。 ### GRPO 强化学习训练 -已验证的多卡 GRPO 训练示例: - -**示例路径**:[cookbook/grpo/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/grpo/lora_npu.py) - -**运行方式**: -```bash -# 指定使用 8 张 NPU 卡 -export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 - -# 运行训练 -python cookbook/grpo/lora_npu.py -``` - -**示例特性**: -- ✅ Actor-Critic 架构 -- ✅ 支持 Reference Model -- ✅ 可选 TorchSampler 或 vLLMSampler -- ✅ 完整的 RL 训练流程 +当前 NPU 文档不再提供这类 GRPO cookbook 示例;这部分能力需要结合实际可用的 cookbook 示例或后续补充的 NPU 脚本来说明。 ### 更多示例 @@ -215,12 +181,12 @@ Twinkle 在 NPU 上目前支持以下**经过验证**的并行策略: | 并行类型 | 说明 | NPU 支持 | 验证状态 | |---------|------|---------|---------| -| DP (Data Parallel) | 数据并行 | ✅ | 已验证(见 cookbook/sft/lora_npu.py) | -| FSDP (Fully Sharded Data Parallel) | 完全分片数据并行 | ✅ | 已验证(见 cookbook/sft/lora_npu.py) | -| TP (Tensor Parallel) | 张量并行(Megatron) | ✅ | 已验证(Megatron NPU smoke 示例) | -| PP (Pipeline Parallel) | 流水线并行(Megatron) | ✅ | 已验证(Megatron NPU smoke 示例) | -| CP 
(Context Parallel) | 上下文并行 | ✅ | 已验证(Megatron NPU smoke 示例) | -| EP (Expert Parallel) | 专家并行(MoE) | ✅ | 已验证(Megatron NPU smoke 示例) | +| DP (Data Parallel) | 数据并行 | ✅ | 暂无对应 cookbook 示例 | +| FSDP (Fully Sharded Data Parallel) | 完全分片数据并行 | ✅ | 暂无对应 cookbook 示例 | +| TP (Tensor Parallel) | 张量并行(Megatron) | ✅ | 已验证(见 `cookbook/megatron/ascend/tp_npu.py`) | +| PP (Pipeline Parallel) | 流水线并行(Megatron) | ✅ | 已验证(见 `cookbook/megatron/ascend/tp_npu.py`) | +| CP (Context Parallel) | 上下文并行 | ✅ | 已验证(见 `cookbook/megatron/ascend/tp_moe_cp_npu.py`) | +| EP (Expert Parallel) | 专家并行(MoE) | ✅ | 已验证(见 `cookbook/megatron/ascend/tp_moe_npu.py`) | **图例说明**: - ✅ 已验证:有实际运行示例代码 @@ -229,21 +195,9 @@ Twinkle 在 NPU 上目前支持以下**经过验证**的并行策略: ### DP + FSDP 示例 -以下示例来自 `cookbook/sft/lora_npu.py`,在实际 NPU 环境中验证通过: +当前 NPU 文档暂不提供对应的 cookbook 代码片段。 -```python -import numpy as np -from twinkle import DeviceMesh - -# 4 卡:DP=2, FSDP=2 -device_mesh = DeviceMesh( - device_type='npu', - mesh=np.array([[0, 1], [2, 3]]), - mesh_dim_names=('dp', 'fsdp') -) -``` - -**Megatron 后端说明**:Twinkle 的 Megatron NPU 路径已经提供了可直接运行的 smoke 示例,安装和运行依赖请参考上面的 “Megatron 后端依赖” 小节。当前优先建议先验证基础 TP smoke,再逐步切到 MoE 和 CP smoke。 +**Megatron 后端说明**:Twinkle 的 Megatron NPU 路径已经提供了可直接运行的 smoke 示例,安装和运行依赖请参考上面的 “Megatron 后端依赖” 小节。当前优先建议先验证 `cookbook/megatron/ascend/tp_npu.py`,再逐步切到 `cookbook/megatron/ascend/tp_moe_npu.py` 和 `cookbook/megatron/ascend/tp_moe_cp_npu.py`。 ## 常见问题 @@ -279,14 +233,14 @@ pip install torch_npu-2.7.1-cp311-cp311-linux_aarch64.whl | 功能 | GPU | NPU | 验证示例 | 说明 | |------|-----|-----|---------|------| -| SFT + LoRA | ✅ | ✅ | cookbook/sft/lora_npu.py | 已验证可用 | -| GRPO | ✅ | ✅ | cookbook/grpo/lora_npu.py | 已验证可用 | -| DP 并行 | ✅ | ✅ | cookbook/sft/lora_npu.py | 已验证可用 | -| FSDP 并行 | ✅ | ✅ | cookbook/sft/lora_npu.py | 已验证可用 | -| Ray 分布式 | ✅ | ✅ | cookbook/sft/lora_npu.py | 已验证可用 | -| TorchSampler | ✅ | ✅ | cookbook/grpo/lora_npu.py | 已验证可用 | -| vLLMSampler | ✅ | ✅ | cookbook/grpo/lora_npu.py | 已验证可用 | -| 全量微调 | ✅ | 🚧 | - | 
理论支持,待验证 | +| SFT + LoRA | ✅ | ✅ | - | 暂无对应 cookbook 示例 | +| GRPO | ✅ | ✅ | - | 暂无对应 cookbook 示例 | +| DP 并行 | ✅ | ✅ | - | 暂无对应 cookbook 示例 | +| FSDP 并行 | ✅ | ✅ | - | 暂无对应 cookbook 示例 | +| Ray 分布式 | ✅ | ✅ | - | 暂无对应 cookbook 示例 | +| TorchSampler | ✅ | ✅ | - | 暂无对应 cookbook 示例 | +| vLLMSampler | ✅ | ✅ | - | 暂无对应 cookbook 示例 | +| 全量微调 | ✅ | ✅ | - | 已验证可用 | | QLoRA | ✅ | ❌ | - | 量化算子暂不支持 | | DPO | ✅ | 🚧 | - | 理论支持,待验证 | | Megatron TP/PP | ✅ | 🚧 | - | 待适配和验证 | @@ -303,38 +257,6 @@ pip install torch_npu-2.7.1-cp311-cp311-linux_aarch64.whl 2. “待验证”功能可以尝试,但可能遇到兼容性问题 3. 遇到问题时,参考对应的示例代码进行配置 -## 示例代码 - -Twinkle 提供了以下经过验证的 NPU 训练示例: - -### SFT 训练 -- **4 卡 DP+FSDP LoRA 微调**:[cookbook/sft/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/sft/lora_npu.py) - - 使用 Ray 模式进行分布式训练 - - 演示 DP + FSDP 混合并行 - - 包含完整的数据加载和训练循环 - -### GRPO 训练 -- **多卡 GRPO RL 训练**:[cookbook/grpo/lora_npu.py](https://github.com/modelscope/twinkle/blob/main/cookbook/grpo/lora_npu.py) - - Actor-Critic 架构 - - 支持参考模型(Reference Model) - - 可选 TorchSampler 或 vLLMSampler - -### 远程训练(Tinker 协议) -- **服务端配置**:[cookbook/remote/tinker/ascend/](https://github.com/modelscope/twinkle/tree/main/cookbook/remote/tinker/ascend) - - 提供 HTTP API 接口 - - 支持远程训练和推理 - - 适用于生产环境部署 - -**运行示例**: -```bash -# SFT 训练 -export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 -python cookbook/sft/lora_npu.py - -# GRPO 训练 -export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -python cookbook/grpo/lora_npu.py -``` ## 参考资源 From 0a52cfbd3d47e7af7014e9c3858b2a4a18f208bb Mon Sep 17 00:00:00 2001 From: addsubmuldiv Date: Tue, 14 Apr 2026 15:06:19 +0800 Subject: [PATCH 08/10] fix --- src/twinkle/model/base.py | 5 +++- src/twinkle/model/megatron/megatron.py | 30 +++---------------- .../model/megatron/multi_lora_megatron.py | 1 - 3 files changed, 8 insertions(+), 28 deletions(-) diff --git a/src/twinkle/model/base.py b/src/twinkle/model/base.py index 596f3c32..19cee4a9 100644 --- a/src/twinkle/model/base.py +++ b/src/twinkle/model/base.py @@ -134,6 
+134,9 @@ def upload_to_hub(self, else: HubOperation.push_to_hub(repo_id=hub_model_id, folder_path=checkpoint_dir, token=hub_token, private=True) + def _should_bind_device_id_for_process_group(self, backend: str) -> bool: + return backend in ('nccl', 'hccl') + def _try_init_process_group(self): import torch import torch.distributed as dist @@ -154,6 +157,6 @@ def _try_init_process_group(self): 'rank': Platform.get_rank(), 'world_size': Platform.get_world_size(), } - if backend in ('nccl', 'hccl'): + if self._should_bind_device_id_for_process_group(backend): init_kwargs['device_id'] = torch.device(Platform.get_local_device()) dist.init_process_group(**init_kwargs) diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py index d214d653..568d4710 100644 --- a/src/twinkle/model/megatron/megatron.py +++ b/src/twinkle/model/megatron/megatron.py @@ -111,7 +111,6 @@ def __init__( self.variable_seq_lengths = kwargs.get('variable_seq_lengths', False) torch_util.set_device() self._try_init_process_group() - self._ensure_megatron_process_group() # MindSpeed must patch before mcore_bridge imports its patcher, otherwise # mcore_bridge pulls in megatron.core/TE too early on NPU. ensure_mindspeed_adaptor_patched() @@ -151,31 +150,10 @@ def __init__( self.active_group = _default_adapter_name MegatronPeft().__call__() - def _ensure_megatron_process_group(self): - """Megatron still requires a default PG even for single-rank local smoke. - - TwinkleModel._try_init_process_group() intentionally skips world_size==1, - because most frameworks do not need a default process group there. - Megatron is different: initialize_model_parallel() still assumes a default - PG already exists, so local NPU/GPU smoke needs a 1-rank fallback PG here. 
- """ - import torch.distributed as dist - - if not dist.is_initialized(): - from twinkle import find_free_port - - backend = Platform.device_backend() - init_kwargs = { - 'backend': backend, - 'init_method': f'tcp://127.0.0.1:{find_free_port()}', - 'rank': 0, - 'world_size': 1, - } - # Keep NCCL's device binding behavior, but avoid binding HCCL default PG - # here so the later Gloo sub-groups stay decoupled on NPU. - if backend == 'nccl': - init_kwargs['device_id'] = torch.device(Platform.get_local_device()) - dist.init_process_group(**init_kwargs) + def _should_bind_device_id_for_process_group(self, backend: str) -> bool: + # Keep NCCL's device binding behavior, but avoid binding HCCL's default + # PG so Megatron's later Gloo DP groups stay decoupled on NPU. + return backend == 'nccl' def _construct_default_optimizer_group(self): return MegatronOptimizerGroup( diff --git a/src/twinkle/model/megatron/multi_lora_megatron.py b/src/twinkle/model/megatron/multi_lora_megatron.py index d05ac7a9..6cbc579a 100644 --- a/src/twinkle/model/megatron/multi_lora_megatron.py +++ b/src/twinkle/model/megatron/multi_lora_megatron.py @@ -59,7 +59,6 @@ def __init__( self.optimizer_group = {} torch_util.set_device() self._try_init_process_group() - self._ensure_megatron_process_group() ensure_mindspeed_adaptor_patched() requires('mcore_bridge') From 41b0dccd97b70e83684e5f625476001325b9ee2f Mon Sep 17 00:00:00 2001 From: addsubmuldiv Date: Tue, 14 Apr 2026 16:05:28 +0800 Subject: [PATCH 09/10] fix --- src/twinkle/model/megatron/megatron.py | 33 ++++++++++---------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py index 568d4710..0a0cb111 100644 --- a/src/twinkle/model/megatron/megatron.py +++ b/src/twinkle/model/megatron/megatron.py @@ -155,6 +155,17 @@ def _should_bind_device_id_for_process_group(self, backend: str) -> bool: # PG so Megatron's later Gloo DP groups stay decoupled on NPU. 
return backend == 'nccl' + @staticmethod + def _drop_npu_causal_4d_mask(batch, unwrapped_model): + """On NPU, drop the generic 4D dense mask so MindSpeed can build + its own compressed causal mask for FlashAttention.""" + if Platform.device_prefix() != 'npu': + return + attention_mask = batch.get('attention_mask') + if (isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 4 + and getattr(unwrapped_model.config, 'attention_mask_type', None) == 'causal'): + batch['attention_mask'] = None + def _construct_default_optimizer_group(self): return MegatronOptimizerGroup( loss_instance=CrossEntropyLoss(reduction='sum'), @@ -367,28 +378,8 @@ def post_loss_function(output_tensor, inputs, logps): def forward_step_func(data_iterator, model): batch = next(data_iterator) labels = batch.pop('labels', None) - # MindSpeed 0.15.3 patches TE attention to a flash-attention based - # NPU implementation. That path expects to generate its own - # compressed causal mask (for example [2048, 2048]) when - # ``attention_mask`` is ``None``. Twinkle's generic Megatron - # processor, however, always expands the 1D token mask into a 4D - # dense causal mask. On NPU this makes FlashAttention receive the - # wrong mask shape and the real 8-card run fails in - # ``aclnnFlashAttentionScore``. For decoder-only causal training - # with right padding, the 4D mask is redundant: a causal mask - # already prevents valid tokens from attending to the padded tail, - # and padded query positions are ignored by labels == -100. So on - # the NPU TE path, drop this dense mask and let MindSpeed build the - # compressed causal mask it requires. 
- if Platform.device_prefix() == 'npu': - attention_mask = batch.get('attention_mask') - if isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 4: - unwrapped_model = self.strategy.unwrap_model([model])[0] - attention_mask_type = getattr(unwrapped_model.config, 'attention_mask_type', None) - if attention_mask_type == 'causal': - batch['attention_mask'] = None - # Handle disable_lora for base model inference (e.g., reference in DPO) unwrapped_model = self.strategy.unwrap_model([model])[0] + self._drop_npu_causal_4d_mask(batch, unwrapped_model) if disable_lora and isinstance(unwrapped_model, PeftModel): with unwrapped_model.disable_adapter(): output_tensor = model(**batch) From 2ff82f3c9c46b65bc5d34df096ac19d0dbdfa1ae Mon Sep 17 00:00:00 2001 From: addsubmuldiv Date: Tue, 14 Apr 2026 16:11:18 +0800 Subject: [PATCH 10/10] fix --- src/twinkle/utils/framework.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/twinkle/utils/framework.py b/src/twinkle/utils/framework.py index e8573175..09c91908 100644 --- a/src/twinkle/utils/framework.py +++ b/src/twinkle/utils/framework.py @@ -48,12 +48,10 @@ def gather_object(object: Any, device_mesh: DeviceMesh, process_group=None): # ``dist.all_gather_object(...)``. Reuse Megatron's dedicated Gloo # DP group instead. When CP is enabled we must pick the DP+CP # variant, otherwise the rank span for metric aggregation is wrong. - try: + if importlib.util.find_spec('megatron.core') is not None: from megatron.core import parallel_state as mpu process_group = mpu.get_data_parallel_group_gloo( with_context_parallel=getattr(device_mesh, 'cp_world_size', 1) > 1) - except (ImportError, ModuleNotFoundError): - pass group_size = dist.get_world_size(group=process_group) output_objects = [None for _ in range(group_size)] dist.all_gather_object(output_objects, object, group=process_group)