Skip to content

Commit 2f01c45

Browse files
authored
Npu adapt megatron (#153)
1 parent 01e21bf commit 2f01c45

File tree

14 files changed

+650
-210
lines changed

14 files changed

+650
-210
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
import twinkle
from peft import LoraConfig

from twinkle import DeviceMesh, get_device_placement, get_logger
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import MegatronModel
from twinkle.preprocessor import SelfCognitionProcessor

# Smoke-test configuration: a small slice of the self-cognition dataset and a
# handful of steps are enough to validate the parallel layout end to end.
MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B'
DATASET_ID = 'ms://swift/self-cognition'
DATASET_SLICE = range(128)
BATCH_SIZE = 2
MAX_STEPS = 10

# Keep the original 8-card MoE + CP layout so we can verify the default
# megatron_cp_algo path after repatching TEDotProductAttention back to the
# older MindSpeedCPDotProductAttention.
device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2, ep_size=2, device_type='npu')
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

logger = get_logger()
23+
24+
25+
def build_dataset():
    """Load, template, preprocess and encode the smoke-test dataset.

    Returns:
        The encoded ``Dataset`` ready to be wrapped in a ``DataLoader``.
    """
    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=DATASET_SLICE))
    dataset.set_template('Template', model_id=MODEL_ID, max_length=512)
    dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
    dataset.encode()
    return dataset
31+
32+
33+
def build_model(total_steps: int):
    """Create a LoRA-adapted MegatronModel with optimizer and LR scheduler.

    Args:
        total_steps: Total optimizer steps; used as the LR decay horizon.

    Returns:
        The configured ``MegatronModel``.
    """
    model = MegatronModel(model_id=MODEL_ID)
    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
    model.add_adapter_to_model('default', lora_config)
    model.set_optimizer(optimizer_cls='default', lr=1e-4)
    # Decay over the full run so the schedule matches the dataloader length.
    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=total_steps)
    return model
40+
41+
42+
def train():
    """Run the MoE + CP NPU smoke loop for at most ``MAX_STEPS`` steps."""
    dataset = build_dataset()
    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0)
    model = build_model(len(dataloader))

    logger.info(get_device_placement())
    logger.info(model.get_train_configs())
    logger.info(f'Total steps: {len(dataloader)}, validating {MAX_STEPS} steps')

    for step, batch in enumerate(dataloader):
        if step >= MAX_STEPS:
            break
        model.forward_backward(inputs=batch)
        model.clip_grad_and_step()
        metric = model.calculate_metric(is_training=True)
        logger.info(f'[MoE CP NPU smoke] step {step + 1}/{MAX_STEPS}, metric: {metric}')
58+
59+
60+
if __name__ == '__main__':
    train()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Launch the MoE + CP NPU smoke test with one rank per Ascend card (8 cards).
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_cp_npu.py
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import twinkle
from peft import LoraConfig

from twinkle import DeviceMesh, get_device_placement, get_logger
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import MegatronModel
from twinkle.preprocessor import SelfCognitionProcessor

# Smoke-test configuration: a small slice of the self-cognition dataset and a
# handful of steps are enough to validate the parallel layout end to end.
MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B'
DATASET_ID = 'ms://swift/self-cognition'
DATASET_SLICE = range(128)
BATCH_SIZE = 2
MAX_STEPS = 10

# Run the MoE smoke without context parallelism so we can isolate the MoE path
# itself on the same 8-card topology.
device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, cp_size=1, ep_size=2, device_type='npu')
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

logger = get_logger()
22+
23+
24+
def build_dataset():
    """Load, template, preprocess and encode the smoke-test dataset.

    Returns:
        The encoded ``Dataset`` ready to be wrapped in a ``DataLoader``.
    """
    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=DATASET_SLICE))
    dataset.set_template('Template', model_id=MODEL_ID, max_length=512)
    dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
    dataset.encode()
    return dataset
30+
31+
32+
def build_model(total_steps: int):
    """Create a LoRA-adapted MegatronModel with optimizer and LR scheduler.

    Args:
        total_steps: Total optimizer steps; used as the LR decay horizon.

    Returns:
        The configured ``MegatronModel``.
    """
    model = MegatronModel(model_id=MODEL_ID)
    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
    model.add_adapter_to_model('default', lora_config)
    model.set_optimizer(optimizer_cls='default', lr=1e-4)
    # Decay over the full run so the schedule matches the dataloader length.
    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=total_steps)
    return model
39+
40+
41+
def train():
    """Run the MoE NPU smoke loop for at most ``MAX_STEPS`` steps."""
    dataset = build_dataset()
    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0)
    model = build_model(len(dataloader))

    logger.info(get_device_placement())
    logger.info(model.get_train_configs())
    logger.info(f'Total steps: {len(dataloader)}, validating {MAX_STEPS} steps')

    for step, batch in enumerate(dataloader):
        if step >= MAX_STEPS:
            break
        model.forward_backward(inputs=batch)
        model.clip_grad_and_step()
        metric = model.calculate_metric(is_training=True)
        logger.info(f'[MoE NPU smoke] step {step + 1}/{MAX_STEPS}, metric: {metric}')
57+
58+
59+
if __name__ == '__main__':
    train()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Launch the MoE (no CP) NPU smoke test with one rank per Ascend card (8 cards).
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_npu.py

cookbook/megatron/ascend/tp_npu.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import twinkle
from peft import LoraConfig

from twinkle import DeviceMesh, get_device_placement, get_logger
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import MegatronModel
from twinkle.preprocessor import SelfCognitionProcessor

# Smoke-test configuration: a small slice of the self-cognition dataset and a
# handful of steps are enough to validate the parallel layout end to end.
MODEL_ID = 'ms://Qwen/Qwen3-4B'
DATASET_ID = 'ms://swift/self-cognition'
DATASET_SLICE = range(256)
BATCH_SIZE = 8
MAX_STEPS = 10

# Keep the same 8-card TP/PP/DP layout as the GPU reference script, but run it
# through the NPU backend to validate Megatron + MindSpeed integration.
device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, device_type='npu')
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

logger = get_logger()
22+
23+
24+
def build_dataset():
    """Load, template, preprocess and encode the smoke-test dataset.

    Returns:
        The encoded ``Dataset`` ready to be wrapped in a ``DataLoader``.
    """
    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=DATASET_SLICE))
    # Qwen3-4B is a text-only model, so use the base template instead of the VL template.
    dataset.set_template('Template', model_id=MODEL_ID, max_length=512)
    dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
    dataset.encode()
    return dataset
31+
32+
33+
def build_model(total_steps: int):
    """Create a LoRA-adapted MegatronModel with optimizer and LR scheduler.

    Args:
        total_steps: Total optimizer steps; used as the LR decay horizon.

    Returns:
        The configured ``MegatronModel``.
    """
    model = MegatronModel(model_id=MODEL_ID)
    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
    model.add_adapter_to_model('default', lora_config)
    model.set_optimizer(optimizer_cls='default', lr=1e-4)
    # Decay over the full run so the schedule matches the dataloader length.
    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=total_steps)
    return model
40+
41+
42+
def train():
    """Run the dense-model NPU smoke loop for at most ``MAX_STEPS`` steps."""
    dataset = build_dataset()
    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0)
    model = build_model(len(dataloader))

    logger.info(get_device_placement())
    logger.info(model.get_train_configs())
    logger.info(f'Total steps: {len(dataloader)}, validating {MAX_STEPS} steps')

    for step, batch in enumerate(dataloader):
        if step >= MAX_STEPS:
            break
        model.forward_backward(inputs=batch)
        model.clip_grad_and_step()
        metric = model.calculate_metric(is_training=True)
        logger.info(f'[NPU smoke] step {step + 1}/{MAX_STEPS}, metric: {metric}')
58+
59+
60+
if __name__ == '__main__':
    train()

cookbook/megatron/ascend/tp_npu.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Launch the dense-model NPU smoke test with one rank per Ascend card (8 cards).
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_npu.py

0 commit comments

Comments
 (0)