Loading...
Loading...
MoE expert-parallel communication overlap in Megatron Bridge. Covers dispatch/combine overlap, flex dispatcher backends, and expert wgrad scheduling.
npx skill4agent add nvidia/skills perf-moe-comm-overlap
EP > 1
cfg.comm_overlap.overlap_moe_expert_parallel_comm = True
# Optional: delayed wgrad for additional overlap
cfg.comm_overlap.delay_wgrad_compute = True
# IMPORTANT: disable shared expert overlap when using dispatch overlap
cfg.model.moe_shared_expert_overlap = False
expert_model_parallel_size > 1
num_moe_experts > 1
moe_token_dispatcher_type
"alltoall"
"flex"
virtual_pipeline_model_parallel_size
None
moe_flex_dispatcher_backend
moe_token_dispatcher_type = "flex"
delay_wgrad_compute
EP=16
alltoall
moe_permute_fusion=false
delay_wgrad_compute
alltoall
src/megatron/bridge/training/comm_overlap.py
src/megatron/bridge/training/flex_dispatcher_backend.py
src/megatron/bridge/training/config.py
tests/unit_tests/training/test_comm_overlap.py
tests/unit_tests/training/test_deepep.py
moe_shared_expert_overlap
overlap_moe_expert_parallel_comm
moe_flex_dispatcher_backend="deepep"
moe_token_dispatcher_type
"alltoall"
comm_overlap.py
uv run python scripts/performance/run_script.py \
-m qwen \
-mr qwen3_30b_a3b \
--task pretrain \
-g h100 \
-c bf16 \
-ng 16 \
-gn 8 \
--max_steps 8 \
--config_variant v1 \
--cuda_graph_impl none \
--moe_flex_dispatcher_backend None \
--moe_a2a_overlap false \
--tokenizer_type NullTokenizer \
comm_overlap.overlap_moe_expert_parallel_comm=true \
comm_overlap.delay_wgrad_compute=false \
model.moe_shared_expert_overlap=false
model.moe_permute_fusion=false