Loading...
Loading...
Guide for adding support for new LLM or VLM models in Megatron-Bridge. Covers bridge, provider, recipe, tests, docs, and examples.
`npx skill4agent add nvidia/skills adding-model-support`

https://huggingface.co/Qwen/Qwen3.5-VL-27B

`config.json`, `config.json`, `model_type`, `@register_bridge(model_type=...)`, `architectures`, `source=...`, `tie_word_embeddings`

`num_hidden_layers`, `hidden_size`, `intermediate_size`, `num_attention_heads`, `num_key_value_heads`, `vocab_size`, `max_position_embeddings`, `rope_theta`

`num_local_experts`, `num_experts_per_tok`, `moe_intermediate_size`

`q_lora_rank`, `kv_lora_rank`, `qk_nope_head_dim`, `qk_rope_head_dim`

`CONFIG_MAPPING`, `model_bridge.py`, `modeling_*.py`

`text_config`, `vision_config`, `text_config`, `vision_config`, `text_config`

`model.safetensors`, `model.safetensors.index.json`, `float8_e4m3fn`, `uint8`, `uint4`, `*_scale_inv`, `*_scale`

`config.json`, `quantization_config`, `"torch_dtype": "float8_e4m3fn"`, `weight_scale_inv`

`import_ckpt`, `dequant_fp8_for_bridge.py`, `examples/models/ministral/ministral3/dequant_fp8_for_bridge.py`

`w_bf16 = fp8_weight.to(bfloat16) * weight_scale_inv`

`maybe_modify_loaded_hf_weight()`

def maybe_modify_loaded_hf_weight(self, hf_param, hf_state_dict):
    weight = hf_state_dict[hf_param]
    scale_key = hf_param + "_scale_inv"
    if weight.dtype == torch.float8_e4m3fn and scale_key in hf_state_dict:
        return weight.to(torch.bfloat16) * hf_state_dict[scale_key].to(torch.bfloat16)
    return weight

`std`, `std ≈ 13`, `std ≈ 0.006`

`src/megatron/bridge/models/qwen/qwen2_bridge.py`

src/megatron/bridge/models/<model>/
├── __init__.py
├── <model>_bridge.py # Config + weight mappings (no provider file needed)
└── modeling_<model>/ # (optional) Custom nn.Module implementations if needed
    └── ...

`src/megatron/bridge/models/qwen_vl/`

src/megatron/bridge/models/<model>/
├── __init__.py
├── <model>_bridge.py # Config + weight mappings
├── <model>_provider.py # Only for VLMs that need custom provide()
└── modeling_<model>/ # If using Megatron vision encoder
    ├── __init__.py
    └── model.py # Combines vision + language

src/megatron/bridge/models/<model>/
├── __init__.py
├── <model>_bridge.py
├── <model>_provider.py # Only for VLMs that need custom provide()
└── modeling_<model>.py # HF vision + Megatron language wrapper

`nn.Module`, `modeling_<model>/`, `modeling_<model>.py`, `modeling_<model>`

`provider_bridge()`, `mapping_registry()`

`super().provider_bridge()`, `GPTModelProvider`, `CONFIG_MAPPING`

`super().provider_bridge()`, `GPTModelProvider`, `PROVIDER_CLASS`, `provide()`

`hf_config_to_provider_kwargs(text_config)`, `tie_word_embeddings`, `tie_word_embeddings`, `text_config`

`provider.share_embeddings_and_output_weights = getattr(hf_config, "tie_word_embeddings", False)`

`hf_config`, `tie_word_embeddings`, `image_token_id`, `video_token_id`

`hf_config.text_config`, `num_hidden_layers`, `hidden_size`, `hf_config.vision_config`

`src/megatron/bridge/models/conversion/param_mapping.py`, `model_bridge.py`, `quant_mapping.py`, `<model>_mappings.py`

# src/megatron/bridge/models/glm/glm_moe_mappings.py
class GLMExpertDownProjMapping(FusedExpertMapping):
    def __init__(self, megatron_param, hf_param, permute_dims=None):
        super().__init__(megatron_param, hf_param, permute_dims, transpose_on_export=False)

# Inside nemotron_h_bridge.py (private to the module)
class _MTPFlatteningMapping(MegatronParamMapping):
    def resolve(self, captures):
        return AutoMapping(self._flatten(captures), ...)

# Inside minimax_m2_bridge.py (private to the module)
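class _FullDimQKNormMapping(MegatronParamMapping):
    def hf_to_megatron(self, hf_weights):
        # Custom scatter logic for full-dim QK norm
        ...
    def megatron_to_hf(self, megatron_weights):
        # Custom gather logic
        ...

`MegatronModelBridge`

| Hook | When to use |
|---|---|
| `mapping_registry()` | Define all weight name mappings (abstract, always overridden) |
| `provider_bridge()` | Configure the provider with model-specific flags (call `super().provider_bridge()`) |
| `maybe_modify_loaded_hf_weight()` | Dequantize, rename, or reshape HF weights before conversion |
| `maybe_modify_*` export hook (exact name lost in extraction) | Synthesize extra HF keys on export (e.g. …) |
| (hook name lost in extraction) | Build HF … (remainder lost in extraction) |
| `hf_config_to_provider_kwargs()` | Override CONFIG_MAPPING behavior for specific fields |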
`mapping_registry()`, `self.hf_config`, `mapping_registry()`

def mapping_registry(self) -> MegatronMappingRegistry:
    hf_config = getattr(self, "hf_config", None)
    num_mtp_layers = getattr(hf_config, "num_nextn_predict_layers", 0) if hf_config else 0
    ...

`build_conversion_tasks()`, `self._hf_config`, `GPTModelProvider`, `PROVIDER_CLASS`, `provide()`

# src/megatron/bridge/models/<model>/<model>_provider.py
class MyVLModelProvider(GPTModelProvider):
    image_token_id: int = 0
    def provide(self, ...):
        # Custom model construction combining vision encoder + language decoder
        ...

`PROVIDER_CLASS = MyVLModelProvider`, `provider_bridge()`

`param_mapping.py`, `model_bridge.py`, `FusedExpertMapping`, `FusedGatedExpertMapping`, `RMSNorm2ZeroCenteredRMSNormMapping`

`CONFIG_MAPPING`, `if model_type == "..."`

`src/megatron/bridge/training/utils/flop_utils.py`, `O(s²)`, `transformer_flops()`, `flop_utils.py`, `experimental_attention_variant`, `mtp_num_layers`, `getattr`, `tests/unit_tests/training/utils/test_flop_utils.py`, `linear_attention_freq`

`src/megatron/bridge/recipes/<family>/<model>.py`, `src/megatron/bridge/recipes/<family>/<model>.py`

`<model>_<size>_sft_config()`, `<model>_<size>_peft_config()`, `<model>_<size>_pretrain_config()`

`__init__.py`, `__all__`, `src/megatron/bridge/recipes/__init__.py`, `train_any_basic.py`, `config_map`, `--model`

tests/unit_tests/models/<model>/
├── __init__.py
├── test_<model>_bridge.py # Mock HF config → verify provider mapping
└── test_<model>_provider.py # (optional) Only if custom provider subclass exists

tests/functional_tests/models/<model>/
├── __init__.py
├── test_<model>_conversion.py # Toy model HF↔Megatron roundtrip
└── test_<model>_provider.py # compare_provider_configs (optional)

`examples/models/<brand>/<model>/`

examples/models/<brand>/<model>/
├── README.md
├── conversion.sh # HF↔Megatron conversion commands (real model)
├── inference.sh # Generation commands (real model, reasonable output)
├── slurm_sft.sh # SFT training on SLURM
└── slurm_peft.sh # PEFT training on SLURM

`conversion.sh`, `inference.sh`, `Qwen/Qwen3-8B`

`docs/models/<type>/<model>.md`

uv run python -c "
from megatron.bridge import AutoBridge
bridge = AutoBridge.from_hf_pretrained('<org>/<model>')
provider = bridge.to_megatron_provider()
provider.tensor_model_parallel_size = 1
provider.pipeline_model_parallel_size = 1
provider.finalize()
model = provider.provide_distributed_model(wrap_with_ddp=False)
bridge.load_hf_weights(model)
for i, (name, tensor) in enumerate(bridge.export_hf_weights(model, cpu=True)):
    print(name, tuple(tensor.shape))
    if i > 10: break
"

uv run python examples/conversion/convert_checkpoints.py import \
--hf-model <org>/<model> \
--megatron-path /workspace/<model> \
--torch-dtype bfloat16
uv run python examples/conversion/convert_checkpoints.py export \
--hf-model <org>/<model> \
--megatron-path /workspace/<model>/iter_0000000 \
--hf-path /workspace/<model>-hf-export

uv run python examples/conversion/hf_to_megatron_generate_text.py \
--hf_model_path <org>/<model> --prompt "Hello"

uv run python examples/conversion/hf_to_megatron_generate_vlm.py \
--hf_model_path <org>/<model> \
--image_path "https://example.com/image.jpeg" \
--prompt "Describe this image."

uv run python -m pytest tests/unit_tests/models/<model>/ -v
uv run python -m pytest tests/functional_tests/models/<model>/ -v --run-gpu

User wants to add a model
│
├─ Has HF link? ─── No ──→ Ask for link (or config.json if private)
│
├─ Has text_config + vision_config? ─── Yes ──→ VLM path
│   ├─ Has Megatron vision encoder? ──→ Megatron encoder (Qwen3.5 pattern)
│   └─ No Megatron encoder ──→ HF encoder (Gemma3 pattern)
│
└─ No vision config ──→ LLM path (bridge only, no provider file)
    ├─ Standard GPT-style? ──→ Bridge with stock mappings
    └─ Custom layers? ──→ Bridge + local mapping subclasses / hook overrides
        ├─ Custom weight layout? ──→ Local mapping subclass in family dir
        └─ Custom import/export? ──→ Override bridge hooks (maybe_modify_*)