Loading...
Loading...
Compare original and translation side by side
slurm:job_namenodesntasks_per_nodetimeaccountpartitioncontainer_imagehf_homeextra_mountsenv_varsmaster_portWORLD_SIZE = nodes * ntasks_per_nodeMASTER_ADDRMASTER_PORTskypilot:cloudacceleratorsnum_nodesuse_spot: truedisk_sizeregionsetupenv_varsstep_scheduler.checkpoint_intervalrestore_from.pathslurm.nsys_enabled: truensys profile.nsys-rep
slurm:
  job_name: llm_finetune
  nodes: 2
  ntasks_per_node: 8
  time: "04:00:00"
  account: my_account
  partition: batch
  container_image: nvcr.io/nvidia/nemo:dev
  hf_home: ~/.cache/huggingface
  master_port: 13742
  env_vars:
    HF_TOKEN: "${HF_TOKEN}"
.nsys-repjob_namenodesntasks_per_nodetimeaccountpartitioncontainer_imagehf_homeextra_mountsenv_varsmaster_portslurm:WORLD_SIZE = nodes * ntasks_per_nodeMASTER_ADDRMASTER_PORTcloudacceleratorsnum_nodesuse_spot: truedisk_sizeregionsetupenv_varsskypilot:step_scheduler.checkpoint_intervalrestore_from.pathslurm.nsys_enabled: truensys profile.nsys-rep
slurm:
  job_name: llm_finetune
  nodes: 2
  ntasks_per_node: 8
  time: "04:00:00"
  account: my_account
  partition: batch
  container_image: nvcr.io/nvidia/nemo:dev
  hf_home: ~/.cache/huggingface
  master_port: 13742
  env_vars:
    HF_TOKEN: "${HF_TOKEN}"
.nsys-repundefinedundefined
No additional YAML section is needed for interactive mode. The CLI routes to torchrun automatically when no `slurm:` or `skypilot:` section is present in the config.
交互式模式无需额外的YAML配置段。当配置文件中不存在`slurm:`或`skypilot:`段时,CLI会自动路由到torchrun。
SlurmConfigSlurmConfig
slurm:
  job_name: llm_finetune
  nodes: 2
  ntasks_per_node: 8
  time: "04:00:00"
  account: my_account
  partition: batch
  container_image: nvcr.io/nvidia/nemo:dev
  hf_home: ~/.cache/huggingface
  extra_mounts:
    - source: /data
      dest: /data
  env_vars:
    WANDB_API_KEY: "${WANDB_API_KEY}"
    HF_TOKEN: "${HF_TOKEN}"
slurm:
  job_name: llm_finetune
  nodes: 2
  ntasks_per_node: 8
  time: "04:00:00"
  account: my_account
  partition: batch
  container_image: nvcr.io/nvidia/nemo:dev
  hf_home: ~/.cache/huggingface
  extra_mounts:
    - source: /data
      dest: /data
  env_vars:
    WANDB_API_KEY: "${WANDB_API_KEY}"
    HF_TOKEN: "${HF_TOKEN}"
job_namenodesntasks_per_nodetimeaccountpartitioncontainer_imagenemo_mounthf_homeextra_mountsVolumeMapping(source, dest)master_portenv_varsnsys_enablednsys profilejob_namenodesntasks_per_nodetimeaccountpartitioncontainer_imagenemo_mounthf_homeextra_mountsVolumeMapping(source, dest)master_portenv_varsnsys_enablednsys profileSkyPilotConfigSkyPilotConfig
skypilot:
  cloud: aws
  accelerators: "H100:8"
  num_nodes: 2
  use_spot: true
  disk_size: 200
  region: us-east-1
  setup: "pip install nemo-automodel"
  env_vars:
    HF_TOKEN: "${HF_TOKEN}"
skypilot:
  cloud: aws
  accelerators: "H100:8"
  num_nodes: 2
  use_spot: true
  disk_size: 200
  region: us-east-1
  setup: "pip install nemo-automodel"
  env_vars:
    HF_TOKEN: "${HF_TOKEN}"
cloudawsgcpazurelambdakubernetesaccelerators"H100:8""A100-80GB:4"num_nodesuse_spotdisk_sizeregionsetupenv_varscloudawsgcpazurelambdakubernetesaccelerators"H100:8""A100-80GB:4"num_nodesuse_spotdisk_sizeregionsetupenv_varsuse_spot: trueskypilot:acceleratorsnum_nodesdisk_sizeregionsetupenv_varsstep_scheduler.checkpoint_intervalrestore_from
step_scheduler:
  checkpoint_interval: 100
restore_from:
  path: /checkpoints/latest
skypilot:use_spot: trueacceleratorsnum_nodesdisk_sizeregionsetupenv_varsstep_scheduler.checkpoint_intervalrestore_from
step_scheduler:
  checkpoint_interval: 100
restore_from:
  path: /checkpoints/latest
MASTER_ADDRMASTER_PORTWORLD_SIZEnodes * ntasks_per_nodeMASTER_ADDRMASTER_PORTWORLD_SIZEnodes * ntasks_per_node
slurm:
  job_name: llm_profile
  nodes: 1
  ntasks_per_node: 8
  time: "00:30:00"
  account: my_account
  partition: batch
  container_image: nvcr.io/nvidia/nemo:dev
  nsys_enabled: true
job_namenodesntasks_per_nodetimeaccountpartitioncontainer_imagensys_enabled: truensys profile.nsys-rep
slurm:
  job_name: llm_profile
  nodes: 1
  ntasks_per_node: 8
  time: "00:30:00"
  account: my_account
  partition: batch
  container_image: nvcr.io/nvidia/nemo:dev
  nsys_enabled: true
job_namenodesntasks_per_nodetimeaccountpartitioncontainer_imagensys_enabled: truensys profile.nsys-repcomponents/launcher/slurm/config.pycomponents/launcher/slurm/template.pycomponents/launcher/slurm/utils.pycomponents/launcher/skypilot/config.py_cli/app.pycomponents/launcher/slurm/config.pycomponents/launcher/slurm/template.pycomponents/launcher/slurm/utils.pycomponents/launcher/skypilot/config.py_cli/app.pymaster_portsourceextra_mountsuse_spot: true${VAR}timemaster_portextra_mountssourceuse_spot: true${VAR}time