Loading...
Loading...
Kubernetes execution platform — submits TAO container jobs as single-pod k8s Jobs with NVIDIA GPU scheduling. Use when running on EKS / GKE / AKS / on-prem clusters with the NVIDIA GPU Operator installed, or when integrating TAO into an existing k8s-native ML platform.
npx skill4agent add nvidia/skills tao-run-on-kubernetes
`num_nodes > 1`
# 0. GPU node host runtime.
# Execute on every self-managed GPU worker node, or bake it into the node image build.
# Managed GPU nodes, where the cloud provider or GPU Operator policy owns the
# driver/toolkit lifecycle, are the only case for TAO_K8S_SKIP_NODE_RUNTIME_CHECK=1.
if [ "${TAO_K8S_SKIP_NODE_RUNTIME_CHECK:-0}" != "1" ]; then
    # Skill bank root defaults to the current directory.
    TAO_SKILL_BANK_ROOT="${TAO_SKILL_BANK_ROOT:-$PWD}"
    SETUP_SCRIPT="${TAO_SKILL_BANK_ROOT}/skills/tao-setup-nvidia-gpu-host/scripts/setup-nvidia-gpu-host.sh"
    # Use the platform/ layout when the skills/ copy is absent or not executable.
    if [ ! -x "$SETUP_SCRIPT" ]; then
        SETUP_SCRIPT="${TAO_SKILL_BANK_ROOT}/platform/tao-setup-nvidia-gpu-host/scripts/setup-nvidia-gpu-host.sh"
    fi
    # --check-only: a non-zero exit is reported below and aborts the preflight.
    if ! bash "$SETUP_SCRIPT" --backend kubernetes --check-only; then
        echo "MISSING: TAO Kubernetes GPU node runtime is not ready."
        echo "For self-managed GPU nodes, run after user approval:"
        echo " bash \"$SETUP_SCRIPT\" --backend kubernetes --install --yes"
        echo "For managed clusters, verify the node image/GPU Operator policy installs driver 580 and toolkit 1.19.0, then set TAO_K8S_SKIP_NODE_RUNTIME_CHECK=1."
        exit 1
    fi
fi
# 1. SDK + kubernetes extra installed.
# nvidia-tao-sdk ships on public PyPI; the pinned requirement is stored in
# versions.yaml under wheels.tao_sdk_kubernetes and is resolved here so both
# hints below print the same install command.
PIN=$("${TAO_SKILL_BANK_PATH:?}/scripts/resolve_versions_key.py" wheels.tao_sdk_kubernetes)

# The SDK itself must be importable.
if ! python -c "import tao_sdk" 2>/dev/null; then
    echo "MISSING: nvidia-tao-sdk not installed. Run:"
    echo " pip install \"$PIN\""
    exit 1
fi

# The kubernetes client must be importable as well.
if ! python -c "import kubernetes" 2>/dev/null; then
    echo "MISSING: kubernetes extra not installed. Run:"
    echo " pip install \"$PIN\""
    exit 1
fi
# 2. Cluster reachable (kubeconfig OR in-cluster service account)
# The local kubeconfig is tried first; the in-cluster loader runs only when
# that fails. The preflight aborts only when neither configuration loads.
if ! python -c "from kubernetes import config; config.load_kube_config()" 2>/dev/null &&
   ! python -c "from kubernetes import config; config.load_incluster_config()" 2>/dev/null; then
    echo "MISSING: no kubeconfig at ~/.kube/config and not running in a pod."
    echo "Configure kubectl (e.g., 'aws eks update-kubeconfig --name my-cluster') or set \$KUBECONFIG."
    exit 1
fi
# 3. NVIDIA GPU Operator present (soft check — warn if kubectl available, don't fail)
if command -v kubectl >/dev/null 2>&1; then
    # The jsonpath prints one line per node: its allocatable nvidia.com/gpu
    # value, or a blank line when the node does not advertise the resource.
    # Blank AND all-zero lines are discarded before taking the first match, so
    # a node that advertises 0 GPUs cannot hide another node that has GPUs.
    gpu=$(kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}' 2>/dev/null | grep -Ev '^0*$' | head -1)
    # Empty means no node reported a non-zero count (or kubectl itself failed).
    if [ -z "$gpu" ]; then
        echo "WARN: no nvidia.com/gpu allocatable on this cluster."
        echo "Install the NVIDIA GPU Operator before submitting GPU jobs:"
        echo " https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html"
    fi
fi
`TAO_K8S_SKIP_NODE_RUNTIME_CHECK=1` `kubectl` `KubernetesSDK.create_job()` `~/.kube/config` `$KUBECONFIG` `default` `create_job` `image_pull_secret` `inputs` `outputs` `s3://` `HF_TOKEN` `kubectl` `tao-skill-bank:tao-run-platform` `create_job` `build_entrypoint` `ActionWorkflow`
from tao_sdk.platforms.kubernetes import KubernetesSDK
sdk = KubernetesSDK() # auto-detects auth
job = sdk.create_job(
image='nvcr.io/nvidia/tao/tao-toolkit:6.26.3-pyt',
command='dino train -e /tmp/spec.yaml',
gpu_count=1,
env_vars={'NGC_KEY': os.environ['NGC_KEY']},
inputs={'/data/train.json': 's3://bucket/coco/train.json'},
outputs=['/results/'],
namespace='tao-jobs', # optional override
image_pull_secret='ngc-pull-secret', # optional, pre-created
node_selector={'gpu-type': 'h100'}, # optional
)
`V1Job` `spec.template.spec.containers[0]` `command=["/bin/bash", "-c", <command>]` `resources.limits["nvidia.com/gpu"]: <gpu_count>` `env_vars` `script_runner` `restart_policy=Never` `backoff_limit=0` `ttl_seconds_after_finished=3600`
status = sdk.get_job_status(job.id)
# status.status ∈ {"Pending", "Running", "Complete", "Error", "Canceled", "Unknown"}
logs = sdk.get_job_logs(job.id, tail=200) # concatenates logs from all pods of the Job
# For stuck-Pending jobs — replica diagnostics:
for r in sdk.get_job_replicas(job.id):
issue = r["status"].get("readiness_issue")
if issue:
print(issue["reason"], issue["message"])
# e.g. "ImagePullBackOff" / "Back-off pulling image..."
# e.g. "Pending" / "0/3 nodes available: 3 Insufficient nvidia.com/gpu"
# On failure:
analysis = sdk.get_failure_analysis(job.id)
# {"err_class": "ERR_PROGRAM" | "ERR_INFRA",
# "suggestion": "Container OOM-killed. Reduce batch size...",
# "job_failure_by_node_event": [{"node_event_name": "OOMKilled", ...}]}
sdk.cancel_job(job.id) # delete_namespaced_job with propagation_policy="Foreground"
`ttl_seconds_after_finished=3600` `cancel_job` `nvidia.com/gpu` `tao-setup-nvidia-gpu-host`
bash skills/platform/tao-setup-nvidia-gpu-host/scripts/setup-nvidia-gpu-host.sh --backend kubernetes --install --yes
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
helm repo update
helm install --wait gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator
`num_nodes > 1` `create_job()` `job-name=<job-name>` `clusterIP: None` `publishNotReadyAddresses: true` `parallelism = completions = num_nodes` `completionMode: Indexed` `JOB_COMPLETION_INDEX`

| Env var | Value | Read by |
|---|---|---|
| | TAO PyTorch container's |
| | TAO PyTorch container's entrypoint |
| | |
| | |
| | both |
| | both |
| | both (TAO's default) |

`dino train` `torchrun`
job = sdk.create_job(
image='nvcr.io/nvidia/tao/tao-toolkit:6.26.3-pyt',
command='dino train -e /tmp/spec.yaml', # TAO entrypoint reads spec.train.num_nodes; env vars are wired by the container
gpu_count=8, # GPUs per node
num_nodes=4, # 4 × 8 = 32 GPUs total
inputs={'/data/train.json': 's3://bucket/coco/train.json'},
outputs=['/results/'],
)
`torchrun`
job = sdk.create_job(
image='nvcr.io/nvidia/pytorch:25.08-py3',
command='torchrun --nnodes=$NNODES --nproc-per-node=$NPROC_PER_NODE --node-rank=$NODE_RANK '
'--master-addr=$MASTER_ADDR --master-port=$MASTER_PORT train.py',
gpu_count=8,
num_nodes=4,
)
`gpu_count × num_nodes` `nvidia.com/gpu` `PodIndexLabel` `MASTER_ADDR=<job>-0.<svc>` `kubectl version` `MASTER_PORT` `NCCL_SOCKET_IFNAME` `NCCL_IB_HCA` `env_vars` `PyTorchJob` `TFJob` `No nvidia.com/gpu resources allocatable on the cluster` `kubectl get nodes -o jsonpath='{.items[*].status.allocatable}'` `ImagePullBackOff` `ErrImagePull` `image_pull_secret`
kubectl create secret docker-registry ngc-pull-secret \
--docker-server=nvcr.io \
--docker-username='$oauthtoken' \
--docker-password=$NGC_KEY -n tao-jobs
`Pending` `get_job_replicas(job_id)` `Insufficient nvidia.com/gpu` `node_selector` `OOMKilled` `CredentialError: Could not authenticate to a Kubernetes cluster` `kubectl get nodes` `$KUBECONFIG` `backoff_limit=0` `PyTorchJob` `MASTER_ADDR:MASTER_PORT` `$NGC_KEY`