Loading...
Loading...
Brev managed GPU instances with Docker support. Use when running TAO training, evaluation, or inference on Brev GPU instances, managing Brev deployments, or dispatching TAO jobs through the Brev CLI. Trigger phrases include "run on Brev", "Brev GPU instance", "submit job to Brev", "Brev CLI deployment".
npx skill4agent add promptingcompany/nv-skills tao-run-on-brev
`brev exec` `brev` `brev-cli`
# 1. brev CLI installed
# `command -v` probes PATH without running the tool; when brev is absent,
# print the install pointer and stop before any later step can fail obscurely.
if ! command -v brev >/dev/null 2>&1; then
  echo "MISSING: brev CLI not installed. Install:"
  echo " https://docs.nvidia.com/brev/"
  exit 1
fi
# 2. brev-cli agent skill installed — it provides the brev CLI's command
#    reference to the agent. The skill directory may live under $HOME/.claude
#    or under .claude in the current directory; either one passes the check.
if [ ! -d "$HOME/.claude/skills/brev-cli" ] && [ ! -d ".claude/skills/brev-cli" ]; then
  echo "MISSING: brev-cli agent skill not installed. Run:"
  echo " brev agent-skill install"
  exit 1
fi
# 3. brev login active. When running headless, token-login comes first:
#    a plain `brev ls` drops into an interactive auth prompt (read: EOF on
#    stdin) even with BREV_API_TOKEN set, so the session is refreshed up front.
#    With no token in the environment this step is a no-op.
if [ -n "$BREV_API_TOKEN" ] && ! brev login --token "$BREV_API_TOKEN" >/dev/null 2>&1; then
  echo "MISSING: brev token login failed. Verify BREV_API_TOKEN."
  exit 1
fi
# Cached credentials occasionally desync, and the first `brev ls` then returns
# an auth EOF until the session is rebuilt. On a failed probe, force one token
# re-login (only when a token is available) and probe a second time before
# declaring the user logged out.
if ! brev ls >/dev/null 2>&1; then
  if [ -n "$BREV_API_TOKEN" ]; then
    # Best effort: the result is judged by the second `brev ls`, not here.
    brev login --token "$BREV_API_TOKEN" >/dev/null 2>&1
  fi
  if ! brev ls >/dev/null 2>&1; then
    echo "MISSING: not logged in to brev. Run:"
    echo " brev login # interactive (opens browser)"
    echo " # or set BREV_API_TOKEN in ~/.config/tao/.env (then 'brev login --token \$BREV_API_TOKEN')"
    exit 1
  fi
fi
brev exec docker run …script_runnernvidia-tao-sdkversions.yamlpip install "$("${TAO_SKILL_BANK_PATH:?}/scripts/resolve_versions_key.py" wheels.tao_sdk_brev)"tao-skill-bank:tao-run-platformBrevSDKbuild_entrypointActionWorkflowBREV_API_TOKEN~/.config/tao/.envbrev login --tokenbrev loginbrev login --token "$BREV_API_TOKEN"brevEOFbrev lsbrev createbrev execBREV_API_TOKENbrevbrev ls --jsonbrev login --tokens3://ACCESS_KEYSECRET_KEYaws s3 ls/pathHF_TOKENinstance_idbackend_detailsinstance_idbrev create
brev create my-instance \
--gpu L40S:1 \
--cloud-cred-id <cloudCredId> \
--workspace-group-id <workspaceGroupId>
`~/.config/tao/.env`
brev ls --json | jq -r '.workspaces[0].workspaceGroupId' # default group
brev orgs --json | jq -r '.[0].cloudCredentials[].id' # cloud credential
`backend_details`
BrevSDK().create_job(
...,
backend_details={
"cloud_cred_id": "<cloudCredId>",
"workspace_group_id": "<workspaceGroupId>",
},
)
`gpu_count` `torchrun --nproc-per-node=N` `brev search` `--gpu-name` `--min-vram` `$HOME`
# NGC auth (one-time per instance)
brev exec <instance> -- docker login nvcr.io -u '$oauthtoken' -p <NGC_KEY>
# Run a TAO training job
brev exec <instance> -- docker run --gpus all --rm \
-v $HOME/data:/data \
nvcr.io/nvidia/tao/tao-toolkit:6.26.3-pyt \
visual_changenet train -e /data/spec.yaml
`brev exec` `RUNNING` `brev exec` `hostname not resolvable` `Connection refused`
# Wait up to 5 minutes for shell readiness — covers the SSH bring-up window.
# Poll with a no-op remote command: at most 60 probes, 5 seconds apart.
# The loop ends early on the first probe that succeeds.
probes=0
while [ "$probes" -lt 60 ]; do
  if brev exec <instance> -- true >/dev/null 2>&1; then
    break
  fi
  sleep 5
  probes=$((probes + 1))
done
# One final probe decides the outcome, so an exhausted loop is reported
# as a failure instead of silently falling through.
if ! brev exec <instance> -- true >/dev/null 2>&1; then
  echo "instance <instance> never became exec-ready"
  exit 1
fi
brev execbrev exectimeoutexec failed
{"skill": "vcn-gap-analysis", "action": "analyze", "platform": "brev"},
{"skill": "visual-changenet", "action": "train", "platform": "lepton"}
brev delete <instance> # plain delete — no flags
`--yes` `-y` `unknown flag: --yes` `brev delete <instance>` `brev ls` `BREV_API_TOKEN` `brev login --token "$BREV_API_TOKEN"` `brev login --token` `brev login` `brev create` `cloudCredId` `workspaceGroupId` `--cloud-cred-id` `--workspace-group-id` `brev exec` `hostname not resolvable` `Connection refused` `RUNNING` `brev exec` `exec failed` `brev exec` `brev exec` `brev delete --yes` `unknown flag: --yes` `brev delete <instance>` `--gpu-name` `docker login nvcr.io`