Loading...
Loading...
Comprehensive Kubernetes debugging and troubleshooting toolkit. Use this skill when diagnosing Kubernetes cluster issues, debugging failing pods, investigating network connectivity problems, analyzing resource usage, troubleshooting deployments, or performing cluster health checks.
npx skill4agent add akin-ozer/cc-devops-skills k8s-debug
scripts/pod_diagnostics.py
python3 scripts/pod_diagnostics.py <pod-name> -n <namespace>
python3 scripts/pod_diagnostics.py <pod-name> -n <namespace> -o diagnostics.txt
scripts/cluster_health.sh
./scripts/cluster_health.sh
scripts/network_debug.sh
./scripts/network_debug.sh <namespace> <pod-name>
references/troubleshooting_workflow.md
references/common_issues.md
# Quick assessment
kubectl get pod <pod-name> -n <namespace>
kubectl describe pod <pod-name> -n <namespace>
# Detailed diagnostics
python3 scripts/pod_diagnostics.py <pod-name> -n <namespace>
# Check common causes:
# - ImagePullBackOff: Verify image exists and credentials
# - CrashLoopBackOff: Check logs with --previous flag
# - Pending: Check node resources and scheduling
# Verify service and endpoints
kubectl get svc <service-name> -n <namespace>
kubectl get endpoints <service-name> -n <namespace>
# Network diagnostics
./scripts/network_debug.sh <namespace> <pod-name>
# Test connectivity from debug pod
kubectl run tmp-shell --rm -i --tty --image nicolaka/netshoot -- /bin/bash
# Inside: curl <service-name>.<namespace>.svc.cluster.local:<port>
# Check network policies
kubectl get networkpolicies -n <namespace>
# Check resource usage
kubectl top nodes
kubectl top pods -n <namespace> --containers
# Get pod metrics
kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 10 resources
# Check for OOMKilled
kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 10 lastState
# Review application logs
kubectl logs <pod-name> -n <namespace> --tail=100
# Run comprehensive health check
./scripts/cluster_health.sh > cluster-health-$(date +%Y%m%d-%H%M%S).txt
# Review output for:
# - Node conditions and resource pressure
# - Failed or pending pods
# - Recent error events
# - Component health status
# - Resource quota usage
# View pod status
kubectl get pods -n <namespace> -o wide
# Detailed pod information
kubectl describe pod <pod-name> -n <namespace>
# View logs
kubectl logs <pod-name> -n <namespace>
kubectl logs <pod-name> -n <namespace> --previous # Previous container
kubectl logs <pod-name> -n <namespace> -c <container> # Specific container
# Execute commands in pod
kubectl exec <pod-name> -n <namespace> -it -- /bin/sh
# Get pod YAML
kubectl get pod <pod-name> -n <namespace> -o yaml
# Check services
kubectl get svc -n <namespace>
kubectl describe svc <service-name> -n <namespace>
# Check endpoints
kubectl get endpoints -n <namespace>
# Test DNS
kubectl exec <pod-name> -n <namespace> -- nslookup kubernetes.default
# View events
kubectl get events -n <namespace> --sort-by='.lastTimestamp'
# Node resources
kubectl top nodes
kubectl describe nodes
# Pod resources
kubectl top pods -n <namespace>
kubectl top pod <pod-name> -n <namespace> --containers
# Restart deployment
kubectl rollout restart deployment/<name> -n <namespace>
# Rollback deployment
kubectl rollout undo deployment/<name> -n <namespace>
# Force delete stuck pod
kubectl delete pod <pod-name> -n <namespace> --force --grace-period=0
# Drain node (maintenance)
kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data
# Cordon node (prevent scheduling)
kubectl cordon <node-name>
references/troubleshooting_workflow.md
references/common_issues.md
# Attach ephemeral debug container
kubectl debug <pod-name> -n <namespace> -it --image=nicolaka/netshoot
# Create debug copy of pod
kubectl debug <pod-name> -n <namespace> -it --copy-to=<debug-pod-name> --container=<container>
# Forward pod port to local machine
kubectl port-forward pod/<pod-name> -n <namespace> <local-port>:<pod-port>
# Forward service port
kubectl port-forward svc/<service-name> -n <namespace> <local-port>:<service-port>
# Start kubectl proxy
kubectl proxy --port=8080
# Access API
curl http://localhost:8080/api/v1/namespaces/<namespace>/pods/<pod-name>
# Custom pod info
kubectl get pods -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,IP:.status.podIP
# Node taints
kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints
kubectl describe pod