# Monitoring & Observability
Prometheus, Grafana, CloudWatch, Azure Monitor, Stackdriver, logging, alerting, and SRE practices
npx skill4agent add davincidreams/agent-team-plugins monitoring-observability

# CPU usage rate
rate(process_cpu_seconds_total[5m])
# Request error rate
rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m])
# P95 latency
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# Memory usage
process_resident_memory_bytes / node_memory_MemTotal_bytes * 100

// Alarm configuration
{
"AlarmName": "HighCPUUsage",
"MetricName": "CPUUtilization",
"Namespace": "AWS/EC2",
"Statistic": "Average",
"Period": 300,
"EvaluationPeriods": 2,
"Threshold": 80,
"ComparisonOperator": "GreaterThanThreshold"
}
// Metric filter
{
"filterPattern": "[timestamp, request_id, status_code, latency]",
"metricTransformations": [
{
"metricName": "RequestLatency",
"metricNamespace": "Application",
"metricValue": "$latency"
}
]
}

// KQL query for error rate
requests
| where timestamp > ago(1h)
| summarize total = count(), failures = countif(success == false)
| project error_rate = 100.0 * failures / total
// Query for slow requests
requests
| where timestamp > ago(1h)
| where duration > 1000
| summarize count() by name
| top 10 by count_
// Query for exceptions
exceptions
| where timestamp > ago(1h)
| summarize count() by type, problemId
| top 10 by count_

# Alerting policy
displayName: "High Error Rate"
conditions:
- displayName: "Error rate > 5%"
conditionThreshold:
filter: 'metric.type="custom.googleapis.com/error_rate"'
comparison: COMPARISON_GT
thresholdValue: 0.05
duration: 300s
aggregations:
- alignmentPeriod: 60s
    perSeriesAligner: ALIGN_RATE

# SLO configuration
slo_name: "API Availability"
sli_name: "api_availability"
slo_target: 0.999
slo_window: 30d
alert_threshold: 0.998
# SLI calculation
api_availability = 1 - (error_count / total_count)
# Error budget
error_budget = 1 - slo_target
error_budget_remaining = current_availability - slo_target