Loading...
Loading...
Set up monitoring, logging, and observability for applications and infrastructure. Use when implementing health checks, metrics collection, log aggregation, or alerting systems. Handles Prometheus, Grafana, ELK Stack, Datadog, and monitoring best practices.
npx skill4agent add supercent-io/skills-template monitoring-observabilityimport express from 'express';
import promClient from 'prom-client';
const app = express();
// Default metrics (CPU, Memory, etc.)
promClient.collectDefaultMetrics();
// Custom metrics
const httpRequestDuration = new promClient.Histogram({
name: 'http_request_duration_seconds',
help: 'Duration of HTTP requests in seconds',
labelNames: ['method', 'route', 'status_code']
});
const httpRequestTotal = new promClient.Counter({
name: 'http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'route', 'status_code']
});
// Middleware to track requests
app.use((req, res, next) => {
const start = Date.now();
res.on('finish', () => {
const duration = (Date.now() - start) / 1000;
const labels = {
method: req.method,
route: req.route?.path || req.path,
status_code: res.statusCode
};
httpRequestDuration.observe(labels, duration);
httpRequestTotal.inc(labels);
});
next();
});
// Metrics endpoint
app.get('/metrics', async (req, res) => {
res.set('Content-Type', promClient.register.contentType);
res.end(await promClient.register.metrics());
});
app.listen(3000);global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: 'my-app'
static_configs:
- targets: ['localhost:3000']
metrics_path: '/metrics'
- job_name: 'node-exporter'
static_configs:
- targets: ['localhost:9100']
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
rule_files:
- 'alert_rules.yml'groups:
- name: application_alerts
interval: 30s
rules:
# High error rate
- alert: HighErrorRate
expr: |
(
sum(rate(http_requests_total{status_code=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value }}% (threshold: 5%)"
# Slow response time
- alert: SlowResponseTime
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
) > 1
for: 10m
labels:
severity: warning
annotations:
summary: "Slow response time"
description: "95th percentile is {{ $value }}s"
# Pod down
- alert: PodDown
expr: up{job="my-app"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Pod is down"
description: "{{ $labels.instance }} has been down for more than 2 minutes"
# High memory usage
- alert: HighMemoryUsage
expr: |
(
node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
) / node_memory_MemTotal_bytes > 0.90
for: 5m
labels:
severity: warning
annotations:
summary: "High memory usage"
description: "Memory usage is {{ $value }}%"import winston from 'winston';
const logger = winston.createLogger({
level: process.env.LOG_LEVEL || 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json()
),
defaultMeta: {
service: 'my-app',
environment: process.env.NODE_ENV
},
transports: [
new winston.transports.Console({
format: winston.format.combine(
winston.format.colorize(),
winston.format.simple()
)
}),
new winston.transports.File({
filename: 'logs/error.log',
level: 'error'
}),
new winston.transports.File({
filename: 'logs/combined.log'
})
]
});
// Usage
logger.info('User logged in', { userId: '123', ip: '1.2.3.4' });
logger.error('Database connection failed', { error: err.message, stack: err.stack });
// Express middleware
app.use((req, res, next) => {
logger.info('HTTP Request', {
method: req.method,
path: req.path,
ip: req.ip,
userAgent: req.get('user-agent')
});
next();
});{
"dashboard": {
"title": "Application Metrics",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{route}}"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total{status_code=~\"5..\"}[5m])",
"legendFormat": "Errors"
}
]
},
{
"title": "Response Time (p95)",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))"
}
]
},
{
"title": "CPU Usage",
"type": "gauge",
"targets": [
{
"expr": "rate(process_cpu_seconds_total[5m]) * 100"
}
]
}
]
}
}interface HealthStatus {
status: 'healthy' | 'degraded' | 'unhealthy';
timestamp: string;
uptime: number;
checks: {
database: { status: string; latency?: number; error?: string };
redis: { status: string; latency?: number };
externalApi: { status: string; latency?: number };
};
}
app.get('/health', async (req, res) => {
const startTime = Date.now();
const health: HealthStatus = {
status: 'healthy',
timestamp: new Date().toISOString(),
uptime: process.uptime(),
checks: {
database: { status: 'unknown' },
redis: { status: 'unknown' },
externalApi: { status: 'unknown' }
}
};
// Database check
try {
const dbStart = Date.now();
await db.raw('SELECT 1');
health.checks.database = {
status: 'healthy',
latency: Date.now() - dbStart
};
} catch (error) {
health.status = 'unhealthy';
health.checks.database = {
status: 'unhealthy',
error: error.message
};
}
// Redis check
try {
const redisStart = Date.now();
await redis.ping();
health.checks.redis = {
status: 'healthy',
latency: Date.now() - redisStart
};
} catch (error) {
health.status = 'degraded';
health.checks.redis = { status: 'unhealthy' };
}
const statusCode = health.status === 'healthy' ? 200 : health.status === 'degraded' ? 200 : 503;
res.status(statusCode).json(health);
});Golden Signals:
1. Latency (응답 시간)
- P50, P95, P99 percentiles
- API 엔드포인트별
2. Traffic (요청량)
- Requests per second
- 엔드포인트별, 상태 코드별
3. Errors (에러율)
- 5xx error rate
- 4xx error rate
- 에러 타입별
4. Saturation (리소스 사용률)
- CPU 사용률
- 메모리 사용률
- Disk I/O
- Network bandwidth#monitoring#observability#Prometheus#Grafana#logging#metrics#infrastructure