Monitoring and Observability Guide 2026: Prometheus, Grafana, OpenTelemetry
Observability 2026: Know Before Your Users Do
You shouldn't find out about production problems from user complaints. Proper observability means you know something is wrong before users notice.
- The Three Pillars
- Structured Logging with Pino
- Prometheus Metrics
- Grafana Dashboard
- Distributed Tracing with OpenTelemetry
- Error Tracking with Sentry
- Alerting Rules
The Three Pillars
Logs → What happened (events with context)
Metrics → How many / how fast (numbers over time)
Traces → Why it's slow (request journey through services)
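The pillars pay off when they are correlated: a log line that carries the active trace ID lets you jump from a suspicious log entry straight to the trace that produced it. A minimal sketch using pino's mixin option and the OpenTelemetry API (both tools are configured later in this guide; this is one way to join logs to traces, not the only one):

import pino from 'pino'
import { trace } from '@opentelemetry/api'

// Attach the active trace/span IDs to every log line,
// so logs can be joined with traces in your backend
export const logger = pino({
  mixin() {
    const span = trace.getActiveSpan()
    if (!span) return {}
    const { traceId, spanId } = span.spanContext()
    return { traceId, spanId }
  },
})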
Structured Logging with Pino
// lib/logger.ts
import pino from 'pino'

export const logger = pino({
  level: process.env.LOG_LEVEL || 'info',

  // Pretty print in development
  transport: process.env.NODE_ENV === 'development'
    ? { target: 'pino-pretty', options: { colorize: true } }
    : undefined,

  // Structured fields in production
  base: {
    service: 'api',
    version: process.env.npm_package_version,
    env: process.env.NODE_ENV,
  },

  // Redact sensitive fields
  redact: {
    paths: ['req.headers.authorization', 'body.password', 'body.creditCard'],
    censor: '[REDACTED]',
  },

  // Custom serializers
  serializers: {
    err: pino.stdSerializers.err,
    req: (req) => ({
      method: req.method,
      url: req.url,
      remoteAddress: req.socket.remoteAddress,
    }),
  },
})
// Usage
logger.info({ userId: '123', action: 'login' }, 'User logged in')
logger.error({ err, requestId: req.id }, 'Database query failed')
logger.warn({ threshold: 100, current: 95 }, 'Connection pool near capacity')
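In production (no pino-pretty), the first call above emits one JSON line per event, roughly like this (timestamp and version values are illustrative):

{"level":30,"time":1718000000000,"service":"api","version":"1.4.2","env":"production","userId":"123","action":"login","msg":"User logged in"}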
// Request logging middleware (types come from Express; assigning `req.log`
// assumes the Request type is augmented, as shown below)
import type { Request, Response, NextFunction } from 'express'

export function requestLogger() {
  return (req: Request, res: Response, next: NextFunction) => {
    const start = Date.now()
    const requestId = crypto.randomUUID() // global in Node 19+

    req.log = logger.child({ requestId })

    res.on('finish', () => {
      const duration = Date.now() - start

      req.log.info({
        method: req.method,
        url: req.url,
        statusCode: res.statusCode,
        duration,
        contentLength: res.getHeader('content-length'),
      }, 'Request completed')

      // Alert on slow requests
      if (duration > 1000) {
        req.log.warn({ duration }, 'Slow request')
      }
    })

    next()
  }
}
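For `req.log = logger.child(...)` to type-check, Express's `Request` interface needs a declaration-merged `log` property. A minimal sketch (the file path is hypothetical; any .d.ts file in your compilation will do):

// types/express.d.ts (hypothetical path)
import type { Logger } from 'pino'

declare global {
  namespace Express {
    interface Request {
      log: Logger
    }
  }
}

export {}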
Prometheus Metrics
import { Registry, Counter, Histogram, Gauge } from 'prom-client'

const register = new Registry()

// Request metrics
const httpRequestTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'route', 'status'],
  registers: [register],
})

const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
  registers: [register],
})

// Business metrics (the `_total` suffix is reserved for counters
// by Prometheus naming conventions, so the gauge goes without it)
const activeUsers = new Gauge({
  name: 'active_users',
  help: 'Number of active users in last 5 minutes',
  registers: [register],
})

const postsCreatedTotal = new Counter({
  name: 'posts_created_total',
  help: 'Total posts created',
  labelNames: ['category'],
  registers: [register],
})

// Middleware to track metrics
export function metricsMiddleware() {
  return (req: Request, res: Response, next: NextFunction) => {
    const start = process.hrtime.bigint()

    res.on('finish', () => {
      const duration = Number(process.hrtime.bigint() - start) / 1e9
      const labels = {
        method: req.method,
        route: req.route?.path || 'unknown',
        status: res.statusCode.toString(),
      }

      httpRequestTotal.inc(labels)
      httpRequestDuration.observe(labels, duration)
    })

    next()
  }
}

// Expose metrics endpoint
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType)
  res.end(await register.metrics())
})
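prom-client can also export Node.js runtime metrics (event loop lag, heap usage, GC) alongside your custom ones, and the business metrics above are updated wherever the events happen in application code. A short sketch (createPost and countActiveUsers are hypothetical helpers for illustration):

import { collectDefaultMetrics } from 'prom-client'

// Event loop lag, heap, GC, etc., exposed on the same /metrics endpoint
collectDefaultMetrics({ register })

// Increment counters where the business event occurs
async function createPost(input: { category: string }) {
  // ... persist the post (illustrative) ...
  postsCreatedTotal.inc({ category: input.category })
}

// Refresh the gauge periodically, e.g. from a session store
setInterval(async () => {
  activeUsers.set(await countActiveUsers()) // countActiveUsers is hypothetical
}, 60_000)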
Grafana Dashboard
# docker-compose for monitoring stack
version: '3.9'

services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
    depends_on:
      - prometheus

volumes:
  prometheus_data:
  grafana_data:
# prometheus.yml
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'nodejs-app'
    static_configs:
      - targets: ['app:3000']
    metrics_path: '/metrics'

  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
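With the app being scraped, a starter dashboard usually needs three panels driven by the metrics defined earlier. Example PromQL for those panels (the layout is up to you):

# Request rate by route (req/s)
sum by (route) (rate(http_requests_total[5m]))

# Error ratio (share of 5xx responses)
sum(rate(http_requests_total{status=~"5.."}[5m]))
  / sum(rate(http_requests_total[5m]))

# P95 latency in seconds
histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))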
Distributed Tracing with OpenTelemetry
// tracing.ts — Initialize before app startup
import { NodeSDK } from '@opentelemetry/sdk-node'
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
import { Resource } from '@opentelemetry/resources'
import { SEMRESATTRS_SERVICE_NAME, SEMRESATTRS_SERVICE_VERSION } from '@opentelemetry/semantic-conventions'
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node'

const sdk = new NodeSDK({
  resource: new Resource({
    [SEMRESATTRS_SERVICE_NAME]: 'api',
    [SEMRESATTRS_SERVICE_VERSION]: process.env.npm_package_version,
  }),
  traceExporter: new OTLPTraceExporter({
    // `url` expects the full traces path, so use the per-signal env var,
    // not the base OTEL_EXPORTER_OTLP_ENDPOINT
    url: process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT || 'http://tempo:4318/v1/traces',
  }),
  instrumentations: [getNodeAutoInstrumentations({
    '@opentelemetry/instrumentation-http': { enabled: true },
    '@opentelemetry/instrumentation-express': { enabled: true },
    '@opentelemetry/instrumentation-pg': { enabled: true },
    '@opentelemetry/instrumentation-redis': { enabled: true },
  })],
})

sdk.start()
process.on('SIGTERM', () => sdk.shutdown())
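Auto-instrumentation covers HTTP, Express, pg, and Redis; for application-level steps you can open spans manually through the OpenTelemetry API. A sketch (processUpload and doWork are illustrative names):

import { trace, SpanStatusCode } from '@opentelemetry/api'

const tracer = trace.getTracer('api')

export async function processUpload(fileId: string) {
  // startActiveSpan makes this span the parent of anything created inside
  return tracer.startActiveSpan('processUpload', async (span) => {
    try {
      span.setAttribute('file.id', fileId)
      return await doWork(fileId) // doWork is hypothetical
    } catch (err) {
      span.recordException(err as Error)
      span.setStatus({ code: SpanStatusCode.ERROR })
      throw err
    } finally {
      span.end()
    }
  })
}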
Error Tracking with Sentry
// sentry.ts
import * as Sentry from '@sentry/node'

Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.npm_package_version,
  tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
  beforeSend(event, hint) {
    // Filter noise
    const err = hint?.originalException as Error
    if (err?.message?.includes('ECONNREFUSED')) return null // Network errors
    return event
  },
})

// Add user context to error reports (register before your routes)
app.use((req, res, next) => {
  if (req.user) {
    Sentry.setUser({ id: req.user.id, email: req.user.email })
  }
  next()
})

// Capture unhandled errors (register after all routes,
// but before any other error-handling middleware)
app.use(Sentry.expressErrorHandler())

// Capture specific errors
try {
  await riskyOperation()
} catch (err) {
  Sentry.captureException(err, {
    tags: { component: 'payment-processor' },
    extra: { orderId: '123', userId: '456' },
  })
  throw err
}
Alerting Rules
# prometheus/alerts.yml
groups:
  - name: api_alerts
    rules:
      - alert: HighErrorRate
        # Share of requests returning 5xx (a ratio, as humanizePercentage expects),
        # not the raw 5xx rate in req/s
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High error rate: {{ $value | humanizePercentage }}"

      - alert: SlowResponseTime
        expr: histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P95 response time > 1s: {{ $value }}s"

      - alert: DatabaseConnectionsHigh
        expr: pg_stat_database_numbackends > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "DB connections at {{ $value }}/100"
Golden Rule: your on-call rotation should be woken up by automated alerts, not by user complaints. Set up monitoring before you go to production.