Monitoring and Observability Guide 2026: Prometheus, Grafana, OpenTelemetry
Observability 2026: Know Before Your Users Do
You shouldn't find out about production problems from user complaints. Proper observability means you know something is wrong before users notice.
- The Three Pillars
- Structured Logging with Pino
- Prometheus Metrics
- Grafana Dashboard
- Distributed Tracing with OpenTelemetry
- Error Tracking with Sentry
- Alerting Rules
The Three Pillars
Logs → What happened (events with context)
Metrics → How many / how fast (numbers over time)
Traces → Why it's slow (request journey through services)
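The pillars pay off when they are correlated: a log line that carries the active trace ID lets you jump from a suspicious log entry straight to the trace that produced it. A minimal sketch using pino's mixin option and the OpenTelemetry API (both tools are configured later in this guide; this is one way to join logs to traces, not the only one):

import pino from 'pino'
import { trace } from '@opentelemetry/api'

// Attach the active trace/span IDs to every log line,
// so logs can be joined with traces in your backend
export const logger = pino({
  mixin() {
    const span = trace.getActiveSpan()
    if (!span) return {}
    const { traceId, spanId } = span.spanContext()
    return { traceId, spanId }
  },
})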
Structured Logging with Pino
// lib/logger.ts
import pino from 'pino'

export const logger = pino({
  level: process.env.LOG_LEVEL || 'info',

  // Pretty print in development
  transport: process.env.NODE_ENV === 'development'
    ? { target: 'pino-pretty', options: { colorize: true } }
    : undefined,

  // Structured fields in production
  base: {
    service: 'api',
    version: process.env.npm_package_version,
    env: process.env.NODE_ENV,
  },

  // Redact sensitive fields
  redact: {
    paths: ['req.headers.authorization', 'body.password', 'body.creditCard'],
    censor: '[REDACTED]',
  },

  // Custom serializers
  serializers: {
    err: pino.stdSerializers.err,
    req: (req) => ({
      method: req.method,
      url: req.url,
      remoteAddress: req.socket.remoteAddress,
    }),
  },
})
// Usage
logger.info({ userId: '123', action: 'login' }, 'User logged in')
logger.error({ err, requestId: req.id }, 'Database query failed')
logger.warn({ threshold: 100, current: 95 }, 'Connection pool near capacity')
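In production (no pino-pretty), the first call above emits one JSON line per event, roughly like this (timestamp and version values are illustrative):

{"level":30,"time":1718000000000,"service":"api","version":"1.4.2","env":"production","userId":"123","action":"login","msg":"User logged in"}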
// Request logging middleware (types come from Express; assigning `req.log`
// assumes the Request type is augmented, as shown below)
import type { Request, Response, NextFunction } from 'express'

export function requestLogger() {
  return (req: Request, res: Response, next: NextFunction) => {
    const start = Date.now()
    const requestId = crypto.randomUUID() // global in Node 19+

    req.log = logger.child({ requestId })

    res.on('finish', () => {
      const duration = Date.now() - start

      req.log.info({
        method: req.method,
        url: req.url,
        statusCode: res.statusCode,
        duration,
        contentLength: res.getHeader('content-length'),
      }, 'Request completed')

      // Alert on slow requests
      if (duration > 1000) {
        req.log.warn({ duration }, 'Slow request')
      }
    })

    next()
  }
}
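For `req.log = logger.child(...)` to type-check, Express's `Request` interface needs a declaration-merged `log` property. A minimal sketch (the file path is hypothetical; any .d.ts file in your compilation will do):

// types/express.d.ts (hypothetical path)
import type { Logger } from 'pino'

declare global {
  namespace Express {
    interface Request {
      log: Logger
    }
  }
}

export {}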
Prometheus Metrics
import { Registry, Counter, Histogram, Gauge } from 'prom-client'

const register = new Registry()

// Request metrics
const httpRequestTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'route', 'status'],
  registers: [register],
})

const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
  registers: [register],
})

// Business metrics (the `_total` suffix is reserved for counters
// by Prometheus naming conventions, so the gauge goes without it)
const activeUsers = new Gauge({
  name: 'active_users',
  help: 'Number of active users in last 5 minutes',
  registers: [register],
})

const postsCreatedTotal = new Counter({
  name: 'posts_created_total',
  help: 'Total posts created',
  labelNames: ['category'],
  registers: [register],
})

// Middleware to track metrics
export function metricsMiddleware() {
  return (req: Request, res: Response, next: NextFunction) => {
    const start = process.hrtime.bigint()

    res.on('finish', () => {
      const duration = Number(process.hrtime.bigint() - start) / 1e9
      const labels = {
        method: req.method,
        route: req.route?.path || 'unknown',
        status: res.statusCode.toString(),
      }

      httpRequestTotal.inc(labels)
      httpRequestDuration.observe(labels, duration)
    })

    next()
  }
}

// Expose metrics endpoint
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType)
  res.end(await register.metrics())
})
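prom-client can also export Node.js runtime metrics (event loop lag, heap usage, GC) alongside your custom ones, and the business metrics above are updated wherever the events happen in application code. A short sketch (createPost and countActiveUsers are hypothetical helpers for illustration):

import { collectDefaultMetrics } from 'prom-client'

// Event loop lag, heap, GC, etc., exposed on the same /metrics endpoint
collectDefaultMetrics({ register })

// Increment counters where the business event occurs
async function createPost(input: { category: string }) {
  // ... persist the post (illustrative) ...
  postsCreatedTotal.inc({ category: input.category })
}

// Refresh the gauge periodically, e.g. from a session store
setInterval(async () => {
  activeUsers.set(await countActiveUsers()) // countActiveUsers is hypothetical
}, 60_000)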
Grafana Dashboard
# docker-compose for monitoring stack
version: '3.9'

services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
    depends_on:
      - prometheus

volumes:
  prometheus_data:
  grafana_data:
# prometheus.yml
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'nodejs-app'
    static_configs:
      - targets: ['app:3000']
    metrics_path: '/metrics'

  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
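With the app being scraped, a starter dashboard usually needs three panels driven by the metrics defined earlier. Example PromQL for those panels (the layout is up to you):

# Request rate by route (req/s)
sum by (route) (rate(http_requests_total[5m]))

# Error ratio (share of 5xx responses)
sum(rate(http_requests_total{status=~"5.."}[5m]))
  / sum(rate(http_requests_total[5m]))

# P95 latency in seconds
histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))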
Distributed Tracing with OpenTelemetry
// tracing.ts — Initialize before app startup
import { NodeSDK } from '@opentelemetry/sdk-node'
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
import { Resource } from '@opentelemetry/resources'
import { SEMRESATTRS_SERVICE_NAME, SEMRESATTRS_SERVICE_VERSION } from '@opentelemetry/semantic-conventions'
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node'

const sdk = new NodeSDK({
  resource: new Resource({
    [SEMRESATTRS_SERVICE_NAME]: 'api',
    [SEMRESATTRS_SERVICE_VERSION]: process.env.npm_package_version,
  }),
  traceExporter: new OTLPTraceExporter({
    // `url` expects the full traces path, so use the per-signal env var,
    // not the base OTEL_EXPORTER_OTLP_ENDPOINT
    url: process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT || 'http://tempo:4318/v1/traces',
  }),
  instrumentations: [getNodeAutoInstrumentations({
    '@opentelemetry/instrumentation-http': { enabled: true },
    '@opentelemetry/instrumentation-express': { enabled: true },
    '@opentelemetry/instrumentation-pg': { enabled: true },
    '@opentelemetry/instrumentation-redis': { enabled: true },
  })],
})

sdk.start()
process.on('SIGTERM', () => sdk.shutdown())
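Auto-instrumentation covers HTTP, Express, pg, and Redis; for application-level steps you can open spans manually through the OpenTelemetry API. A sketch (processUpload and doWork are illustrative names):

import { trace, SpanStatusCode } from '@opentelemetry/api'

const tracer = trace.getTracer('api')

export async function processUpload(fileId: string) {
  // startActiveSpan makes this span the parent of anything created inside
  return tracer.startActiveSpan('processUpload', async (span) => {
    try {
      span.setAttribute('file.id', fileId)
      return await doWork(fileId) // doWork is hypothetical
    } catch (err) {
      span.recordException(err as Error)
      span.setStatus({ code: SpanStatusCode.ERROR })
      throw err
    } finally {
      span.end()
    }
  })
}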
Error Tracking with Sentry
// sentry.ts
import * as Sentry from '@sentry/node'

Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.npm_package_version,
  tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
  beforeSend(event, hint) {
    // Filter noise
    const err = hint?.originalException as Error
    if (err?.message?.includes('ECONNREFUSED')) return null // Network errors
    return event
  },
})

// Add user context to error reports (register before your routes)
app.use((req, res, next) => {
  if (req.user) {
    Sentry.setUser({ id: req.user.id, email: req.user.email })
  }
  next()
})

// Capture unhandled errors (register after all routes,
// but before any other error-handling middleware)
app.use(Sentry.expressErrorHandler())

// Capture specific errors
try {
  await riskyOperation()
} catch (err) {
  Sentry.captureException(err, {
    tags: { component: 'payment-processor' },
    extra: { orderId: '123', userId: '456' },
  })
  throw err
}
Alerting Rules
# prometheus/alerts.yml
groups:
  - name: api_alerts
    rules:
      - alert: HighErrorRate
        # Share of requests returning 5xx (a ratio, as humanizePercentage expects),
        # not the raw 5xx rate in req/s
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High error rate: {{ $value | humanizePercentage }}"

      - alert: SlowResponseTime
        expr: histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P95 response time > 1s: {{ $value }}s"

      - alert: DatabaseConnectionsHigh
        expr: pg_stat_database_numbackends > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "DB connections at {{ $value }}/100"
Golden Rule: your on-call rotation should be woken up by automated alerts, not by user complaints. Set up monitoring before you go to production.