Published on

Building Observability From Scratch — Metrics, Logs, and Traces Without the Complexity

Authors

Introduction

Observability is not monitoring dashboards or log aggregation—it's the ability to understand your system's behavior from the outside, asking arbitrary questions you didn't anticipate. Real observability starts simple: structured JSON logs, Prometheus-scraped metrics, and OpenTelemetry traces. This post builds observability from scratch: collecting metrics with Prometheus, storing logs with Loki in JSON format, tracing with Tempo, and correlating all three using trace IDs. No vendor lock-in, no complexity until you need it.

Prometheus Metrics Collection

Install Prometheus and scrape your application:

# prometheus/prometheus.yml
global:
  scrape_interval: 15s        # default scrape cadence for all jobs
  evaluation_interval: 15s    # how often recording/alerting rules run
  external_labels:            # attached to every series/alert leaving this server
    cluster: 'production'
    environment: 'prod'

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093  # Alertmanager default port

rule_files:
  - 'rules/*.yml'

scrape_configs:
  - job_name: 'prometheus'    # Prometheus self-monitoring
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'api-service'
    static_configs:
      - targets: ['localhost:8080']
    metrics_path: '/metrics'
    scrape_interval: 10s

  # Prometheus only scrapes HTTP /metrics endpoints — it cannot speak the
  # Postgres or Redis wire protocols, so scrape the exporters rather than the
  # databases' own ports (5432 / 6379 would never yield metrics).
  - job_name: 'database'
    static_configs:
      - targets: ['localhost:9187']   # postgres_exporter
    scrape_interval: 30s

  - job_name: 'redis'
    static_configs:
      - targets: ['localhost:9121']   # redis_exporter
    scrape_interval: 10s

Instrument your Node.js application:

// src/monitoring/metrics.ts
import { register, Counter, Histogram, Gauge } from 'prom-client';
import express from 'express';

// Request metrics
export const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status'],
  // Sub-millisecond through multi-second; align bucket edges with latency SLOs.
  buckets: [0.001, 0.01, 0.1, 0.5, 1, 2, 5],
});

export const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'route', 'status'],
});

// Business metrics
export const ordersTotal = new Counter({
  name: 'orders_total',
  help: 'Total orders processed',
  labelNames: ['status', 'currency'],
});

export const orderValue = new Histogram({
  name: 'order_value_usd',
  help: 'Order value in USD',
  labelNames: ['product_category'],
  buckets: [10, 50, 100, 500, 1000, 5000],
});

// Resource metrics
export const dbConnectionPoolSize = new Gauge({
  name: 'db_connection_pool_size',
  help: 'Number of database connections in pool',
  labelNames: ['pool_name'],
});

// Exported as a ratio in [0, 1]: the alert rules compute (1 - cache_hit_rate),
// which is only meaningful on a 0-1 scale. The previous help text said
// "percentage", contradicting how the alerts consume this gauge.
export const cacheHitRate = new Gauge({
  name: 'cache_hit_rate',
  help: 'Cache hit rate as a ratio (0-1)',
  labelNames: ['cache_name'],
});

// Middleware to record request metrics.
// NOTE: Express populates req.route only AFTER a route handler has matched,
// so the route label must be resolved inside the 'finish' listener. Reading
// it before next() (as the original did) always yields undefined, collapsing
// every request into the raw req.path and exploding label cardinality.
export function metricsMiddleware(
  req: express.Request,
  res: express.Response,
  next: express.NextFunction
) {
  const start = Date.now();

  res.on('finish', () => {
    // Prefer the route pattern ('/users/:id') over the concrete request path.
    const route = req.route?.path || req.path;
    const duration = (Date.now() - start) / 1000;
    const status = res.statusCode.toString();
    httpRequestDuration.labels(req.method, route, status).observe(duration);
    httpRequestsTotal.labels(req.method, route, status).inc();
  });

  next();
}

// Register the /metrics route that Prometheus scrapes. The registry's own
// content type is used so the exposition format version always matches.
export function setupMetricsEndpoint(app: express.Application) {
  app.get('/metrics', async (_req, res) => {
    res.set('Content-Type', register.contentType);
    const payload = await register.metrics();
    res.end(payload);
  });
}

// Example application usage: record business counters/histograms for an order.
// Counts the order as completed and records its value; if recording throws,
// a 'failed' order is counted instead and the error propagates.
export async function processOrder(orderId: string, amount: number) {
  const currency = 'USD';
  try {
    ordersTotal.labels('completed', currency).inc();
    orderValue.labels('electronics').observe(amount);
  } catch (err) {
    ordersTotal.labels('failed', currency).inc();
    throw err;
  }
}

Define alert rules:

# prometheus/rules/alerts.yml
groups:
  - name: api-alerts
    interval: 30s
    rules:
      # Fire when >5% of requests over the last 5 minutes returned a 5xx.
      - alert: HighErrorRate
        expr: |
          (sum(rate(http_requests_total{status=~"5.."}[5m])) /
          sum(rate(http_requests_total[5m]))) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'High error rate detected'
          description: 'Error rate is {{ $value | humanizePercentage }}'

      # p99 latency estimated from the app's histogram buckets; >500ms for 5m.
      - alert: HighLatency
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High latency detected'
          description: 'p99 latency is {{ $value }}s'

      # NOTE(review): db_connection_pool_used is not exported by the
      # instrumentation shown earlier (only db_connection_pool_size) — confirm
      # the metric exists, otherwise this expression has no data and never fires.
      - alert: DatabaseConnectionPoolExhausted
        expr: db_connection_pool_used / db_connection_pool_size > 0.9
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Database connection pool near capacity'

      # Assumes cache_hit_rate is a 0-1 ratio; if the producer exports a 0-100
      # percentage this expression is wrong — verify against the gauge.
      - alert: CacheMissRate
        expr: (1 - cache_hit_rate) > 0.3
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High cache miss rate ({{ $value | humanizePercentage }})'

Structured Logging with Pino

Use JSON logging for machine-readable logs:

// src/logging/logger.ts
import pino from 'pino';

// Production must emit raw JSON (that's what Loki ingests); pino-pretty is a
// dev-only convenience. The original configured the pino-pretty transport
// unconditionally, so production logs were pretty-printed text, not JSON,
// despite the comment claiming otherwise. Gate the transport on NODE_ENV.
const isProduction = process.env.NODE_ENV === 'production';

export const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Emit "level":"INFO" instead of pino's default numeric levels.
    level: (label) => {
      return { level: label.toUpperCase() };
    },
    bindings: (bindings) => {
      return {
        pid: bindings.pid,
        hostname: bindings.hostname,
      };
    },
  },
  timestamp: pino.stdTimeFunctions.isoTime,
  // Pretty-print in dev only; production writes plain JSON to stdout.
  ...(isProduction
    ? {}
    : {
        transport: {
          target: 'pino-pretty',
          options: {
            colorize: true,
            ignore: 'pid,hostname',
          },
        },
      }),
});

// Build a child logger bound with per-request context; every line logged
// through it carries these fields automatically.
// NOTE(review): req.id and req.user are not on Express's base Request type —
// presumably added by other middleware / type augmentation; confirm.
export function createRequestLogger(
  req: express.Request,
  traceId: string
) {
  const requestContext = {
    traceId,
    requestId: req.id,
    userId: req.user?.id,
    method: req.method,
    path: req.path,
    userAgent: req.get('user-agent'),
  };
  return logger.child(requestContext);
}

// Example log output (JSON in prod):
// {
//   "level": "INFO",
//   "time": "2026-03-15T10:30:00Z",
//   "traceId": "a1b2c3d4",
//   "requestId": "req-123",
//   "userId": "user-456",
//   "method": "POST",
//   "path": "/api/orders",
//   "msg": "Order created successfully",
//   "orderId": "order-789",
//   "amount": 99.99,
//   "processingTimeMs": 245
// }

Log business events and errors:

// Handle an order-creation request, logging structured business events.
// FIX: pino's signature is logger.info(mergeObject, message) — the object
// must come FIRST. The original passed the message first, so the context
// fields were treated as printf interpolation args and silently dropped
// from the JSON output instead of being merged into it.
export async function handleOrderRequest(
  req: express.Request,
  res: express.Response
) {
  const log = createRequestLogger(req, req.traceId);

  try {
    log.info(
      {
        itemCount: req.body.items.length,
        totalAmount: req.body.total,
      },
      'Processing order request'
    );

    const order = await createOrder(req.body);

    log.info(
      {
        orderId: order.id,
        status: order.status,
        amount: order.total,
        processingTimeMs: Date.now() - req.startTime,
      },
      'Order created successfully'
    );

    res.json(order);
  } catch (error) {
    // Narrow the unknown catch binding before reading Error fields.
    const err = error instanceof Error ? error : new Error(String(error));
    log.error(
      {
        error: err.message,
        stack: err.stack,
        errorCode: (err as NodeJS.ErrnoException).code,
        processingTimeMs: Date.now() - req.startTime,
      },
      'Order creation failed'
    );

    res.status(500).json({ error: 'Order creation failed' });
  }
}

Loki for Log Aggregation

Set up Loki to store and query logs:

# loki/loki-config.yml
# Single-binary Loki with local filesystem storage — suitable for a starter
# setup. Multi-tenancy is off and retention is disabled (logs kept forever).
auth_enabled: false

ingester:
  max_chunk_age: 2h        # force-flush chunks older than this
  chunk_idle_period: 3m    # flush chunks that stop receiving log lines

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h   # reject lines older than 7 days

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h        # one index table per day

server:
  http_listen_port: 3100   # Loki's default HTTP port
  log_level: info

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/index
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks

chunk_store_config:
  max_look_back_period: 0s   # 0 = unlimited query lookback

table_manager:
  retention_deletes_enabled: false   # retention disabled; enable before prod
  retention_period: 0s

Query logs in Loki with LogQL:

// observability/log-queries.ts
// LogQL queries for Grafana Loki. NOTE: the original used `stats count() by`
// and `bin(1h)`, which are CloudWatch Logs Insights syntax — LogQL does not
// support them. Aggregations in LogQL are metric queries built from range
// functions such as count_over_time() wrapped in sum by (...).
export const LOG_QUERIES = {
  // Find all errors in production
  productionErrors: `{environment="production"} | json | level="ERROR"`,

  // Find requests for a specific user
  userRequests: (userId: string) =>
    `{environment="production"} | json | userId="${userId}"`,

  // Find slow requests (label filter on a parsed numeric field)
  slowRequests: `{environment="production"} | json | processingTimeMs > 1000`,

  // Error counts per error_code for one service, over 5-minute windows
  serviceErrors: (service: string) =>
    `sum by (error_code) (count_over_time({service="${service}"} | json | level="ERROR" [5m]))`,

  // Database query performance
  dbSlowQueries: `{environment="production"} | json | query_type="database" | duration_ms > 500`,

  // Orders per hour
  orderVolume: `sum(count_over_time({environment="production"} | json | event="order_created" [1h]))`,
};

// Log sampling for high-volume events: each call is emitted independently
// with probability `samplingRate` (default 10%).
// FIX: pino expects the merge object FIRST (logger.info(obj, msg)). Passing
// the message first made `context` a printf interpolation argument, so its
// fields were dropped from the structured JSON output.
export const logWithSampling = (
  logger: pino.Logger,
  message: string,
  context: any,
  samplingRate: number = 0.1 // Log 10% of events
) => {
  if (Math.random() < samplingRate) {
    logger.info(context, message);
  }
};

OpenTelemetry Traces with Tempo

Install and configure OpenTelemetry:

// src/tracing/instrumentation.ts
import {
  NodeSDK,
  node,
} from '@opentelemetry/auto-instrumentations-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics';
import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-http';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';

const sdk = new NodeSDK({
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'api-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: '1.0.0',
    environment: process.env.NODE_ENV || 'development',
  }),
  traceExporter: new OTLPTraceExporter({
    url: 'http://tempo:4318/v1/traces',
  }),
  metricReader: new PeriodicExportingMetricReader({
    exporter: new OTLPMetricExporter({
      url: 'http://tempo:4318/v1/metrics',
    }),
  }),
  instrumentations: [getNodeAutoInstrumentations()],
});

sdk.start();

Create custom spans for business operations:

// src/tracing/spans.ts
import { trace, context, SpanStatusCode } from '@opentelemetry/api';

const tracer = trace.getTracer('api-service');

/**
 * Run `fn` inside a fresh child span. On failure the exception is recorded,
 * the span is marked ERROR and ended, and the error is rethrown; on success
 * the span is simply ended. Factoring this out removes the repeated
 * try/catch boilerplate, and fixes a bug in the original DB branch where the
 * root span was ended and had the exception recorded TWICE (once by the
 * inner catch, once again by the outer one).
 */
async function withChildSpan<T>(
  name: string,
  attributes: Record<string, string | number>,
  fn: (span: ReturnType<typeof tracer.startSpan>) => Promise<T>
): Promise<T> {
  const span = tracer.startSpan(name, { attributes });
  try {
    const result = await fn(span);
    span.end();
    return result;
  } catch (error) {
    span.recordException(error as Error);
    span.setStatus({ code: SpanStatusCode.ERROR });
    span.end();
    throw error;
  }
}

/**
 * Create an order inside a 'create_order' root span, with child spans for
 * validation, payment, and persistence. Any failure marks the root span ERROR
 * and rethrows to the caller.
 */
export async function createOrderWithTracing(orderData: any) {
  const span = tracer.startSpan('create_order', {
    attributes: {
      'order.item_count': orderData.items.length,
      'order.total_amount': orderData.total,
      'order.customer_id': orderData.customerId,
    },
  });

  // Make `span` the active span so the child spans below parent under it.
  return context.with(trace.setSpan(context.active(), span), async () => {
    try {
      // Validate input
      await withChildSpan('validate_order', {}, () => validateOrder(orderData));

      // Process payment
      await withChildSpan(
        'process_payment',
        {
          'payment.method': orderData.paymentMethod,
          'payment.amount': orderData.total,
        },
        async (paymentSpan) => {
          const payment = await processPayment(orderData);
          paymentSpan.setAttributes({
            'payment.transaction_id': payment.transactionId,
            'payment.processing_time_ms': payment.processingTime,
          });
          return payment;
        }
      );

      // Save to database
      const order = await withChildSpan(
        'save_order_to_db',
        {
          'db.operation': 'insert',
          'db.table': 'orders',
        },
        async (dbSpan) => {
          const saved = await saveOrder(orderData);
          dbSpan.setAttributes({ 'db.rows_affected': 1 });
          return saved;
        }
      );

      span.end();
      return order;
    } catch (error) {
      span.recordException(error as Error);
      span.setStatus({ code: SpanStatusCode.ERROR });
      span.end();
      throw error;
    }
  });
}

Correlating Traces, Logs, and Metrics

Use trace IDs to correlate all signals:

// src/middleware/trace-correlation.ts
import { trace, context } from '@opentelemetry/api';
import express from 'express';
import { v4 as uuidv4 } from 'uuid';

/**
 * Attach a trace ID / span ID to the request so logs, metrics, and traces can
 * be correlated, and echo them back to the caller via response headers.
 *
 * FIX: when no span was active the original started a fallback span via
 * startSpan('http') and never ended it — leaking one open span per request.
 * We now reuse the active span when instrumentation already created one, and
 * end the fallback span when the response finishes.
 *
 * NOTE(review): the uuid fallback trace ID is not the same value as the OTel
 * span context's traceId, so header-less requests can carry two different
 * IDs — confirm whether logs should use spanContext.traceId instead.
 */
export function traceCorrelationMiddleware(
  req: express.Request,
  res: express.Response,
  next: express.NextFunction
) {
  // Get or generate trace ID (W3C traceparent: version-traceid-spanid-flags).
  const traceId =
    req.get('x-trace-id') || req.get('traceparent')?.split('-')[1] || uuidv4();

  // Reuse the active span when auto-instrumentation already started one.
  const activeSpan = trace.getActiveSpan();
  const span = activeSpan ?? trace.getTracer('express').startSpan('http');
  const spanContext = span.spanContext();

  // Attach to request for logging
  req.traceId = traceId;
  req.spanId = spanContext.spanId;

  // Set response headers so clients can reference this request.
  res.set('x-trace-id', traceId);
  res.set('x-span-id', spanContext.spanId);

  // We own the fallback span; close it with the response to avoid a leak.
  if (!activeSpan) {
    res.on('finish', () => span.end());
  }

  // All logs in this request will include the trace ID
  const childContext = trace.setSpan(context.active(), span);
  context.with(childContext, () => {
    next();
  });
}

// Augment Express's Request type with the correlation IDs that
// traceCorrelationMiddleware assigns, so req.traceId / req.spanId type-check.
declare global {
  namespace Express {
    interface Request {
      // Set per-request by traceCorrelationMiddleware.
      traceId: string;
      spanId: string;
    }
  }
}

// Return a child logger pre-bound with this request's correlation IDs so
// every subsequent log line carries traceId/spanId automatically.
export function enhanceLogContext(logger: pino.Logger, req: express.Request) {
  const correlation = {
    traceId: req.traceId,
    spanId: req.spanId,
  };
  return logger.child(correlation);
}

Alert Rules from SLI Metrics

Define alerts based on SLIs (Service Level Indicators):

# prometheus/rules/sli-alerts.yml
# Alerts derived directly from the SLIs, so paging thresholds track the
# service's stated objectives rather than arbitrary numbers.
groups:
  - name: sli-alerts
    interval: 1m
    rules:
      # SLI: 99% of requests complete in under 500ms
      - alert: SLILatencyViolation
        expr: |
          histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 0.5
        for: 5m
        labels:
          severity: critical
          sli: 'latency'
        annotations:
          summary: 'SLI violation: Latency p99 > 500ms'

      # SLI: 99.5% of requests succeed (< 0.5% error rate)
      - alert: SLIAvailabilityViolation
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) /
          sum(rate(http_requests_total[5m])) > 0.005
        for: 5m
        labels:
          severity: critical
          sli: 'availability'
        annotations:
          summary: 'SLI violation: Error rate > 0.5%'

      # SLI: 99.9% of database queries complete in under 100ms
      # NOTE(review): db_query_duration_seconds_bucket is not produced by the
      # instrumentation shown in this post — confirm the histogram exists.
      - alert: SLIDatabaseLatencyViolation
        expr: |
          histogram_quantile(
            0.999,
            rate(db_query_duration_seconds_bucket[5m])
          ) > 0.1
        for: 5m
        labels:
          severity: warning
          sli: 'database-latency'
        annotations:
          summary: 'Database query latency p999 > 100ms'

Grafana Dashboard Setup

Create a Grafana dashboard that ties everything together:

{
  "dashboard": {
    "title": "API Service Health",
    "panels": [
      {
        "id": 1,
        "title": "Request Rate",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (method)"
          }
        ],
        "type": "graph"
      },
      {
        "id": 2,
        "title": "Error Rate",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~'5..'}[5m])) / sum(rate(http_requests_total[5m]))"
          }
        ]
      },
      {
        "id": 3,
        "title": "Latency p99",
        "targets": [
          {
            "expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))"
          }
        ]
      },
      {
        "id": 4,
        "title": "Recent Errors",
        "targets": [
          {
            "expr": "{environment='production'} | json | level='ERROR'"
          }
        ],
        "type": "logs"
      },
      {
        "id": 5,
        "title": "Trace Analysis",
        "type": "traces",
        "description": "Click on a trace to view full request flow"
      }
    ]
  }
}

Checklist

  • Install Prometheus and configure scrape targets
  • Instrument application with prom-client
  • Set up structured JSON logging with Pino
  • Deploy Loki for log aggregation
  • Install and configure OpenTelemetry SDK
  • Create custom spans for business operations
  • Add trace ID to logs and metrics
  • Define SLI-based alerts
  • Create Grafana dashboards
  • Set up correlation between traces, logs, and metrics

Conclusion

Start with the three pillars: Prometheus for metrics (performance), Loki for logs (debugging), Tempo for traces (understanding flow). Correlate them with trace IDs. Define alerts from SLI metrics, not arbitrary thresholds. Build dashboards that answer the questions you ask at 2 AM. When you can drill from a Grafana alert to the relevant logs and traces, you have observability. When you understand your system's behavior from the outside, you're unstoppable.