Metrics¶
Collect and visualize application metrics with Prometheus.
Metric Types¶
| Type | Use Case | Example |
|---|---|---|
| Counter | Cumulative count | Total requests |
| Gauge | Current value | Active connections |
| Histogram | Distribution | Response times |
| Summary | Distribution with quantiles | Request duration |
Setting Up Prometheus Metrics¶
Installation¶
pip install prometheus-client
Basic Setup¶
from prometheus_client import Counter, Gauge, Histogram, generate_latest, CONTENT_TYPE_LATEST
from fastapi import FastAPI, Response
app = FastAPI()

# Define metrics

# Counter of HTTP requests, labelled by method, endpoint and status.
REQUEST_COUNT = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=['method', 'endpoint', 'status'],
)

# Histogram of request latency per endpoint, with explicit bucket bounds.
REQUEST_LATENCY = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request latency',
    labelnames=['endpoint'],
    buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
)

# Gauge of requests currently being handled, per endpoint.
ACTIVE_REQUESTS = Gauge(
    name='http_requests_in_progress',
    documentation='HTTP requests currently in progress',
    labelnames=['endpoint'],
)
# Metrics endpoint
@app.get("/metrics")
async def metrics():
    """Return the output of ``generate_latest()`` with the ``CONTENT_TYPE_LATEST`` media type."""
    payload = generate_latest()
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)
Middleware for Request Metrics¶
import time
from fastapi import Request
@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Record the count, latency and in-progress gauge for every HTTP request.

    Args:
        request: Incoming request; its URL path and method become label values.
        call_next: Callable that forwards the request and returns the response.

    Returns:
        The response produced by ``call_next``, unmodified.

    Raises:
        Whatever ``call_next`` raises. The request is still counted, with
        status 500, before the exception propagates.
    """
    # NOTE: the raw URL path is used as the label value, so paths that embed
    # identifiers (e.g. /users/123) produce a separate label value per path.
    endpoint = request.url.path
    method = request.method

    # Default to 500 so the label is always defined in the ``finally`` block,
    # including when call_next is interrupted by something that is not an
    # ``Exception`` subclass (e.g. asyncio.CancelledError).
    status = 500

    ACTIVE_REQUESTS.labels(endpoint=endpoint).inc()
    # perf_counter() is monotonic; time.time() can jump when the system clock
    # is adjusted and would then report a wrong (even negative) duration.
    start_time = time.perf_counter()
    try:
        response = await call_next(request)
        status = response.status_code
    finally:
        duration = time.perf_counter() - start_time
        ACTIVE_REQUESTS.labels(endpoint=endpoint).dec()
        REQUEST_COUNT.labels(method=method, endpoint=endpoint, status=status).inc()
        REQUEST_LATENCY.labels(endpoint=endpoint).observe(duration)
    return response
Custom Business Metrics¶
# Business metrics
# Counter of created orders, labelled by payment method.
ORDERS_CREATED = Counter(
    name='orders_created_total',
    documentation='Total orders created',
    labelnames=['payment_method'],
)

# Histogram of order totals with explicit bucket bounds.
ORDER_TOTAL = Histogram(
    name='order_total_dollars',
    documentation='Order total in dollars',
    buckets=[10, 25, 50, 100, 250, 500, 1000],
)

# Gauge of users that are currently active.
ACTIVE_USERS = Gauge(
    name='active_users',
    documentation='Currently active users',
)
# Usage in code
@app.post("/orders")
async def create_order(order: OrderCreate):
    """Record order metrics for ``order`` and return it unchanged.

    Increments ``ORDERS_CREATED`` for the order's payment method and observes
    the order total in ``ORDER_TOTAL``.
    """
    # Create order...
    created_counter = ORDERS_CREATED.labels(payment_method=order.payment_method)
    created_counter.inc()
    ORDER_TOTAL.observe(order.total)
    return order
# Track active users
async def user_connected(user_id: int):
    """Increase the ``ACTIVE_USERS`` gauge by one; ``user_id`` is not used."""
    ACTIVE_USERS.inc(1)
async def user_disconnected(user_id: int):
    """Decrease the ``ACTIVE_USERS`` gauge by one; ``user_id`` is not used."""
    ACTIVE_USERS.dec(1)
Database Metrics¶
from sqlalchemy import event
# Histogram of query durations, labelled by operation.
DB_QUERY_DURATION = Histogram(
    name='db_query_duration_seconds',
    documentation='Database query duration',
    labelnames=['operation'],
)

# Gauge of database connections, labelled by state.
DB_CONNECTIONS = Gauge(
    name='db_connections',
    documentation='Database connections',
    labelnames=['state'],
)
# SQLAlchemy event listeners
@event.listens_for(engine.sync_engine, "before_cursor_execute")
def before_cursor_execute(conn, cursor, statement, parameters, context, executemany):
    """Store the start time of the statement about to run in ``conn.info``."""
    # perf_counter() is monotonic, so the measured duration is not affected by
    # system clock adjustments the way time.time() would be.
    conn.info['query_start'] = time.perf_counter()


@event.listens_for(engine.sync_engine, "after_cursor_execute")
def after_cursor_execute(conn, cursor, statement, parameters, context, executemany):
    """Observe the duration of the finished statement in ``DB_QUERY_DURATION``.

    The ``operation`` label is the upper-cased first token of the statement
    (SELECT, INSERT, etc.), or ``'UNKNOWN'`` when the statement has no tokens.
    """
    # pop() clears the slot and tolerates a missing start time (e.g. when no
    # start was stored for this statement) instead of raising KeyError.
    started = conn.info.pop('query_start', None)
    if started is None:
        return
    duration = time.perf_counter() - started

    # An empty or whitespace-only statement has no first token to label with.
    tokens = statement.split()
    operation = tokens[0].upper() if tokens else 'UNKNOWN'
    DB_QUERY_DURATION.labels(operation=operation).observe(duration)
# Track connection pool
def update_pool_metrics():
    """Set ``DB_CONNECTIONS`` from the engine pool's checked-out and checked-in counts."""
    pool = engine.pool
    # Each state label is paired with the pool method that reports its count.
    readers = (
        ('active', pool.checkedout),
        ('idle', pool.checkedin),
    )
    for state, read_count in readers:
        DB_CONNECTIONS.labels(state=state).set(read_count())
Cache Metrics¶
# Hit and miss counters, each labelled by cache name.
CACHE_HITS = Counter(
    name='cache_hits_total',
    documentation='Cache hits',
    labelnames=['cache'],
)
CACHE_MISSES = Counter(
    name='cache_misses_total',
    documentation='Cache misses',
    labelnames=['cache'],
)
async def get_cached(key: str, fetch_fn):
    """Return the cached value for ``key``, fetching and storing it on a miss.

    A cached value of ``None`` is treated as a miss. Hits and misses are
    counted under the ``'redis'`` cache label.
    """
    cached = await cache.get(key)
    if cached is None:
        CACHE_MISSES.labels(cache='redis').inc()
        fresh = await fetch_fn()
        await cache.set(key, fresh)
        return fresh

    CACHE_HITS.labels(cache='redis').inc()
    return cached
External API Metrics¶
# Counter of outbound API requests, labelled by service, endpoint and status.
EXTERNAL_API_REQUESTS = Counter(
    name='external_api_requests_total',
    documentation='External API requests',
    labelnames=['service', 'endpoint', 'status'],
)

# Histogram of outbound API latency, labelled by service.
EXTERNAL_API_LATENCY = Histogram(
    name='external_api_duration_seconds',
    documentation='External API latency',
    labelnames=['service'],
)
async def call_external_api(service: str, endpoint: str, **kwargs):
    """Send a GET request to an external service and record request metrics.

    Args:
        service: Label value identifying the external service.
        endpoint: URL to request; also recorded as the ``endpoint`` label.
        **kwargs: Forwarded to ``httpx.AsyncClient.get`` (for example
            ``params``, ``headers`` or ``timeout``).

    Returns:
        The ``httpx.Response`` for the request.

    Raises:
        Any exception raised while sending the request. The call is still
        counted, with status ``'error'``, before the exception propagates.
    """
    # Default covers every failure path, including cancellation, so the label
    # is always defined when the ``finally`` block runs.
    status = 'error'
    # perf_counter() is monotonic and therefore safe for measuring durations.
    start = time.perf_counter()
    try:
        # httpx.get() is synchronous and its result cannot be awaited; async
        # requests have to go through AsyncClient. A client is created per call
        # here for simplicity; reuse a shared client where connection pooling
        # matters.
        async with httpx.AsyncClient() as client:
            response = await client.get(endpoint, **kwargs)
        status = response.status_code
        return response
    finally:
        EXTERNAL_API_REQUESTS.labels(
            service=service,
            endpoint=endpoint,
            status=status,
        ).inc()
        EXTERNAL_API_LATENCY.labels(service=service).observe(time.perf_counter() - start)
Prometheus Configuration¶
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # The application itself, exposing the /metrics endpoint.
  - job_name: 'api'
    static_configs:
      - targets: ['api:8000']
    metrics_path: /metrics

  # PostgreSQL metrics via a separate exporter target.
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
Grafana Dashboards¶
Essential Panels¶
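Request Rate: sum(rate(http_requests_total[5m])) by (endpoint)
Error Rate: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100
Latency P95: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
Requests in Progress: sum(http_requests_in_progress) by (endpoint)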
Dashboard JSON Example¶
{
"title": "API Dashboard",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total[5m])) by (endpoint)",
"legendFormat": "{{endpoint}}"
}
]
},
{
"title": "Error Rate",
"type": "stat",
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100"
}
],
"thresholds": [
{"value": 0, "color": "green"},
{"value": 1, "color": "yellow"},
{"value": 5, "color": "red"}
]
},
{
"title": "Latency",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "p95"
},
{
"expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "p99"
}
]
}
]
}
Best Practices¶
- Use labels wisely — High cardinality kills performance
- Choose right buckets — Match your SLOs
- Name consistently — `noun_verb_unit` pattern
- Include units — `_seconds`, `_bytes`, `_total`
- Document metrics — Help text is important
- Set up alerts — Metrics are useless without alerting
Common Pitfalls¶
# Bad: High cardinality label (user IDs as label values; high cardinality
# kills performance, see Best Practices)
REQUEST_COUNT.labels(user_id=user.id) # Don't!
# Good: Use broader categories (a user tier instead of an individual user ID)
REQUEST_COUNT.labels(user_tier=user.tier)
# Bad: Counter for current value (a Counter is a cumulative count, see the
# Metric Types table)
current_users = Counter('current_users', ...) # Don't!
# Good: Gauge for current value (a Gauge holds the current value)
current_users = Gauge('current_users', ...)
See Also¶
- Alerting -- Setting up alert rules based on the metrics collected here
- Database Monitoring -- PostgreSQL-specific metrics, pg_stat views, and connection pool tracking