Database Monitoring¶
Track database health, performance, and resource usage.
Key Metrics to Monitor¶
| Category | Metrics | Why |
|---|---|---|
| Connections | Active, idle, total | Pool exhaustion, leaks |
| Queries | Slow queries, query count | Performance issues |
| Locks | Lock waits, deadlocks | Contention problems |
| Replication | Lag, state | Data consistency |
| Storage | Table size, bloat | Capacity planning |
| Cache | Hit ratio | Memory efficiency |
pg_stat Views¶
Connection Statistics¶
-- Active connections by state
SELECT
state,
COUNT(*) as connections,
COUNT(*) FILTER (WHERE wait_event IS NOT NULL) as waiting
FROM pg_stat_activity
WHERE datname = 'sartiq'
GROUP BY state;
-- Connection details
SELECT
pid,
usename,
application_name,
client_addr,
state,
wait_event_type,
wait_event,
NOW() - query_start as query_duration,
LEFT(query, 100) as query_preview
FROM pg_stat_activity
WHERE datname = 'sartiq'
AND pid != pg_backend_pid()
ORDER BY query_start;
-- Long-running queries
SELECT
pid,
NOW() - query_start as duration,
state,
query
FROM pg_stat_activity
WHERE datname = 'sartiq'
AND state != 'idle'
AND NOW() - query_start > INTERVAL '1 minute';
Query Statistics (pg_stat_statements)¶
-- Enable extension
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- Top queries by total time
SELECT
LEFT(query, 80) as query,
calls,
ROUND(total_exec_time::numeric, 2) as total_ms,
ROUND(mean_exec_time::numeric, 2) as mean_ms,
rows
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 20;
-- Queries with high variability
SELECT
LEFT(query, 80) as query,
calls,
ROUND(mean_exec_time::numeric, 2) as mean_ms,
ROUND(stddev_exec_time::numeric, 2) as stddev_ms
FROM pg_stat_statements
WHERE calls > 100
ORDER BY stddev_exec_time DESC
LIMIT 10;
-- Cache hit ratio by query
SELECT
LEFT(query, 60) as query,
calls,
ROUND(100.0 * shared_blks_hit / NULLIF(shared_blks_hit + shared_blks_read, 0), 2) as cache_hit_pct
FROM pg_stat_statements
WHERE calls > 100
ORDER BY shared_blks_read DESC
LIMIT 20;
Table Statistics¶
-- Table sizes and row counts
SELECT
schemaname,
relname as table_name,
n_live_tup as row_count,
pg_size_pretty(pg_total_relation_size(relid)) as total_size,
pg_size_pretty(pg_relation_size(relid)) as table_size,
pg_size_pretty(pg_indexes_size(relid)) as index_size
FROM pg_stat_user_tables
ORDER BY pg_total_relation_size(relid) DESC;
-- Sequential vs index scans
SELECT
relname,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch,
ROUND(100.0 * idx_scan / NULLIF(seq_scan + idx_scan, 0), 2) as idx_scan_pct
FROM pg_stat_user_tables
WHERE seq_scan + idx_scan > 100
ORDER BY seq_scan DESC;
-- Tables needing vacuum
SELECT
schemaname,
relname,
n_live_tup,
n_dead_tup,
ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) as dead_pct,
last_autovacuum,
last_autoanalyze
FROM pg_stat_user_tables
WHERE n_dead_tup > 1000
ORDER BY n_dead_tup DESC;
Index Statistics¶
-- Index usage
SELECT
schemaname,
relname as table_name,
indexrelname as index_name,
idx_scan as scans,
pg_size_pretty(pg_relation_size(indexrelid)) as size
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;
-- Unused indexes
SELECT
schemaname,
relname as table_name,
indexrelname as index_name,
pg_size_pretty(pg_relation_size(indexrelid)) as size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND indexrelname NOT LIKE '%_pkey'
ORDER BY pg_relation_size(indexrelid) DESC;
-- Index bloat estimate
SELECT
schemaname,
relname,
indexrelname,
pg_size_pretty(pg_relation_size(indexrelid)) as index_size,
ROUND(100.0 * pg_stat_get_dead_tuples(indexrelid) /
NULLIF(pg_stat_get_live_tuples(indexrelid), 0), 2) as bloat_pct
FROM pg_stat_user_indexes
WHERE pg_relation_size(indexrelid) > 1024 * 1024; -- > 1MB
Lock Statistics¶
-- Current locks
SELECT
l.pid,
l.locktype,
l.mode,
l.granted,
a.state,
a.query
FROM pg_locks l
JOIN pg_stat_activity a ON l.pid = a.pid
WHERE NOT l.granted
ORDER BY l.pid;
-- Lock waits
SELECT
blocked.pid as blocked_pid,
blocked.query as blocked_query,
blocking.pid as blocking_pid,
blocking.query as blocking_query
FROM pg_stat_activity blocked
JOIN pg_locks blocked_locks ON blocked.pid = blocked_locks.pid
JOIN pg_locks blocking_locks ON blocked_locks.locktype = blocking_locks.locktype
AND blocked_locks.database IS NOT DISTINCT FROM blocking_locks.database
AND blocked_locks.relation IS NOT DISTINCT FROM blocking_locks.relation
AND blocked_locks.page IS NOT DISTINCT FROM blocking_locks.page
AND blocked_locks.tuple IS NOT DISTINCT FROM blocking_locks.tuple
AND blocked_locks.virtualxid IS NOT DISTINCT FROM blocking_locks.virtualxid
AND blocked_locks.transactionid IS NOT DISTINCT FROM blocking_locks.transactionid
AND blocked_locks.classid IS NOT DISTINCT FROM blocking_locks.classid
AND blocked_locks.objid IS NOT DISTINCT FROM blocking_locks.objid
AND blocked_locks.objsubid IS NOT DISTINCT FROM blocking_locks.objsubid
AND blocked_locks.pid != blocking_locks.pid
JOIN pg_stat_activity blocking ON blocking_locks.pid = blocking.pid
WHERE NOT blocked_locks.granted;
Replication Statistics¶
-- Replication status
SELECT
client_addr,
state,
sent_lsn,
write_lsn,
flush_lsn,
replay_lsn,
pg_wal_lsn_diff(sent_lsn, replay_lsn) as lag_bytes,
sync_state
FROM pg_stat_replication;
-- Replication lag in seconds
SELECT
client_addr,
EXTRACT(EPOCH FROM (NOW() - pg_last_xact_replay_timestamp())) as lag_seconds
FROM pg_stat_replication;
Cache Hit Ratio¶
-- Database cache hit ratio
SELECT
datname,
ROUND(100.0 * blks_hit / NULLIF(blks_hit + blks_read, 0), 2) as cache_hit_ratio
FROM pg_stat_database
WHERE datname = 'sartiq';
-- Table cache hit ratio
SELECT
schemaname,
relname,
ROUND(100.0 * heap_blks_hit / NULLIF(heap_blks_hit + heap_blks_read, 0), 2) as heap_hit_ratio,
ROUND(100.0 * idx_blks_hit / NULLIF(idx_blks_hit + idx_blks_read, 0), 2) as idx_hit_ratio
FROM pg_statio_user_tables
WHERE heap_blks_hit + heap_blks_read > 100
ORDER BY heap_blks_read DESC;
Alerting¶
Critical Alerts¶
-- Long-running queries (> 5 minutes)
SELECT COUNT(*) > 0 as alert
FROM pg_stat_activity
WHERE state = 'active'
AND NOW() - query_start > INTERVAL '5 minutes';
-- Connection count > 80% of max
SELECT
COUNT(*) > (0.8 * current_setting('max_connections')::int) as alert
FROM pg_stat_activity;
-- Replication lag > 1 minute
SELECT
EXISTS(
SELECT 1 FROM pg_stat_replication
WHERE pg_wal_lsn_diff(sent_lsn, replay_lsn) > 16 * 1024 * 1024 -- ~1min of WAL
) as alert;
-- Deadlocks detected
SELECT
deadlocks > 0 as alert
FROM pg_stat_database
WHERE datname = 'sartiq'
AND stats_reset > NOW() - INTERVAL '1 hour';
Warning Alerts¶
-- Cache hit ratio < 95%
SELECT
ROUND(100.0 * blks_hit / NULLIF(blks_hit + blks_read, 0), 2) < 95 as alert
FROM pg_stat_database
WHERE datname = 'sartiq';
-- Tables with > 10% dead tuples
SELECT COUNT(*) > 0 as alert
FROM pg_stat_user_tables
WHERE n_dead_tup > 10000
AND n_dead_tup > 0.1 * n_live_tup;
-- Unused indexes > 100MB
SELECT COUNT(*) > 0 as alert
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND pg_relation_size(indexrelid) > 100 * 1024 * 1024;
Monitoring Dashboards¶
Grafana + Prometheus¶
# prometheus/postgres_exporter queries
pg_stat_activity_count:
query: "SELECT state, COUNT(*) FROM pg_stat_activity GROUP BY state"
metrics:
- state:
usage: "LABEL"
- count:
usage: "GAUGE"
pg_stat_statements_total_time:
query: |
SELECT
LEFT(query, 50) as query,
calls,
total_exec_time
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 20
Key Dashboard Panels¶
- Connection Pool
- Active connections
- Idle connections
-
Connection wait time
-
Query Performance
- Queries per second
- Average query time
-
Slow query count
-
Cache Performance
- Buffer cache hit ratio
- Index cache hit ratio
-
Disk reads per second
-
Replication
- Replication lag (bytes)
- Replication lag (seconds)
-
Replica states
-
Storage
- Database size
- Table bloat
- WAL generation rate
Logging¶
PostgreSQL Logging Config¶
# postgresql.conf
# Log slow queries
log_min_duration_statement = 100 # Log queries > 100ms
# Log connections
log_connections = on
log_disconnections = on
# Log locks
log_lock_waits = on
deadlock_timeout = 1s
# Log checkpoints
log_checkpoints = on
# Log format
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
Query Logging¶
# Log all queries (development only!)
log_statement = 'all'
# Log only DDL
log_statement = 'ddl'
# Log queries with errors
log_min_error_statement = error
Health Check Endpoint¶
from fastapi import APIRouter
from sqlalchemy import text
router = APIRouter()
@router.get("/health/db")
async def database_health(db: AsyncSession = Depends(get_db)):
"""Database health check endpoint."""
try:
# Basic connectivity
await db.execute(text("SELECT 1"))
# Connection pool status
pool = engine.pool
# Query performance (last hour)
result = await db.execute(text("""
SELECT
COUNT(*) as slow_queries
FROM pg_stat_statements
WHERE mean_exec_time > 1000 -- > 1 second
"""))
slow_queries = result.scalar()
return {
"status": "healthy",
"pool": {
"size": pool.size(),
"checkedin": pool.checkedin(),
"checkedout": pool.checkedout(),
},
"slow_queries_count": slow_queries,
}
except Exception as e:
return {
"status": "unhealthy",
"error": str(e),
}
Best Practices¶
- Enable pg_stat_statements — Essential for query analysis
- Log slow queries — Set
log_min_duration_statement - Monitor connections — Alert before exhaustion
- Track replication lag — Critical for read replicas
- Watch cache hit ratios — Should be > 95%
- Review unused indexes — Remove to improve write performance
- Monitor vacuum activity — Prevent bloat
- Set up alerting — Don't wait for user reports