Skip to content

Database Monitoring

Track database health, performance, and resource usage.

Key Metrics to Monitor

Category Metrics Why
Connections Active, idle, total Pool exhaustion, leaks
Queries Slow queries, query count Performance issues
Locks Lock waits, deadlocks Contention problems
Replication Lag, state Data consistency
Storage Table size, bloat Capacity planning
Cache Hit ratio Memory efficiency

pg_stat Views

Connection Statistics

-- Active connections by state
SELECT
    state,
    COUNT(*) as connections,
    COUNT(*) FILTER (WHERE wait_event IS NOT NULL) as waiting
FROM pg_stat_activity
WHERE datname = 'sartiq'
GROUP BY state;

-- Connection details
SELECT
    pid,
    usename,
    application_name,
    client_addr,
    state,
    wait_event_type,
    wait_event,
    NOW() - query_start as query_duration,
    LEFT(query, 100) as query_preview
FROM pg_stat_activity
WHERE datname = 'sartiq'
  AND pid != pg_backend_pid()
ORDER BY query_start;

-- Long-running queries
SELECT
    pid,
    NOW() - query_start as duration,
    state,
    query
FROM pg_stat_activity
WHERE datname = 'sartiq'
  AND state != 'idle'
  AND NOW() - query_start > INTERVAL '1 minute';

Query Statistics (pg_stat_statements)

-- Enable extension
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;

-- Top queries by total time
SELECT
    LEFT(query, 80) as query,
    calls,
    ROUND(total_exec_time::numeric, 2) as total_ms,
    ROUND(mean_exec_time::numeric, 2) as mean_ms,
    rows
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 20;

-- Queries with high variability
SELECT
    LEFT(query, 80) as query,
    calls,
    ROUND(mean_exec_time::numeric, 2) as mean_ms,
    ROUND(stddev_exec_time::numeric, 2) as stddev_ms
FROM pg_stat_statements
WHERE calls > 100
ORDER BY stddev_exec_time DESC
LIMIT 10;

-- Cache hit ratio by query
SELECT
    LEFT(query, 60) as query,
    calls,
    ROUND(100.0 * shared_blks_hit / NULLIF(shared_blks_hit + shared_blks_read, 0), 2) as cache_hit_pct
FROM pg_stat_statements
WHERE calls > 100
ORDER BY shared_blks_read DESC
LIMIT 20;

Table Statistics

-- Table sizes and row counts
SELECT
    schemaname,
    relname as table_name,
    n_live_tup as row_count,
    pg_size_pretty(pg_total_relation_size(relid)) as total_size,
    pg_size_pretty(pg_relation_size(relid)) as table_size,
    pg_size_pretty(pg_indexes_size(relid)) as index_size
FROM pg_stat_user_tables
ORDER BY pg_total_relation_size(relid) DESC;

-- Sequential vs index scans
SELECT
    relname,
    seq_scan,
    seq_tup_read,
    idx_scan,
    idx_tup_fetch,
    ROUND(100.0 * idx_scan / NULLIF(seq_scan + idx_scan, 0), 2) as idx_scan_pct
FROM pg_stat_user_tables
WHERE seq_scan + idx_scan > 100
ORDER BY seq_scan DESC;

-- Tables needing vacuum
SELECT
    schemaname,
    relname,
    n_live_tup,
    n_dead_tup,
    ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) as dead_pct,
    last_autovacuum,
    last_autoanalyze
FROM pg_stat_user_tables
WHERE n_dead_tup > 1000
ORDER BY n_dead_tup DESC;

Index Statistics

-- Index usage
SELECT
    schemaname,
    relname as table_name,
    indexrelname as index_name,
    idx_scan as scans,
    pg_size_pretty(pg_relation_size(indexrelid)) as size
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;

-- Unused indexes
SELECT
    schemaname,
    relname as table_name,
    indexrelname as index_name,
    pg_size_pretty(pg_relation_size(indexrelid)) as size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
  AND indexrelname NOT LIKE '%_pkey'
ORDER BY pg_relation_size(indexrelid) DESC;

-- Index bloat estimate
SELECT
    schemaname,
    relname,
    indexrelname,
    pg_size_pretty(pg_relation_size(indexrelid)) as index_size,
    ROUND(100.0 * pg_stat_get_dead_tuples(indexrelid) /
          NULLIF(pg_stat_get_live_tuples(indexrelid), 0), 2) as bloat_pct
FROM pg_stat_user_indexes
WHERE pg_relation_size(indexrelid) > 1024 * 1024;  -- > 1MB

Lock Statistics

-- Current locks
SELECT
    l.pid,
    l.locktype,
    l.mode,
    l.granted,
    a.state,
    a.query
FROM pg_locks l
JOIN pg_stat_activity a ON l.pid = a.pid
WHERE NOT l.granted
ORDER BY l.pid;

-- Lock waits
SELECT
    blocked.pid as blocked_pid,
    blocked.query as blocked_query,
    blocking.pid as blocking_pid,
    blocking.query as blocking_query
FROM pg_stat_activity blocked
JOIN pg_locks blocked_locks ON blocked.pid = blocked_locks.pid
JOIN pg_locks blocking_locks ON blocked_locks.locktype = blocking_locks.locktype
    AND blocked_locks.database IS NOT DISTINCT FROM blocking_locks.database
    AND blocked_locks.relation IS NOT DISTINCT FROM blocking_locks.relation
    AND blocked_locks.page IS NOT DISTINCT FROM blocking_locks.page
    AND blocked_locks.tuple IS NOT DISTINCT FROM blocking_locks.tuple
    AND blocked_locks.virtualxid IS NOT DISTINCT FROM blocking_locks.virtualxid
    AND blocked_locks.transactionid IS NOT DISTINCT FROM blocking_locks.transactionid
    AND blocked_locks.classid IS NOT DISTINCT FROM blocking_locks.classid
    AND blocked_locks.objid IS NOT DISTINCT FROM blocking_locks.objid
    AND blocked_locks.objsubid IS NOT DISTINCT FROM blocking_locks.objsubid
    AND blocked_locks.pid != blocking_locks.pid
JOIN pg_stat_activity blocking ON blocking_locks.pid = blocking.pid
WHERE NOT blocked_locks.granted;

Replication Statistics

-- Replication status
SELECT
    client_addr,
    state,
    sent_lsn,
    write_lsn,
    flush_lsn,
    replay_lsn,
    pg_wal_lsn_diff(sent_lsn, replay_lsn) as lag_bytes,
    sync_state
FROM pg_stat_replication;

-- Replication lag in seconds
SELECT
    client_addr,
    EXTRACT(EPOCH FROM (NOW() - pg_last_xact_replay_timestamp())) as lag_seconds
FROM pg_stat_replication;

Cache Hit Ratio

-- Database cache hit ratio
SELECT
    datname,
    ROUND(100.0 * blks_hit / NULLIF(blks_hit + blks_read, 0), 2) as cache_hit_ratio
FROM pg_stat_database
WHERE datname = 'sartiq';

-- Table cache hit ratio
SELECT
    schemaname,
    relname,
    ROUND(100.0 * heap_blks_hit / NULLIF(heap_blks_hit + heap_blks_read, 0), 2) as heap_hit_ratio,
    ROUND(100.0 * idx_blks_hit / NULLIF(idx_blks_hit + idx_blks_read, 0), 2) as idx_hit_ratio
FROM pg_statio_user_tables
WHERE heap_blks_hit + heap_blks_read > 100
ORDER BY heap_blks_read DESC;

Alerting

Critical Alerts

-- Long-running queries (> 5 minutes)
SELECT COUNT(*) > 0 as alert
FROM pg_stat_activity
WHERE state = 'active'
  AND NOW() - query_start > INTERVAL '5 minutes';

-- Connection count > 80% of max
SELECT
    COUNT(*) > (0.8 * current_setting('max_connections')::int) as alert
FROM pg_stat_activity;

-- Replication lag > 1 minute
SELECT
    EXISTS(
        SELECT 1 FROM pg_stat_replication
        WHERE pg_wal_lsn_diff(sent_lsn, replay_lsn) > 16 * 1024 * 1024  -- ~1min of WAL
    ) as alert;

-- Deadlocks detected
SELECT
    deadlocks > 0 as alert
FROM pg_stat_database
WHERE datname = 'sartiq'
  AND stats_reset > NOW() - INTERVAL '1 hour';

Warning Alerts

-- Cache hit ratio < 95%
SELECT
    ROUND(100.0 * blks_hit / NULLIF(blks_hit + blks_read, 0), 2) < 95 as alert
FROM pg_stat_database
WHERE datname = 'sartiq';

-- Tables with > 10% dead tuples
SELECT COUNT(*) > 0 as alert
FROM pg_stat_user_tables
WHERE n_dead_tup > 10000
  AND n_dead_tup > 0.1 * n_live_tup;

-- Unused indexes > 100MB
SELECT COUNT(*) > 0 as alert
FROM pg_stat_user_indexes
WHERE idx_scan = 0
  AND pg_relation_size(indexrelid) > 100 * 1024 * 1024;

Monitoring Dashboards

Grafana + Prometheus

# prometheus/postgres_exporter queries
pg_stat_activity_count:
  query: "SELECT state, COUNT(*) FROM pg_stat_activity GROUP BY state"
  metrics:
    - state:
        usage: "LABEL"
    - count:
        usage: "GAUGE"

pg_stat_statements_total_time:
  query: |
    SELECT
      LEFT(query, 50) as query,
      calls,
      total_exec_time
    FROM pg_stat_statements
    ORDER BY total_exec_time DESC
    LIMIT 20

Key Dashboard Panels

  1. Connection Pool
  2. Active connections
  3. Idle connections
  4. Connection wait time

  5. Query Performance

  6. Queries per second
  7. Average query time
  8. Slow query count

  9. Cache Performance

  10. Buffer cache hit ratio
  11. Index cache hit ratio
  12. Disk reads per second

  13. Replication

  14. Replication lag (bytes)
  15. Replication lag (seconds)
  16. Replica states

  17. Storage

  18. Database size
  19. Table bloat
  20. WAL generation rate

Logging

PostgreSQL Logging Config

# postgresql.conf

# Log slow queries
log_min_duration_statement = 100  # Log queries > 100ms

# Log connections
log_connections = on
log_disconnections = on

# Log locks
log_lock_waits = on
deadlock_timeout = 1s

# Log checkpoints
log_checkpoints = on

# Log format
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '

Query Logging

# Log all queries (development only!)
log_statement = 'all'

# Log only DDL
log_statement = 'ddl'

# Log queries with errors
log_min_error_statement = error

Health Check Endpoint

from fastapi import APIRouter
from sqlalchemy import text

router = APIRouter()

@router.get("/health/db")
async def database_health(db: AsyncSession = Depends(get_db)):
    """Database health check endpoint."""
    try:
        # Basic connectivity
        await db.execute(text("SELECT 1"))

        # Connection pool status
        pool = engine.pool

        # Query performance (last hour)
        result = await db.execute(text("""
            SELECT
                COUNT(*) as slow_queries
            FROM pg_stat_statements
            WHERE mean_exec_time > 1000  -- > 1 second
        """))
        slow_queries = result.scalar()

        return {
            "status": "healthy",
            "pool": {
                "size": pool.size(),
                "checkedin": pool.checkedin(),
                "checkedout": pool.checkedout(),
            },
            "slow_queries_count": slow_queries,
        }
    except Exception as e:
        return {
            "status": "unhealthy",
            "error": str(e),
        }

Best Practices

  1. Enable pg_stat_statements — Essential for query analysis
  2. Log slow queries — Set log_min_duration_statement
  3. Monitor connections — Alert before exhaustion
  4. Track replication lag — Critical for read replicas
  5. Watch cache hit ratios — Should be > 95%
  6. Review unused indexes — Remove to improve write performance
  7. Monitor vacuum activity — Prevent bloat
  8. Set up alerting — Don't wait for user reports

See Also

  • Metrics -- Prometheus metrics collection for application and database monitoring
  • Alerting -- Setting up alert rules and notification channels