SQL Fundamentals¶
Master the SQL operations you'll use daily with PostgreSQL.
JOINs¶
Visual Reference¶
| Join Type | Diagram | Returns |
|---|---|---|
| INNER JOIN | A ∩ B | Only matching rows |
| LEFT JOIN | A + (A ∩ B) | All of A, matches from B |
| RIGHT JOIN | (A ∩ B) + B | All of B, matches from A |
| FULL OUTER JOIN | A ∪ B | All rows from both tables |
INNER JOIN¶
Returns rows only when there's a match in both tables.
-- Users with bookings
SELECT u.name, b.date, b.amount
FROM users u
INNER JOIN bookings b ON b.user_id = u.id;
LEFT JOIN¶
Returns all rows from left table, with matches from right (NULL if no match).
-- All users, with their bookings (if any)
SELECT u.name, b.date, b.amount
FROM users u
LEFT JOIN bookings b ON b.user_id = u.id;
-- Users WITHOUT bookings
SELECT u.name
FROM users u
LEFT JOIN bookings b ON b.user_id = u.id
WHERE b.id IS NULL;
RIGHT JOIN¶
Returns all rows from right table. Rarely used (rewrite as LEFT JOIN).
-- Same as LEFT JOIN with tables swapped
SELECT u.name, b.date
FROM bookings b
RIGHT JOIN users u ON b.user_id = u.id;
FULL OUTER JOIN¶
Returns all rows from both tables, with NULLs where no match.
-- All users and all bookings, matched where possible
SELECT u.name, b.date
FROM users u
FULL OUTER JOIN bookings b ON b.user_id = u.id;
CROSS JOIN¶
Cartesian product — every row paired with every other row.
-- Generate all time slot combinations
SELECT d.date, t.time_slot
FROM dates d
CROSS JOIN time_slots t;
-- Implicit syntax (comma)
SELECT d.date, t.time_slot
FROM dates d, time_slots t;
Self JOIN¶
Join a table to itself.
-- Find employees and their managers
SELECT e.name AS employee, m.name AS manager
FROM employees e
LEFT JOIN employees m ON e.manager_id = m.id;
Common Table Expressions (CTEs)¶
CTEs make complex queries readable by breaking them into named steps.
Basic CTE¶
WITH active_users AS (
SELECT id, name, email
FROM users
WHERE status = 'active'
AND last_login > NOW() - INTERVAL '30 days'
)
SELECT au.name, COUNT(b.id) AS booking_count
FROM active_users au
LEFT JOIN bookings b ON b.user_id = au.id
GROUP BY au.id, au.name;
Multiple CTEs¶
WITH
monthly_revenue AS (
SELECT
DATE_TRUNC('month', created_at) AS month,
SUM(amount) AS revenue
FROM bookings
WHERE status = 'completed'
GROUP BY DATE_TRUNC('month', created_at)
),
monthly_costs AS (
SELECT
DATE_TRUNC('month', date) AS month,
SUM(amount) AS costs
FROM expenses
GROUP BY DATE_TRUNC('month', date)
)
SELECT
r.month,
r.revenue,
c.costs,
r.revenue - COALESCE(c.costs, 0) AS profit
FROM monthly_revenue r
LEFT JOIN monthly_costs c ON c.month = r.month
ORDER BY r.month;
Recursive CTEs¶
For hierarchical data (org charts, categories, threads).
-- Get all subcategories of a category
WITH RECURSIVE category_tree AS (
-- Base case: start with parent category
SELECT id, name, parent_id, 1 AS depth
FROM categories
WHERE id = 5
UNION ALL
-- Recursive case: get children
SELECT c.id, c.name, c.parent_id, ct.depth + 1
FROM categories c
INNER JOIN category_tree ct ON c.parent_id = ct.id
WHERE ct.depth < 10 -- Prevent infinite loops
)
SELECT * FROM category_tree;
Window Functions¶
Perform calculations across related rows without grouping.
ROW_NUMBER¶
Assign sequential numbers to rows.
-- Number bookings per user
SELECT
user_id,
created_at,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at) AS booking_num
FROM bookings;
-- Get first booking per user
WITH numbered AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at) AS rn
FROM bookings
)
SELECT * FROM numbered WHERE rn = 1;
RANK and DENSE_RANK¶
Handle ties differently than ROW_NUMBER.
-- Rank users by total spending
SELECT
user_id,
total_spent,
RANK() OVER (ORDER BY total_spent DESC) AS rank,
DENSE_RANK() OVER (ORDER BY total_spent DESC) AS dense_rank
FROM (
SELECT user_id, SUM(amount) AS total_spent
FROM bookings
GROUP BY user_id
) t;
-- RANK: 1, 2, 2, 4 (skips 3)
-- DENSE_RANK: 1, 2, 2, 3 (no gaps)
LAG and LEAD¶
Access previous/next row values.
-- Compare each booking to previous
SELECT
user_id,
created_at,
amount,
LAG(amount) OVER (PARTITION BY user_id ORDER BY created_at) AS prev_amount,
amount - LAG(amount) OVER (PARTITION BY user_id ORDER BY created_at) AS diff
FROM bookings;
-- Get next scheduled appointment
SELECT
user_id,
date,
LEAD(date) OVER (PARTITION BY user_id ORDER BY date) AS next_date
FROM appointments;
Running Totals and Averages¶
-- Running total of revenue
SELECT
date,
amount,
SUM(amount) OVER (ORDER BY date) AS running_total,
AVG(amount) OVER (ORDER BY date) AS running_avg
FROM daily_revenue;
-- Running total within each month
SELECT
date,
amount,
SUM(amount) OVER (
PARTITION BY DATE_TRUNC('month', date)
ORDER BY date
) AS monthly_running_total
FROM daily_revenue;
Window Frame Clauses¶
Control which rows are included in the window.
-- 7-day moving average
SELECT
date,
amount,
AVG(amount) OVER (
ORDER BY date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg_7d
FROM daily_revenue;
-- Include all preceding rows (default for running totals)
SUM(amount) OVER (ORDER BY date ROWS UNBOUNDED PRECEDING)
-- Centered window (3 before, current, 3 after)
AVG(amount) OVER (
ORDER BY date
ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING
)
Subqueries¶
Scalar Subqueries¶
Return a single value.
-- Users with above-average bookings
SELECT name
FROM users
WHERE id IN (
SELECT user_id
FROM bookings
GROUP BY user_id
HAVING COUNT(*) > (SELECT AVG(cnt) FROM (
SELECT COUNT(*) AS cnt FROM bookings GROUP BY user_id
) t)
);
Correlated Subqueries¶
Reference the outer query.
-- Users with their latest booking date
SELECT
u.name,
(SELECT MAX(created_at)
FROM bookings b
WHERE b.user_id = u.id) AS last_booking
FROM users u;
EXISTS¶
Check if rows exist (often faster than IN).
-- Users who have made at least one booking
SELECT u.name
FROM users u
WHERE EXISTS (
SELECT 1
FROM bookings b
WHERE b.user_id = u.id
);
-- Users who have NEVER booked
SELECT u.name
FROM users u
WHERE NOT EXISTS (
SELECT 1
FROM bookings b
WHERE b.user_id = u.id
);
PostgreSQL-Specific Features¶
DISTINCT ON¶
Get first row per group (simpler than window functions).
-- Latest booking per user
SELECT DISTINCT ON (user_id) *
FROM bookings
ORDER BY user_id, created_at DESC;
LATERAL Joins¶
Subquery can reference columns from preceding tables.
-- Top 3 bookings per user
SELECT u.name, b.date, b.amount
FROM users u
CROSS JOIN LATERAL (
SELECT date, amount
FROM bookings
WHERE user_id = u.id
ORDER BY amount DESC
LIMIT 3
) b;
Array Operations¶
-- Users with specific tags
SELECT * FROM users
WHERE tags @> ARRAY['premium', 'verified'];
-- Unnest array to rows
SELECT unnest(tags) AS tag FROM users WHERE id = 1;
-- Aggregate into array
SELECT user_id, array_agg(tag ORDER BY tag) AS all_tags
FROM user_tags
GROUP BY user_id;
JSONB Operations¶
-- Query JSON field
SELECT * FROM users
WHERE preferences->>'theme' = 'dark';
-- Deep access
SELECT preferences->'notifications'->>'email' AS email_pref
FROM users;
-- Contains operator
SELECT * FROM products
WHERE metadata @> '{"category": "photo"}';
-- Update JSON field
UPDATE users
SET preferences = preferences || '{"theme": "light"}'
WHERE id = 1;
-- Remove JSON key
UPDATE users
SET preferences = preferences - 'deprecated_key'
WHERE id = 1;
UPSERT (INSERT ... ON CONFLICT)¶
-- Insert or update
INSERT INTO user_preferences (user_id, key, value)
VALUES (1, 'theme', 'dark')
ON CONFLICT (user_id, key)
DO UPDATE SET value = EXCLUDED.value, updated_at = NOW();
-- Insert or ignore
INSERT INTO tags (name)
VALUES ('new-tag')
ON CONFLICT (name) DO NOTHING;
RETURNING Clause¶
Get affected rows back.
-- Get inserted row
INSERT INTO users (name, email)
VALUES ('John', 'john@example.com')
RETURNING id, created_at;
-- Get deleted rows
DELETE FROM sessions
WHERE expires_at < NOW()
RETURNING user_id, id;
-- Get updated rows
UPDATE bookings
SET status = 'confirmed'
WHERE status = 'pending' AND created_at < NOW() - INTERVAL '1 hour'
RETURNING *;
Generate Series¶
Create sequences of values.
-- Generate dates
SELECT generate_series(
'2024-01-01'::date,
'2024-12-31'::date,
'1 day'::interval
) AS date;
-- Fill gaps in time series data
SELECT
d.date,
COALESCE(r.revenue, 0) AS revenue
FROM generate_series(
'2024-01-01'::date,
'2024-01-31'::date,
'1 day'::interval
) d(date)
LEFT JOIN daily_revenue r ON r.date = d.date;
Aggregations¶
Basic Aggregates¶
SELECT
COUNT(*) AS total_rows,
COUNT(DISTINCT user_id) AS unique_users,
SUM(amount) AS total_amount,
AVG(amount) AS avg_amount,
MIN(amount) AS min_amount,
MAX(amount) AS max_amount
FROM bookings;
GROUP BY with Aggregates¶
-- Revenue by month
SELECT
DATE_TRUNC('month', created_at) AS month,
COUNT(*) AS bookings,
SUM(amount) AS revenue
FROM bookings
GROUP BY DATE_TRUNC('month', created_at)
ORDER BY month;
HAVING¶
Filter groups (WHERE filters rows before grouping).
-- Users with more than 5 bookings
SELECT user_id, COUNT(*) AS booking_count
FROM bookings
GROUP BY user_id
HAVING COUNT(*) > 5;
FILTER Clause¶
Conditional aggregation (cleaner than CASE).
SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE status = 'completed') AS completed,
COUNT(*) FILTER (WHERE status = 'pending') AS pending,
SUM(amount) FILTER (WHERE status = 'completed') AS completed_revenue
FROM bookings;
GROUPING SETS, CUBE, ROLLUP¶
Multiple grouping levels in one query.
-- Multiple grouping levels
SELECT
COALESCE(region, 'All Regions') AS region,
COALESCE(category, 'All Categories') AS category,
SUM(amount) AS total
FROM sales
GROUP BY GROUPING SETS (
(region, category),
(region),
(category),
()
);
-- ROLLUP: hierarchical totals
SELECT region, city, SUM(amount)
FROM sales
GROUP BY ROLLUP (region, city);
-- Produces: (region, city), (region), ()
-- CUBE: all combinations
SELECT region, category, SUM(amount)
FROM sales
GROUP BY CUBE (region, category);
-- Produces: (region, category), (region), (category), ()
Best Practices¶
- Use CTEs for readability — Break complex queries into named steps
- Prefer EXISTS over IN — For subqueries checking existence
- Use window functions — Instead of self-joins for row comparisons
- DISTINCT ON for first-per-group — Simpler than ROW_NUMBER
- JSONB for flexible data — But index it properly
- RETURNING for efficiency — Get data back without separate SELECT