Skip to main content
Data & Analytics — Documented

sql-expert

Advanced SQL with window functions, CTEs, recursive queries, indexing strategies, EXPLAIN ANALYZE, JSONB, full-text search, and query optimization. PostgreSQL-first with coverage for MySQL and SQLite.

Share:

Installation

npx clawhub@latest install sql-expert

View the full skill documentation and source below.

Documentation

Advanced SQL Expert

PostgreSQL First (Applies to Most Databases)


Window Functions — Most Underused SQL Feature

Window functions compute across rows without collapsing them. No GROUP BY, no subqueries needed.

-- Syntax: function() OVER (PARTITION BY ... ORDER BY ... ROWS/RANGE ...)

-- Running total
-- NOTE: with ORDER BY alone the default frame is RANGE BETWEEN UNBOUNDED
-- PRECEDING AND CURRENT ROW, which treats rows tied on order_date as peers —
-- all ties get the same running total. An explicit ROWS frame gives a true
-- row-by-row running sum.
SELECT
    order_date,
    amount,
    SUM(amount) OVER (
        ORDER BY order_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_total
FROM orders;

-- Rank within each department
-- Tie behavior differs: RANK leaves gaps after ties (1,1,3);
-- DENSE_RANK does not (1,1,2); ROW_NUMBER is always unique, but the
-- order among tied salaries is arbitrary without a tiebreaker column
SELECT
    name,
    department,
    salary,
    RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank,
    DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_dense_rank,
    ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
FROM employees;

-- Get top 3 per category (derived-table form; a CTE works identically)
SELECT *
FROM (
    SELECT p.*,
           ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) AS rn
    FROM products p
) ranked
WHERE rn <= 3;

-- Moving average (7-day)
-- Frame is 7 physical rows (6 preceding + current); assumes one row per
-- date — gaps in the series silently widen the calendar span covered
SELECT
    date,
    revenue,
    AVG(revenue) OVER (
        ORDER BY date
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS moving_avg_7d
FROM daily_revenue;
-- NOTE: the first 6 rows average over fewer than 7 rows (partial window)

-- LAG / LEAD — compare with previous/next row
-- Both return NULL where no prev/next row exists (an optional third
-- argument supplies a default), so day_over_day is NULL on the first row
SELECT
    date,
    revenue,
    LAG(revenue, 1) OVER (ORDER BY date) AS prev_day,
    revenue - LAG(revenue, 1) OVER (ORDER BY date) AS day_over_day,
    LEAD(revenue, 1) OVER (ORDER BY date) AS next_day
FROM daily_revenue;

-- FIRST_VALUE / LAST_VALUE — keep context while aggregating
-- The explicit full-partition frame is harmless for FIRST_VALUE but
-- essential for LAST_VALUE: the default frame ends at the current row,
-- which would make LAST_VALUE return the current row's own value
SELECT
    name,
    department,
    salary,
    FIRST_VALUE(name) OVER (
        PARTITION BY department
        ORDER BY salary DESC
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS highest_paid_in_dept
FROM employees;

-- NTILE — percentile buckets
-- NTILE(4) splits rows into 4 buckets whose sizes differ by at most one;
-- PERCENT_RANK returns (rank - 1) / (total rows - 1), ranging 0..1
SELECT
    name,
    salary,
    NTILE(4) OVER (ORDER BY salary) AS quartile,
    PERCENT_RANK() OVER (ORDER BY salary) AS pct_rank
FROM employees;

CTEs (Common Table Expressions)

-- Basic CTE — readability win; since PostgreSQL 12, simple CTEs are
-- inlined, so there is no inherent performance benefit (or penalty)
WITH monthly_revenue AS (
    SELECT
        DATE_TRUNC('month', created_at) AS month,
        SUM(amount) AS total
    FROM orders
    WHERE status = 'completed'
    GROUP BY 1
),
with_prev AS (
    -- Compute LAG once instead of repeating it three times
    SELECT
        month,
        total,
        LAG(total) OVER (ORDER BY month) AS prev_month
    FROM monthly_revenue
),
monthly_growth AS (
    SELECT
        month,
        total,
        prev_month,
        -- * 100.0 forces numeric math: if amount is an integer type,
        -- (total - prev) / prev is integer division and growth_pct
        -- would silently truncate to 0
        ROUND(
            (total - prev_month) * 100.0 /
            NULLIF(prev_month, 0), 2
        ) AS growth_pct
    FROM with_prev
)
SELECT * FROM monthly_growth
ORDER BY month;

-- Multiple CTEs — each one is a single, named transformation step
WITH
  active_users AS (
      -- Restrict to users we still care about
      SELECT user_id
      FROM users
      WHERE status = 'active'
  ),
  recent_orders AS (
      -- Orders placed in the last 30 days
      SELECT *
      FROM orders
      WHERE created_at > NOW() - INTERVAL '30 days'
  ),
  user_order_counts AS (
      -- Per-user activity over the window, active users only
      SELECT
          ro.user_id,
          COUNT(*) AS order_count,
          SUM(ro.amount) AS total_spent
      FROM recent_orders ro
      JOIN active_users au ON au.user_id = ro.user_id
      GROUP BY ro.user_id
  )
SELECT
    usr.email,
    uoc.order_count,
    uoc.total_spent
FROM user_order_counts uoc
JOIN users usr ON usr.id = uoc.user_id
ORDER BY uoc.total_spent DESC;

Recursive CTEs — Hierarchies and Graphs

-- Employee hierarchy (org chart)
-- CAUTION: UNION ALL recursion loops forever if the data contains a
-- cycle (e.g. two rows managing each other); if cycles are possible,
-- track visited ids (ancestors-array pattern) or use PG 14's CYCLE clause
WITH RECURSIVE employee_tree AS (
    -- Base case: CEO (no manager)
    SELECT id, name, manager_id, 0 AS depth, name::TEXT AS path
    FROM employees
    WHERE manager_id IS NULL
    
    UNION ALL
    
    -- Recursive case: reports to someone in the tree
    SELECT e.id, e.name, e.manager_id, et.depth + 1,
           et.path || ' > ' || e.name
    FROM employees e
    JOIN employee_tree et ON e.manager_id = et.id
)
SELECT id, name, depth, path
FROM employee_tree
ORDER BY path;

-- Category tree traversal
WITH RECURSIVE category_path AS (
    -- Roots start with a one-element ancestors array
    SELECT id, name, parent_id, ARRAY[id] AS ancestors
    FROM categories
    WHERE parent_id IS NULL
    
    UNION ALL
    
    -- Children append their own id; the array doubles as a visited set —
    -- add `AND NOT c.id = ANY(cp.ancestors)` to the join to be cycle-safe
    SELECT c.id, c.name, c.parent_id, cp.ancestors || c.id
    FROM categories c
    JOIN category_path cp ON c.parent_id = cp.id
)
SELECT * FROM category_path;

-- Find all descendants of a node
-- NOTE: the result includes the start node itself; filter it out in the
-- final SELECT (WHERE id <> 5) if only strict descendants are wanted
WITH RECURSIVE subtree AS (
    SELECT id, name FROM categories WHERE id = 5  -- Start node
    
    UNION ALL
    
    SELECT c.id, c.name
    FROM categories c
    JOIN subtree s ON c.parent_id = s.id
)
SELECT * FROM subtree;

Query Optimization

EXPLAIN ANALYZE — Read It

-- Always use EXPLAIN ANALYZE to see actual execution
-- CAUTION: ANALYZE really executes the statement — wrap DML
-- (UPDATE/DELETE/INSERT) in BEGIN ... ROLLBACK when profiling it
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT) 
SELECT u.name, COUNT(o.id)
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
GROUP BY u.id, u.name;

-- Key things to look for:
-- Seq Scan → table scan (bad on large tables, needs index)
-- Index Scan → using an index (good)
-- Hash Join → building hash table in memory
-- Nested Loop → O(n*m) — watch for this on large datasets
-- rows= in actual vs estimated → large divergence = bad statistics → ANALYZE table
-- Buffers: hit= (cached) vs read= (disk) → high reads = memory pressure

Indexing Strategy

-- B-tree index (default) — equality, range, ORDER BY
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_orders_status_created ON orders(status, created_at DESC);

-- Partial index — index only the rows you query most
-- (the query's WHERE clause must imply the index predicate to use it)
CREATE INDEX idx_active_users ON users(email) WHERE status = 'active';
CREATE INDEX idx_pending_orders ON orders(created_at) WHERE status = 'pending';

-- Covering index (include) — index-only scan, no heap fetch
-- NOTE: INCLUDE requires PostgreSQL 11+
CREATE INDEX idx_orders_user_covering ON orders(user_id) INCLUDE (amount, status, created_at);

-- GIN index for arrays and JSONB
-- jsonb_path_ops is smaller and faster than the default opclass but
-- supports only the @> containment operator (not ? / ?| / ?&)
CREATE INDEX idx_products_tags ON products USING GIN(tags);
CREATE INDEX idx_user_meta ON users USING GIN(metadata jsonb_path_ops);

-- Text search
CREATE INDEX idx_products_search ON products USING GIN(
    to_tsvector('english', name || ' ' || COALESCE(description, ''))
);

-- Multicolumn index column order matters!
-- Query: WHERE a = 1 AND b > 5 ORDER BY c
-- Good: INDEX ON (a, b, c) — leftmost prefix matches
-- Bad: INDEX ON (c, b, a) — can't use for this query

-- Check index usage
-- idx_scan = 0 over a long window suggests the index is dead weight
SELECT indexname, idx_scan, idx_tup_read, idx_tup_fetch
FROM pg_stat_user_indexes
WHERE relname = 'orders'
ORDER BY idx_scan DESC;

-- Find missing indexes (sequential scans on large tables)
SELECT relname, seq_scan, seq_tup_read, idx_scan,
       ROUND(seq_tup_read::NUMERIC / NULLIF(seq_scan, 0)) AS avg_rows_per_scan
FROM pg_stat_user_tables
WHERE seq_scan > 100
ORDER BY seq_tup_read DESC;

JSON/JSONB Operations (PostgreSQL)

-- JSONB is binary-stored, indexed, fast. Use it.
-- (Prefer JSONB over JSON unless you must preserve key order/duplicates)

-- Create table with JSONB
CREATE TABLE events (
    id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
    type TEXT NOT NULL,
    data JSONB NOT NULL,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Insert
INSERT INTO events (type, data) VALUES 
('user.signup', '{"user_id": "abc", "email": "[email protected]", "source": "organic"}');

-- Query by JSON field
-- -> returns JSONB, ->> returns text; chain -> for nesting, end with ->>
SELECT * FROM events
WHERE data->>'email' = '[email protected]';  -- ->> extracts as text

SELECT * FROM events  
WHERE data->'metadata'->>'plan' = 'pro';   -- Nested field

-- Containment operator @> (uses GIN index)
SELECT * FROM events
WHERE data @> '{"source": "organic"}';     -- data contains this JSON

-- Array operations
-- ? / ?| are indexable only with the default jsonb_ops GIN opclass,
-- not jsonb_path_ops
SELECT data->'tags' FROM products WHERE id = 1;
SELECT * FROM products WHERE data->'tags' ? 'featured';    -- Contains element
SELECT * FROM products WHERE data->'tags' ?| ARRAY['sale', 'new'];  -- Any of

-- Update JSONB
-- || is a shallow merge: top-level keys from the right operand win
UPDATE users
SET metadata = metadata || '{"plan": "pro"}'::JSONB
WHERE id = 'user-123';

-- jsonb_set targets a path; the new value must be valid JSON ('"..."')
UPDATE users
SET metadata = jsonb_set(metadata, '{address,city}', '"San Francisco"')
WHERE id = 'user-123';

-- Extract and aggregate
SELECT
    data->>'source' AS acquisition_source,
    COUNT(*) AS signups,
    COUNT(*) FILTER (WHERE data->>'plan' = 'pro') AS pro_signups
FROM events
WHERE type = 'user.signup'
GROUP BY 1
ORDER BY 2 DESC;

-- Full-text search on JSONB
-- NOTE: without a matching expression index this is a sequential scan —
-- pair with the to_tsvector GIN index pattern from the indexing section
SELECT * FROM products
WHERE to_tsvector('english', data->>'description') @@ plainto_tsquery('wireless headphones');

N+1 Query Problem and Solutions

-- BAD: N+1 (1 query for orders + N queries for users)
-- This happens in ORMs when you do: orders.each { |o| o.user.name }

-- GOOD: Single query with JOIN
SELECT o.id, o.amount, u.name, u.email
FROM orders o
JOIN users u ON u.id = o.user_id
WHERE o.status = 'pending';

-- Batch loading pattern with IN
-- Instead of: SELECT * FROM users WHERE id = ? (per row)
SELECT id, name, email
FROM users
WHERE id = ANY(ARRAY['id1', 'id2', 'id3']::UUID[]);

-- Lateral join — per-row subquery (powerful, explicit)
-- The aggregate subquery always yields exactly one row (even with zero
-- matching orders), so CROSS JOIN LATERAL still keeps every user
SELECT u.id, u.name, recent_orders.*
FROM users u
CROSS JOIN LATERAL (
    SELECT COUNT(*) AS order_count, SUM(amount) AS total_spent
    FROM orders
    WHERE user_id = u.id
      AND created_at > NOW() - INTERVAL '90 days'
) AS recent_orders;

Aggregations and Analytics Patterns

-- Conditional aggregation
-- FILTER (PostgreSQL 9.4+) is clearer than CASE-in-aggregate
SELECT
    DATE_TRUNC('month', created_at) AS month,
    COUNT(*) AS total_orders,
    COUNT(*) FILTER (WHERE status = 'completed') AS completed,
    COUNT(*) FILTER (WHERE status = 'refunded') AS refunded,
    SUM(amount) FILTER (WHERE status = 'completed') AS revenue,
    AVG(amount) AS avg_order_value,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) AS median_order
FROM orders
GROUP BY 1
ORDER BY 1;

-- Pivot table with CROSSTAB
-- Requires the tablefunc extension: CREATE EXTENSION IF NOT EXISTS tablefunc;
SELECT *
FROM CROSSTAB(
    'SELECT department, quarter, SUM(sales)
     FROM sales_data
     GROUP BY 1, 2
     ORDER BY 1, 2',
    'VALUES (''Q1''), (''Q2''), (''Q3''), (''Q4'')'
) AS ct(department TEXT, q1 NUMERIC, q2 NUMERIC, q3 NUMERIC, q4 NUMERIC);

-- Cohort analysis
WITH cohorts AS (
    -- Each user's first-order month
    SELECT
        user_id,
        DATE_TRUNC('month', MIN(created_at)) AS cohort_month
    FROM orders
    GROUP BY user_id
),
order_cohorts AS (
    SELECT
        c.user_id,
        c.cohort_month,
        -- AGE() returns an interval like '1 year 2 mons'. Taking only the
        -- 'month' field wraps at 12 — month 13 would be misclassified as
        -- month 1 — so fold years into months for a true month offset.
        (DATE_PART('year', AGE(DATE_TRUNC('month', o.created_at), c.cohort_month)) * 12
         + DATE_PART('month', AGE(DATE_TRUNC('month', o.created_at), c.cohort_month))) AS months_since_first
    FROM cohorts c
    JOIN orders o ON o.user_id = c.user_id
)
SELECT
    cohort_month,
    COUNT(DISTINCT user_id) FILTER (WHERE months_since_first = 0) AS month_0,
    COUNT(DISTINCT user_id) FILTER (WHERE months_since_first = 1) AS month_1,
    COUNT(DISTINCT user_id) FILTER (WHERE months_since_first = 2) AS month_2,
    COUNT(DISTINCT user_id) FILTER (WHERE months_since_first = 3) AS month_3
FROM order_cohorts
GROUP BY cohort_month
ORDER BY cohort_month;

Performance Patterns

-- UPSERT (PostgreSQL)
-- EXCLUDED refers to the row that was proposed for insertion
INSERT INTO user_stats (user_id, login_count, last_login)
VALUES ($1, 1, NOW())
ON CONFLICT (user_id)
DO UPDATE SET
    login_count = user_stats.login_count + 1,
    last_login = EXCLUDED.last_login;

-- Efficient pagination (keyset vs offset)
-- BAD: OFFSET 10000 LIMIT 20 (scans 10020 rows)
SELECT * FROM orders ORDER BY id LIMIT 20 OFFSET 10000;

-- GOOD: Keyset pagination (O(log n) with index)
-- Works because id is unique; for a non-unique sort key, add id as a
-- tiebreaker: WHERE (sort_col, id) > ($last_sort, $last_id)
SELECT * FROM orders
WHERE id > $last_id
ORDER BY id ASC
LIMIT 20;

-- Bulk insert
-- One round-trip for N rows; the arrays are bound as parameters
INSERT INTO events (type, data, created_at)
SELECT type, data, created_at
FROM UNNEST($1::TEXT[], $2::JSONB[], $3::TIMESTAMPTZ[])
    AS t(type, data, created_at);

-- Vacuuming — prevent table bloat
VACUUM ANALYZE orders;          -- Manual (usually auto)
VACUUM (VERBOSE, ANALYZE) orders;  -- See what it does

-- Table statistics
-- High dead_pct plus a stale last_autovacuum = autovacuum falling behind
SELECT
    n_live_tup,
    n_dead_tup,
    ROUND(n_dead_tup * 100.0 / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_pct,
    last_vacuum,
    last_autovacuum,
    last_analyze
FROM pg_stat_user_tables
WHERE relname = 'orders';