Full-Text Search¶
Implement powerful text search using PostgreSQL's built-in capabilities.
Core Concepts¶
tsvector and tsquery¶
- tsvector: Document converted to searchable tokens (lexemes)
- tsquery: Search query parsed into tokens with operators
-- Convert text to tsvector
SELECT to_tsvector('english', 'The quick brown fox jumps over the lazy dog');
-- Result: 'brown':3 'dog':9 'fox':4 'jump':5 'lazi':8 'quick':2
-- Parse search query
SELECT to_tsquery('english', 'quick & brown');
-- Result: 'quick' & 'brown'
-- Search
SELECT to_tsvector('english', 'The quick brown fox') @@ to_tsquery('english', 'quick & fox');
-- Result: true
Text Search Configurations¶
-- Available configurations
SELECT cfgname FROM pg_ts_config;
-- Common configurations:
-- 'simple' - No stemming, no stop words
-- 'english' - English stemming and stop words
-- 'spanish' - Spanish stemming and stop words
Basic Implementation¶
Search Column Approach¶
-- Add search vector column
ALTER TABLE products ADD COLUMN search_vector tsvector;
-- Populate search vector
UPDATE products SET search_vector =
setweight(to_tsvector('english', COALESCE(name, '')), 'A') ||
setweight(to_tsvector('english', COALESCE(description, '')), 'B') ||
setweight(to_tsvector('english', COALESCE(category, '')), 'C');
-- Create GIN index
CREATE INDEX idx_products_search ON products USING GIN(search_vector);
-- Search query
SELECT name, description
FROM products
WHERE search_vector @@ to_tsquery('english', 'wireless & headphones')
ORDER BY ts_rank(search_vector, to_tsquery('english', 'wireless & headphones')) DESC;
Auto-Update with Trigger¶
-- Trigger function
CREATE OR REPLACE FUNCTION products_search_trigger()
RETURNS TRIGGER AS $$
BEGIN
NEW.search_vector :=
setweight(to_tsvector('english', COALESCE(NEW.name, '')), 'A') ||
setweight(to_tsvector('english', COALESCE(NEW.description, '')), 'B') ||
setweight(to_tsvector('english', COALESCE(NEW.category, '')), 'C');
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Apply trigger
CREATE TRIGGER products_search_update
BEFORE INSERT OR UPDATE ON products
FOR EACH ROW EXECUTE FUNCTION products_search_trigger();
SQLAlchemy Implementation¶
Model with Search Vector¶
from sqlalchemy import Column, Index, String, Text, func
from sqlalchemy.dialects.postgresql import TSVECTOR
class Product(Base):
__tablename__ = "products"
id = Column(Integer, primary_key=True)
name = Column(String(255), nullable=False)
description = Column(Text)
category = Column(String(100))
search_vector = Column(TSVECTOR)
__table_args__ = (
Index('idx_products_search', search_vector, postgresql_using='gin'),
)
# Update search vector on save
@event.listens_for(Product, 'before_insert')
@event.listens_for(Product, 'before_update')
def update_search_vector(mapper, connection, target):
target.search_vector = func.setweight(
func.to_tsvector('english', func.coalesce(target.name, '')), 'A'
).op('||')(
func.setweight(
func.to_tsvector('english', func.coalesce(target.description, '')), 'B'
)
)
Search Repository¶
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
class ProductSearchRepository:
def __init__(self, db: AsyncSession):
self.db = db
async def search(
self,
query: str,
limit: int = 20,
offset: int = 0
) -> list[Product]:
"""Search products by text query."""
# Parse search query
tsquery = func.plainto_tsquery('english', query)
# Build query with ranking
stmt = (
select(Product)
.where(Product.search_vector.op('@@')(tsquery))
.order_by(
func.ts_rank(Product.search_vector, tsquery).desc()
)
.limit(limit)
.offset(offset)
)
result = await self.db.execute(stmt)
return result.scalars().all()
async def search_with_highlights(
self,
query: str,
limit: int = 20
) -> list[dict]:
"""Search with highlighted snippets."""
tsquery = func.plainto_tsquery('english', query)
stmt = (
select(
Product.id,
Product.name,
func.ts_headline(
'english',
Product.description,
tsquery,
'StartSel=<mark>, StopSel=</mark>, MaxWords=35, MinWords=15'
).label('snippet'),
func.ts_rank(Product.search_vector, tsquery).label('rank')
)
.where(Product.search_vector.op('@@')(tsquery))
.order_by(func.ts_rank(Product.search_vector, tsquery).desc())
.limit(limit)
)
result = await self.db.execute(stmt)
return [dict(row._mapping) for row in result.all()]
Search Query Syntax¶
Query Operators¶
-- AND (both terms required)
SELECT * FROM products
WHERE search_vector @@ to_tsquery('english', 'wireless & bluetooth');
-- OR (either term)
SELECT * FROM products
WHERE search_vector @@ to_tsquery('english', 'wireless | wired');
-- NOT (exclude term)
SELECT * FROM products
WHERE search_vector @@ to_tsquery('english', 'headphones & !gaming');
-- Phrase (words in order)
SELECT * FROM products
WHERE search_vector @@ phraseto_tsquery('english', 'noise cancelling');
-- Prefix match
SELECT * FROM products
WHERE search_vector @@ to_tsquery('english', 'head:*');
User Query Parsing¶
def parse_user_query(query: str) -> str:
"""Convert user query to tsquery format."""
# Split into words, escape special chars
words = query.strip().split()
if not words:
return ""
# Join with AND, add prefix matching for last word
terms = [w.replace("'", "''") for w in words]
return " & ".join(terms[:-1] + [terms[-1] + ":*"])
# Usage
user_input = "wireless head"
parsed = parse_user_query(user_input) # "wireless & head:*"
Safe Query Building¶
from sqlalchemy import func
def safe_search_query(query: str):
"""Build safe tsquery from user input."""
# Use plainto_tsquery for simple AND search
# Or websearch_to_tsquery for Google-like syntax (PG 11+)
return func.websearch_to_tsquery('english', query)
# websearch_to_tsquery supports:
# "exact phrase"
# -excluded
# word1 OR word2
Ranking and Relevance¶
Basic Ranking¶
SELECT
name,
ts_rank(search_vector, query) AS rank
FROM products,
to_tsquery('english', 'headphones') AS query
WHERE search_vector @@ query
ORDER BY rank DESC;
Weighted Ranking¶
-- Weights: A=1.0, B=0.4, C=0.2, D=0.1 (default)
SELECT
name,
ts_rank('{0.1, 0.2, 0.4, 1.0}', search_vector, query) AS rank
FROM products,
to_tsquery('english', 'headphones') AS query
WHERE search_vector @@ query
ORDER BY rank DESC;
Combining with Other Factors¶
SELECT
name,
ts_rank(search_vector, query) * 0.7 +
(popularity_score / 100.0) * 0.3 AS combined_score
FROM products,
to_tsquery('english', 'headphones') AS query
WHERE search_vector @@ query
ORDER BY combined_score DESC;
Highlighting Results¶
Basic Highlighting¶
SELECT
name,
ts_headline('english', description, query,
'StartSel=<b>, StopSel=</b>, MaxWords=35, MinWords=15, MaxFragments=3')
AS snippet
FROM products,
to_tsquery('english', 'wireless') AS query
WHERE search_vector @@ query;
Highlight Options¶
ts_headline(config, document, query, options)
-- Options:
-- StartSel, StopSel: Highlight markers
-- MaxWords: Max words in each fragment
-- MinWords: Min words in each fragment
-- MaxFragments: Number of fragments (0 = whole document)
-- ShortWord: Min word length to keep
-- FragmentDelimiter: Separator between fragments
Index Optimization¶
GIN vs GiST¶
| Index | Build Speed | Size | Search Speed | Update Speed |
|---|---|---|---|---|
| GIN | Slow | Larger | Fast | Slow |
| GiST | Fast | Smaller | Slower | Fast |
Recommendation: Use GIN for most cases (search speed matters more).
Partial Index for Active Records¶
CREATE INDEX idx_products_search_active
ON products USING GIN(search_vector)
WHERE status = 'active';
Covering Index¶
-- Include commonly selected columns
CREATE INDEX idx_products_search_covering
ON products USING GIN(search_vector)
INCLUDE (name, price);
Multi-Table Search¶
Search Across Tables¶
-- Union search results from multiple tables
WITH search_results AS (
SELECT
'product' AS type,
id,
name AS title,
description AS content,
ts_rank(search_vector, query) AS rank
FROM products,
to_tsquery('english', 'camera') AS query
WHERE search_vector @@ query
UNION ALL
SELECT
'article' AS type,
id,
title,
body AS content,
ts_rank(search_vector, query) AS rank
FROM articles,
to_tsquery('english', 'camera') AS query
WHERE search_vector @@ query
)
SELECT * FROM search_results
ORDER BY rank DESC
LIMIT 20;
Materialized Search View¶
CREATE MATERIALIZED VIEW search_index AS
SELECT
'product' AS type,
id,
name AS title,
to_tsvector('english', name || ' ' || COALESCE(description, '')) AS search_vector
FROM products
WHERE status = 'active'
UNION ALL
SELECT
'article' AS type,
id,
title,
to_tsvector('english', title || ' ' || body) AS search_vector
FROM articles
WHERE published = true;
CREATE INDEX idx_search_index ON search_index USING GIN(search_vector);
-- Refresh periodically
REFRESH MATERIALIZED VIEW CONCURRENTLY search_index;
Performance Tips¶
Index Maintenance¶
-- Check index size
SELECT pg_size_pretty(pg_relation_size('idx_products_search'));
-- Reindex if bloated
REINDEX INDEX CONCURRENTLY idx_products_search;
Query Analysis¶
EXPLAIN ANALYZE
SELECT * FROM products
WHERE search_vector @@ to_tsquery('english', 'headphones');
-- Should show: Bitmap Index Scan using idx_products_search
Batch Updates¶
async def reindex_products(db: AsyncSession, batch_size: int = 1000):
"""Reindex search vectors in batches."""
offset = 0
while True:
result = await db.execute(
text("""
UPDATE products
SET search_vector = setweight(to_tsvector('english', COALESCE(name, '')), 'A') ||
setweight(to_tsvector('english', COALESCE(description, '')), 'B')
WHERE id IN (
SELECT id FROM products
ORDER BY id
LIMIT :limit OFFSET :offset
)
"""),
{"limit": batch_size, "offset": offset}
)
if result.rowcount == 0:
break
offset += batch_size
await db.commit()
Best Practices¶
- Store tsvector in column — Don't compute on every query
- Use weights — Title should rank higher than description
- Create GIN index — Essential for performance
- Use plainto_tsquery for user input — Safer than manual parsing
- Add highlighting — Users expect to see why results matched
- Consider websearch_to_tsquery — For familiar Google-like syntax
- Monitor index size — Reindex if bloated
- Test with real data — Relevance tuning needs real content