PostgreSQL
Architecture Overview
Process Model
PostgreSQL uses a multi-process model rather than multi-threading. Each client connection is served by a dedicated backend process. The postmaster is the parent daemon that listens for connections and forks new backends. Key background processes (visible at runtime via the query after this list) include:
- postmaster — listener, process manager, crash recovery coordinator
- bgwriter — proactively writes dirty shared_buffers pages to disk to reduce checkpoint pressure
- checkpointer — performs periodic checkpoints, writing all dirty pages and advancing the WAL checkpoint LSN
- walwriter — flushes WAL buffers to disk, ensuring durability
- autovacuum launcher + workers — reclaims dead tuples and updates visibility maps
- stats collector — gathers activity and table-level statistics exposed through the pg_stat_* views (replaced by the shared-memory cumulative statistics system in PostgreSQL 15+)
- wal sender / wal receiver — stream WAL to standby replicas
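Because every worker is an ordinary OS process, you can inspect them from SQL: pg_stat_activity exposes a backend_type column (PostgreSQL 10+) that distinguishes client backends from the background workers listed above.
-- One row per process managed by the postmaster
SELECT pid, backend_type, state
FROM pg_stat_activity
ORDER BY backend_type;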
Shared Memory
All backend processes share a single region of shared memory. The most important component is
shared_buffers — the in-memory page cache for data blocks. Other regions include the
WAL buffers (wal_buffers), lock table, and process arrays. Data flows: disk → shared_buffers → backend process local memory → query output.
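To see which relations currently occupy shared_buffers, the pg_buffercache contrib extension maps each buffer to its relation. A sketch, assuming the extension is available and the default 8 kB block size:
CREATE EXTENSION IF NOT EXISTS pg_buffercache;

-- Top 10 relations by number of cached buffers in the current database
SELECT c.relname,
       count(*) AS buffers,
       pg_size_pretty(count(*) * 8192) AS cached
FROM pg_buffercache b
JOIN pg_class c
  ON b.relfilenode = pg_relation_filenode(c.oid)
 AND b.reldatabase IN (0, (SELECT oid FROM pg_database WHERE datname = current_database()))
GROUP BY c.relname
ORDER BY buffers DESC
LIMIT 10;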
Write-Ahead Log (WAL)
Every change is first written to the WAL (a sequential log of change records) before the actual data pages are modified. This ensures durability — even if the server crashes mid-write, the WAL can replay changes during recovery. WAL is also the foundation of streaming replication, logical replication, and PITR.
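The current WAL positions can be read directly on the primary, which is handy for eyeballing write volume or comparing against a standby's replay position:
-- Run on the primary (these functions error out during recovery)
SELECT pg_current_wal_insert_lsn()               AS insert_lsn,
       pg_current_wal_flush_lsn()                AS flush_lsn,
       pg_walfile_name(pg_current_wal_lsn())     AS current_wal_file;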
Performance Tuning: postgresql.conf
The following configuration is tuned for a dedicated 32 GB RAM OLTP server. Adjust proportionally for your hardware profile.
# ============================================================
# postgresql.conf — Production OLTP Tuning (32 GB RAM server)
# ============================================================
# ---- Memory ----
shared_buffers = 8GB # ~25% of RAM; primary data page cache
work_mem = 64MB # Per sort/hash operation per query node; can multiply with parallel workers
maintenance_work_mem = 2GB # VACUUM, CREATE INDEX, ALTER TABLE — can be large
effective_cache_size = 24GB # Estimate of OS + PG cache; used by planner to cost index scans
huge_pages = try # Use Linux huge pages if available (reduces TLB pressure)
# ---- Connections ----
max_connections = 200 # Keep low; use PgBouncer for application pooling
superuser_reserved_connections = 5
# ---- WAL / Durability ----
wal_level = replica # Minimum for streaming replication; use 'logical' for logical replication
wal_buffers = 64MB # WAL write buffer; -1 = auto (1/32 of shared_buffers, capped at 16 MB)
synchronous_commit = on # on = wait for WAL flush before ACK; off boosts write throughput but can lose the latest commits on crash (no data corruption)
max_wal_size = 4GB # Soft limit on WAL between checkpoints; larger values mean fewer, larger checkpoints
min_wal_size = 1GB # Keep at least this much WAL on disk
wal_compression = on # Compress WAL records (CPU trade for I/O savings)
# ---- Checkpoints ----
checkpoint_completion_target = 0.9 # Spread checkpoint writes over 90% of checkpoint interval
checkpoint_timeout = 15min # Maximum time between automatic checkpoints
# ---- Planner ----
random_page_cost = 1.1 # SSD: 1.1; HDD: 4.0; lower = planner prefers index scans
effective_io_concurrency = 200 # SSD: 200; HDD: 2; used for bitmap heap scans
default_statistics_target = 200 # More histogram buckets for better cardinality estimates
# ---- Parallelism ----
max_worker_processes = 16
max_parallel_workers_per_gather = 4
max_parallel_workers = 8
max_parallel_maintenance_workers = 4 # Parallel CREATE INDEX, VACUUM
# ---- Logging ----
log_min_duration_statement = 1000 # Log queries taking > 1 second
log_checkpoints = on
log_connections = off
log_lock_waits = on
log_temp_files = 0 # Log all temp file creation (0 = all, -1 = none)
log_autovacuum_min_duration = 250 # Log autovacuum runs taking > 250ms
# ---- Locale ----
default_text_search_config = 'pg_catalog.english'
As rules of thumb: shared_buffers ≈ 25% of RAM, effective_cache_size ≈ 75% of RAM, and work_mem ≈ RAM / (max_connections × 2). Use pgTune (pgtune.leopard.in.ua) to generate a baseline configuration for your hardware, then verify with the query below.
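After editing postgresql.conf, confirm what is actually in effect (and whether a restart is still pending) via pg_settings:
SELECT name, setting, unit, pending_restart
FROM pg_settings
WHERE name IN ('shared_buffers', 'effective_cache_size', 'work_mem',
               'maintenance_work_mem', 'max_connections');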
Indexing Strategies
| Index Type | Data Structures | Best For | Notes |
|---|---|---|---|
| B-tree | Default; balanced tree | Equality, range queries, ORDER BY, LIKE 'prefix%' (requires text_pattern_ops or C collation) | Default index type; handles NULLs; supports unique constraints |
| GIN | Generalised Inverted Index | Full-text search (tsvector), JSONB containment (@>), array operators | Slower writes (index updates are expensive); very fast reads; use fastupdate for bulk inserts |
| GiST | Generalised Search Tree | Geometric types, PostGIS geography, range types, nearest-neighbour (KNN) | Lossy by nature — requires recheck; supports <<, &&, @> operators |
| BRIN | Block Range Index | Very large tables with naturally ordered data (timestamps, sequential IDs, IoT sensor data) | Tiny index footprint; only stores min/max per block range; much faster for time-series than B-tree |
| Hash | Hash table | Equality-only lookups | WAL-logged and crash-safe since PostgreSQL 10; can beat B-tree for equality on long keys; no range or ordering support |
| SP-GiST | Space-partitioned GiST | Non-balanced structures: quadtrees, k-d trees, prefix trees (text) | Good for phone number prefix search, IP range trees |
-- Partial index: only index active users (reduces index size dramatically)
CREATE INDEX CONCURRENTLY idx_users_email_active
ON users (email)
WHERE is_active = true;
-- Covering index: include extra columns to enable index-only scans
CREATE INDEX CONCURRENTLY idx_orders_customer_created
ON orders (customer_id, created_at DESC)
INCLUDE (status, total_amount);
-- GIN index for JSONB containment queries
CREATE INDEX CONCURRENTLY idx_events_payload
ON events USING GIN (payload jsonb_path_ops);
-- BRIN index for append-only time-series table
CREATE INDEX idx_metrics_ts_brin
ON metrics USING BRIN (recorded_at)
WITH (pages_per_range = 128);
-- Full-text search with GIN
ALTER TABLE articles ADD COLUMN search_vector tsvector;
UPDATE articles SET search_vector = to_tsvector('english', title || ' ' || body);
CREATE INDEX idx_articles_fts ON articles USING GIN (search_vector);
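Note that the UPDATE above is a one-time backfill: search_vector goes stale as rows change unless a trigger maintains it. On PostgreSQL 12+ a stored generated column keeps the vector current automatically; a sketch against the same articles(title, body) table, using a hypothetical column name to avoid clashing with the plain column added above:
-- Replaces the manually maintained search_vector column and its backfill UPDATE
ALTER TABLE articles
  ADD COLUMN search_vector_gen tsvector
  GENERATED ALWAYS AS (to_tsvector('english', coalesce(title, '') || ' ' || coalesce(body, ''))) STORED;
CREATE INDEX idx_articles_fts_gen ON articles USING GIN (search_vector_gen);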
VACUUM and AUTOVACUUM Tuning
PostgreSQL uses MVCC (Multi-Version Concurrency Control). Dead tuples from UPDATEs and DELETEs are not immediately removed — they accumulate until VACUUM reclaims them. Bloat causes table scans to read unnecessary pages, degrading performance. Autovacuum runs automatically but its defaults are conservative.
Table-Level Autovacuum Overrides
For high-churn tables, override autovacuum settings at the table level rather than globally to avoid vacuuming every table aggressively:
# Global autovacuum settings in postgresql.conf
autovacuum = on
autovacuum_max_workers = 6 # Default: 3; increase for busy systems
autovacuum_naptime = 30s # How often autovacuum launcher wakes to check tables
autovacuum_vacuum_threshold = 50 # Min dead tuples before triggering VACUUM
autovacuum_vacuum_scale_factor = 0.02 # + 2% of table rows (default 0.2 is too high for large tables)
autovacuum_analyze_threshold = 50
autovacuum_analyze_scale_factor = 0.01 # Analyze after 1% of rows change
autovacuum_vacuum_cost_delay = 2ms # Throttle vacuum I/O; lower = faster vacuum (more I/O)
autovacuum_vacuum_cost_limit = 800 # Max cost per delay cycle (default 200 is conservative)
-- Per-table override for a high-churn orders table
ALTER TABLE orders SET (
autovacuum_vacuum_scale_factor = 0.01,
autovacuum_vacuum_threshold = 100,
autovacuum_analyze_scale_factor = 0.005,
autovacuum_vacuum_cost_delay = 0 -- no throttling for this table
);
-- Check bloat and last vacuum stats
SELECT
schemaname,
relname AS table_name,
n_live_tup,
n_dead_tup,
ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_pct,
last_autovacuum,
last_autoanalyze
FROM pg_stat_user_tables
ORDER BY n_dead_tup DESC
LIMIT 20;
-- Force manual VACUUM ANALYZE on a specific table
VACUUM (VERBOSE, ANALYZE, PARALLEL 4) orders;
Monitor transaction ID wraparound with age(datfrozenxid): alert at roughly 1.5 billion and schedule an emergency VACUUM (or VACUUM FREEZE) before 1.8 billion, since the server stops accepting new transactions as the 2-billion safety limit approaches.
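A quick per-database check against that limit:
SELECT datname,
       age(datfrozenxid) AS xid_age,
       ROUND(100.0 * age(datfrozenxid) / 2000000000, 1) AS pct_of_wraparound_limit
FROM pg_database
ORDER BY age(datfrozenxid) DESC;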
High Availability: Patroni Cluster
Patroni is the de facto standard for automated PostgreSQL HA. It stores cluster state and handles leader election through a DCS (distributed configuration store) such as etcd, Consul, or ZooKeeper. HAProxy or PgBouncer sits in front for connection routing and pooling.
Patroni Configuration (patroni.yml)
# /etc/patroni/patroni.yml
# Node: pg-primary-1 (run equivalent configs on pg-replica-1, pg-replica-2)
scope: postgres-ha-cluster
namespace: /db/
name: pg-primary-1

restapi:
  listen: 0.0.0.0:8008
  connect_address: 10.0.1.10:8008

etcd3:
  hosts:
    - etcd-1:2379
    - etcd-2:2379
    - etcd-3:2379

bootstrap:
  dcs:
    ttl: 30
    loop_wait: 10
    retry_timeout: 10
    maximum_lag_on_failover: 1048576  # 1 MB; replicas lagging more than this won't become primary
    postgresql:
      use_pg_rewind: true
      use_slots: true
      parameters:
        max_connections: 200
        shared_buffers: 8GB
        wal_level: replica
        hot_standby: "on"
        wal_keep_size: 2GB
        max_replication_slots: 10
        max_wal_senders: 10
        synchronous_commit: "on"
  initdb:
    - encoding: UTF8
    - data-checksums  # Enable page checksums to detect silent data corruption
  pg_hba:
    - host replication replicator 10.0.1.0/24 scram-sha-256
    - host all all 0.0.0.0/0 scram-sha-256
  users:
    admin:
      password: "{{ vault_pg_admin_password }}"
      options: [createrole, createdb]
    replicator:
      password: "{{ vault_pg_replicator_password }}"
      options: [replication]

postgresql:
  listen: 0.0.0.0:5432
  connect_address: 10.0.1.10:5432
  data_dir: /var/lib/postgresql/16/main
  bin_dir: /usr/lib/postgresql/16/bin
  pgpass: /tmp/pgpass0
  authentication:
    replication:
      username: replicator
      password: "{{ vault_pg_replicator_password }}"
    superuser:
      username: postgres
      password: "{{ vault_pg_admin_password }}"

watchdog:
  mode: automatic  # Use a watchdog device to prevent split-brain
  device: /dev/watchdog
  safety_margin: 5

tags:
  nofailover: false
  noloadbalance: false
  clonefrom: false
  nosync: false
Streaming Replication Verification
-- On primary: check replication lag
SELECT
application_name,
client_addr,
state,
sent_lsn,
write_lsn,
flush_lsn,
replay_lsn,
(sent_lsn - replay_lsn) AS replication_lag_bytes,
sync_state
FROM pg_stat_replication;
-- On replica: confirm recovery state
SELECT
now() - pg_last_xact_replay_timestamp() AS replication_delay,
pg_is_in_recovery() AS is_standby,
pg_last_wal_receive_lsn() AS receive_lsn,
pg_last_wal_replay_lsn() AS replay_lsn;
Connection Pooling: PgBouncer
PostgreSQL forks a new OS process per connection (~5–10 MB RAM each). At 500 connections, that is 2.5–5 GB of RAM just for idle processes. PgBouncer sits between applications and PostgreSQL, maintaining a small pool of server connections and multiplexing client connections onto them.
# /etc/pgbouncer/pgbouncer.ini
[databases]
# Database alias = connection string to PostgreSQL (Patroni VIP or HAProxy)
app_db = host=10.0.1.20 port=5432 dbname=app_db
app_db_readonly = host=10.0.1.21 port=5432 dbname=app_db
[pgbouncer]
listen_addr = 0.0.0.0
listen_port = 6432
# Pool mode:
# session = hold server connection for duration of client session (safest, least efficient)
# transaction = hold server connection only for duration of transaction (RECOMMENDED for most apps)
# statement = release after each statement (dangerous: breaks multi-statement transactions)
pool_mode = transaction
max_client_conn = 2000 # Total client connections PgBouncer accepts
default_pool_size = 25 # Server connections per database+user pair
min_pool_size = 5
reserve_pool_size = 10 # Extra connections if pool is exhausted
reserve_pool_timeout = 3 # Seconds to wait before using reserve pool
server_idle_timeout = 600 # Idle server connections closed after 10 min
client_idle_timeout = 0 # 0 = never timeout idle clients
server_lifetime = 3600 # Recycle server connections every hour
auth_type = scram-sha-256
auth_file = /etc/pgbouncer/userlist.txt
# Admin interface
admin_users = pgbouncer_admin
stats_users = monitoring
log_connections = 0
log_disconnections = 0
log_pooler_errors = 1
# TLS to PostgreSQL
server_tls_sslmode = require
server_tls_ca_file = /etc/ssl/certs/ca.crt
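PgBouncer exposes an admin console over the normal PostgreSQL protocol, so pool health can be checked with psql by connecting to the virtual pgbouncer database as one of the admin_users defined above (for example, psql -h <pgbouncer-host> -p 6432 -U pgbouncer_admin pgbouncer):
SHOW POOLS;    -- active/waiting clients and server connections per pool
SHOW STATS;    -- per-database query and transaction throughput, average times
SHOW SERVERS;  -- state of every server-side connection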
Backup & Restore
Logical Backup with pg_dump
# Dump a single database in custom format (compressed; supports parallel restore via pg_restore --jobs).
# Parallel dump itself requires --format=directory, so no --jobs here.
pg_dump \
  --host=localhost \
  --port=5432 \
  --username=postgres \
  --format=custom \
  --compress=9 \
  --file=/backups/app_db_$(date +%Y%m%d_%H%M%S).dump \
  app_db
# Restore from custom format dump
pg_restore \
--host=localhost \
--port=5432 \
--username=postgres \
--dbname=app_db_restored \
--jobs=8 \
--verbose \
/backups/app_db_20260101_020000.dump
Physical Backup with pg_basebackup
# Take a streaming base backup (suitable as a PITR base);
# --wal-method=stream includes the WAL needed to make the backup consistent
pg_basebackup \
--host=10.0.1.10 \
--port=5432 \
--username=replicator \
--pgdata=/backups/basebackup_$(date +%Y%m%d) \
--format=tar \
--compress=9 \
--checkpoint=fast \
--wal-method=stream \
--label="nightly_base" \
--progress \
--verbose
PITR with pgBackRest
# pgbackrest.conf
[global]
repo1-path=/var/lib/pgbackrest
repo1-type=s3
repo1-s3-bucket=my-pg-backups
repo1-s3-region=ap-southeast-1
repo1-s3-endpoint=s3.amazonaws.com
repo1-cipher-type=aes-256-cbc
repo1-cipher-pass=
repo1-retention-full=4 # Keep 4 full backups
repo1-retention-diff=14 # Keep 14 differential backups
start-fast=y
archive-async=y # Async WAL archiving for minimal performance impact
archive-push-queue-max=4GiB # Queue WAL locally if S3 is slow
[prod-cluster]
pg1-path=/var/lib/postgresql/16/main
pg1-host=pg-primary-1
pg1-host-user=postgres
# Commands
# Full backup
pgbackrest --stanza=prod-cluster --type=full backup
# Incremental backup
pgbackrest --stanza=prod-cluster --type=incr backup
# Point-in-time restore to specific timestamp
pgbackrest --stanza=prod-cluster \
--type=time \
--target="2026-03-28 14:30:00+07" \
--target-action=promote \
restore
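PITR only works if WAL is actually being archived to the repository, so the primary's archive settings must point at pgBackRest. A minimal sketch via ALTER SYSTEM, assuming the prod-cluster stanza configured above:
-- Run as a superuser on the primary; ALTER SYSTEM writes postgresql.auto.conf
ALTER SYSTEM SET archive_mode = 'on';   -- takes effect only after a restart
ALTER SYSTEM SET archive_command = 'pgbackrest --stanza=prod-cluster archive-push %p';
SELECT pg_reload_conf();                -- archive_command itself is reloadable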
Monitoring Queries
Active Connections and Blocking
-- Current active queries (filter out idle)
SELECT
pid,
now() - pg_stat_activity.query_start AS duration,
query,
state,
wait_event_type,
wait_event,
client_addr
FROM pg_stat_activity
WHERE state != 'idle'
AND query_start < now() - interval '5 seconds'
ORDER BY duration DESC;
-- Detect blocking locks
SELECT
blocked_locks.pid AS blocked_pid,
blocked_activity.query AS blocked_query,
blocking_locks.pid AS blocking_pid,
blocking_activity.query AS blocking_query,
now() - blocked_activity.query_start AS blocked_duration
FROM pg_catalog.pg_locks AS blocked_locks
JOIN pg_catalog.pg_stat_activity AS blocked_activity
ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks AS blocking_locks
ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.pid != blocked_locks.pid
AND blocking_locks.granted
JOIN pg_catalog.pg_stat_activity AS blocking_activity
ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;
Table Statistics and Cache Hit Rate
-- Buffer cache hit rates per table (aim for > 99%)
SELECT
relname,
heap_blks_read,
heap_blks_hit,
ROUND(100.0 * heap_blks_hit / NULLIF(heap_blks_hit + heap_blks_read, 0), 2) AS cache_hit_pct,
idx_blks_read,
idx_blks_hit,
ROUND(100.0 * idx_blks_hit / NULLIF(idx_blks_hit + idx_blks_read, 0), 2) AS idx_cache_hit_pct
FROM pg_statio_user_tables
ORDER BY heap_blks_read + idx_blks_read DESC
LIMIT 20;
-- Unused indexes (candidates for removal)
SELECT
schemaname,
relname AS table_name,
indexrelname AS index_name,
idx_scan,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND indexrelname NOT LIKE '%_pkey'
ORDER BY pg_relation_size(indexrelid) DESC;
-- Slow query detection via pg_stat_statements
SELECT
query,
calls,
ROUND(total_exec_time::numeric / calls, 2) AS avg_ms,
ROUND(total_exec_time::numeric, 2) AS total_ms,
rows,
shared_blks_hit,
shared_blks_read
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 15;
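The last query relies on the pg_stat_statements extension, which is not enabled by default: its library must be preloaded at server start and the extension created once in each database you want to inspect. A sketch (note that ALTER SYSTEM overwrites the entire shared_preload_libraries list, so append to any existing value rather than clobbering it):
ALTER SYSTEM SET shared_preload_libraries = 'pg_stat_statements';  -- restart required
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;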
PostgreSQL on Kubernetes: CloudNativePG
CloudNativePG (CNPG) is a CNCF sandbox operator that manages the full lifecycle of PostgreSQL clusters on Kubernetes — provisioning, failover, backup, monitoring, and upgrades. It uses PostgreSQL streaming replication natively, without Patroni or etcd.
# cloudnativepg-cluster.yaml
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: pg-production
  namespace: databases
spec:
  instances: 3                                # 1 primary + 2 replicas
  imageName: ghcr.io/cloudnative-pg/postgresql:16.3

  # Resource limits
  resources:
    requests:
      memory: "8Gi"
      cpu: "2"
    limits:
      memory: "16Gi"
      cpu: "8"

  # Persistent storage
  storage:
    size: 500Gi
    storageClass: fast-ssd                    # Use a StorageClass with WaitForFirstConsumer binding

  # WAL storage on a separate volume for I/O isolation
  walStorage:
    size: 50Gi
    storageClass: fast-ssd

  # PostgreSQL tuning parameters
  postgresql:
    parameters:
      shared_buffers: "2GB"
      work_mem: "32MB"
      max_connections: "200"
      wal_level: "logical"
      max_replication_slots: "10"
      max_wal_senders: "10"
      effective_cache_size: "6GB"
      checkpoint_completion_target: "0.9"
      random_page_cost: "1.1"
      log_min_duration_statement: "1000"
    pg_hba:
      - host all all 10.0.0.0/8 scram-sha-256

  # Backup to S3-compatible object storage
  backup:
    barmanObjectStore:
      destinationPath: "s3://my-pg-backups/cnpg/pg-production"
      s3Credentials:
        accessKeyId:
          name: pg-backup-s3-secret
          key: ACCESS_KEY_ID
        secretAccessKey:
          name: pg-backup-s3-secret
          key: SECRET_ACCESS_KEY
      wal:
        compression: gzip
        encryption: AES256
      data:
        compression: gzip
        encryption: AES256
    retentionPolicy: "30d"

  # Scheduled backups are defined with the separate ScheduledBackup resources below

  # Monitoring: expose metrics for Prometheus
  monitoring:
    enablePodMonitor: true

  # Bootstrap: init fresh or recover from backup
  bootstrap:
    initdb:
      database: app_db
      owner: app_user
      encoding: UTF8

  # Superuser secret
  superuserSecret:
    name: pg-superuser-secret
# scheduled-backup.yaml — daily run at 02:00 plus a weekly full on Sundays
apiVersion: postgresql.cnpg.io/v1
kind: ScheduledBackup
metadata:
  name: pg-production-daily
  namespace: databases
spec:
  schedule: "0 0 2 * * *"          # 02:00 daily (CNPG uses six-field cron: seconds first)
  backupOwnerReference: self
  cluster:
    name: pg-production
---
apiVersion: postgresql.cnpg.io/v1
kind: ScheduledBackup
metadata:
  name: pg-production-weekly-full
  namespace: databases
spec:
  schedule: "0 0 1 * * 0"          # Every Sunday at 01:00
  cluster:
    name: pg-production
  method: barmanObjectStore
  immediate: false