Writing Schemas¶
Learn how to write effective ClickHouse schemas using Housekeeper's features and best practices.
Schema File Structure¶
Basic Schema File¶
A basic schema file contains DDL statements for your ClickHouse objects:
-- Basic schema example
-- File: db/main.sql
-- Create the database
CREATE DATABASE ecommerce ENGINE = Atomic COMMENT 'E-commerce platform database';
-- Create core tables
CREATE TABLE ecommerce.users (
id UInt64,
email String,
name String,
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(updated_at) ORDER BY id;
CREATE TABLE ecommerce.products (
id UInt64,
name String,
category_id UInt32,
price Decimal64(2),
created_at DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(created_at) ORDER BY id;
Modular Schema Organization¶
For larger projects, split schemas into logical modules:
-- File: db/main.sql
-- Main entrypoint that imports other schema files
-- Global objects first (roles, users, settings profiles)
-- housekeeper:import schemas/_global/roles/main.sql
-- Database definitions
-- housekeeper:import schemas/databases/ecommerce.sql
-- Core tables
-- housekeeper:import schemas/tables/users.sql
-- housekeeper:import schemas/tables/products.sql
-- housekeeper:import schemas/tables/orders.sql
-- Reference data
-- housekeeper:import schemas/dictionaries/countries.sql
-- housekeeper:import schemas/dictionaries/categories.sql
-- Analytics views
-- housekeeper:import schemas/views/sales_summary.sql
-- housekeeper:import schemas/views/user_analytics.sql
Import Order and Dependencies¶
The order of imports matters for proper migration generation. Housekeeper processes schema objects in dependency order:
- Global Objects (roles, users, settings profiles) - Available cluster-wide
- Databases - Container for other objects
- Tables - Data storage structures
- Dictionaries - External data lookups
- Views - Query abstractions that may depend on tables/dictionaries
-- File: db/main.sql - Proper import ordering
-- ✅ Correct order
-- 1. Global objects first - processed before anything else
-- housekeeper:import schemas/_global/roles/main.sql
-- 2. Database definitions
-- housekeeper:import schemas/databases/main.sql
-- 3. Tables and core data structures
-- housekeeper:import schemas/tables/main.sql
-- 4. Dictionaries that may reference tables
-- housekeeper:import schemas/dictionaries/main.sql
-- 5. Views that query tables and dictionaries
-- housekeeper:import schemas/views/main.sql
Why Global Objects Come First:
Global objects like roles are cluster-wide and may be referenced by other objects:
-- Role must exist before being granted to table operations
CREATE ROLE IF NOT EXISTS data_writer;
-- Later in the schema, tables may reference the role in grants
CREATE TABLE analytics.events (...) ENGINE = MergeTree() ORDER BY timestamp;
GRANT INSERT ON analytics.events TO data_writer;
See the Role Management guide for comprehensive role management patterns.
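Other global objects follow the same pattern. A minimal sketch of a settings profile attached to the role above (the profile name and memory limit are illustrative assumptions, not part of the examples above):
-- Profile must exist before being assigned TO a role
CREATE SETTINGS PROFILE IF NOT EXISTS writer_profile
SETTINGS max_memory_usage = 10000000000
TO data_writer;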
Database Design¶
Database Creation¶
-- Basic database
CREATE DATABASE analytics ENGINE = Atomic;
-- Database with cluster support
CREATE DATABASE analytics ON CLUSTER my_cluster ENGINE = Atomic;
-- Database with comment
CREATE DATABASE analytics
ENGINE = Atomic
COMMENT 'Analytics and reporting database';
-- Database with external engine
CREATE DATABASE mysql_replica
ENGINE = MaterializedMySQL('mysql-server:3306', 'source_db', 'user', 'password')
SETTINGS allows_query_when_mysql_lost = 1;
Database Engines¶
Choose appropriate database engines based on your needs:
-- Atomic (default) - Transactional database
CREATE DATABASE prod_data ENGINE = Atomic;
-- MySQL integration
CREATE DATABASE mysql_data
ENGINE = MySQL('mysql-host:3306', 'database', 'user', 'password');
-- PostgreSQL integration
CREATE DATABASE postgres_data
ENGINE = PostgreSQL('postgres-host:5432', 'database', 'user', 'password');
Table Design¶
Table Engines¶
MergeTree Family (Most Common)¶
-- Basic MergeTree for time-series data
CREATE TABLE analytics.events (
timestamp DateTime,
user_id UInt64,
event_type String,
properties Map(String, String)
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(timestamp)
ORDER BY (timestamp, user_id);
-- ReplacingMergeTree for mutable data
CREATE TABLE ecommerce.users (
id UInt64,
email String,
name String,
updated_at DateTime
)
ENGINE = ReplacingMergeTree(updated_at) -- Version column
ORDER BY id;
-- SummingMergeTree for pre-aggregated data
CREATE TABLE analytics.daily_stats (
date Date,
metric String,
value UInt64
)
ENGINE = SummingMergeTree(value) -- Sum column
ORDER BY (date, metric);
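For aggregates beyond sums (unique counts, quantiles), AggregatingMergeTree stores partial aggregate states instead of raw values. A minimal sketch, assuming the analytics.events table from above (the daily_uniques table name is illustrative):
-- AggregatingMergeTree for non-summable aggregates
CREATE TABLE analytics.daily_uniques (
date Date,
metric String,
users AggregateFunction(uniq, UInt64)
)
ENGINE = AggregatingMergeTree()
ORDER BY (date, metric);
-- Write partial states with the -State combinator...
INSERT INTO analytics.daily_uniques
SELECT toDate(timestamp) AS date, event_type AS metric, uniqState(user_id)
FROM analytics.events
GROUP BY date, metric;
-- ...and read them back with -Merge
SELECT date, metric, uniqMerge(users) AS unique_users
FROM analytics.daily_uniques
GROUP BY date, metric;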
Integration Engines¶
-- Kafka integration
CREATE TABLE streaming.kafka_events (
timestamp DateTime,
user_id UInt64,
event_data String
)
ENGINE = Kafka()
SETTINGS
kafka_broker_list = 'localhost:9092',
kafka_topic_list = 'events',
kafka_group_name = 'clickhouse_consumer',
kafka_format = 'JSONEachRow';
-- MySQL integration
CREATE TABLE external.mysql_users (
id UInt64,
name String,
email String
)
ENGINE = MySQL('mysql-host:3306', 'database', 'users', 'user', 'password');
Column Definitions¶
Data Types¶
CREATE TABLE comprehensive_example (
-- Numeric types
id UInt64,
age UInt8,
balance Decimal64(2),
score Float32,
-- String types
name String,
status LowCardinality(String), -- For columns with few distinct values
country_code FixedString(2), -- Fixed length
-- Date/time types
created_at DateTime,
event_time DateTime64(3), -- Millisecond precision
birth_date Date,
-- Boolean and UUID
is_active Bool,
session_id UUID,
-- Complex types
tags Array(String),
metadata Map(String, String),
coordinates Tuple(Float64, Float64),
-- Nullable types
phone Nullable(String),
last_login Nullable(DateTime)
) ENGINE = MergeTree() ORDER BY id;
Column Attributes¶
CREATE TABLE advanced_columns (
id UInt64,
-- Default values
created_at DateTime DEFAULT now(),
status String DEFAULT 'pending',
-- Materialized columns (computed on insert)
date Date MATERIALIZED toDate(created_at),
month_year String MATERIALIZED formatDateTime(created_at, '%Y-%m'),
-- Alias columns (computed on read)
age_years UInt8 ALIAS dateDiff('year', birth_date, today()),
-- Compression
large_text String CODEC(ZSTD(3)),
metrics Array(Float64) CODEC(Delta, LZ4),
-- Comments
user_id UInt64 COMMENT 'Reference to users table',
-- TTL for specific columns
sensitive_data Nullable(String) TTL created_at + INTERVAL 30 DAY,
birth_date Date
) ENGINE = MergeTree() ORDER BY id;
Table Properties¶
Partitioning¶
-- Time-based partitioning
CREATE TABLE time_series (
timestamp DateTime,
value Float64
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(timestamp) -- Monthly partitions
ORDER BY timestamp;
-- Multi-dimensional partitioning
CREATE TABLE events (
timestamp DateTime,
event_type LowCardinality(String),
user_id UInt64
)
ENGINE = MergeTree()
PARTITION BY (toYYYYMM(timestamp), event_type) -- By month and type
ORDER BY (timestamp, user_id);
-- Custom partitioning expression
CREATE TABLE user_data (
user_id UInt64,
data String,
created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY intDiv(user_id, 1000000) -- Partition by user ID ranges
ORDER BY user_id;
Ordering and Primary Keys¶
-- Basic ordering
CREATE TABLE simple_table (
timestamp DateTime,
user_id UInt64,
event_type String
)
ENGINE = MergeTree()
ORDER BY timestamp;
-- Compound ordering
CREATE TABLE compound_order (
user_id UInt64,
timestamp DateTime,
session_id String
)
ENGINE = MergeTree()
ORDER BY (user_id, timestamp, session_id);
-- Explicit primary key (subset of ORDER BY)
CREATE TABLE optimized_table (
user_id UInt64,
timestamp DateTime,
event_type String,
data String
)
ENGINE = MergeTree()
ORDER BY (user_id, timestamp, event_type)
PRIMARY KEY (user_id, timestamp); -- Must be a prefix of ORDER BY; keeps the in-memory index smaller
TTL Policies¶
-- Table-level TTL
CREATE TABLE temporary_data (
timestamp DateTime,
data String
)
ENGINE = MergeTree()
ORDER BY timestamp
TTL timestamp + INTERVAL 7 DAY; -- Delete after 7 days
-- Multi-level TTL
CREATE TABLE tiered_storage (
timestamp DateTime,
data String
)
ENGINE = MergeTree()
ORDER BY timestamp
TTL timestamp + INTERVAL 30 DAY TO DISK 'cold', -- Move to cold storage
timestamp + INTERVAL 365 DAY DELETE; -- Delete after 1 year
-- Column-specific TTL
CREATE TABLE privacy_aware (
timestamp DateTime,
user_id UInt64,
personal_data Nullable(String) TTL timestamp + INTERVAL 30 DAY,
analytics_data String
)
ENGINE = MergeTree()
ORDER BY timestamp
TTL timestamp + INTERVAL 2 YEAR; -- Keep analytics longer
CREATE TABLE AS - Schema Copying¶
The CREATE TABLE AS syntax allows you to create a new table by copying the schema from an existing table. This is particularly useful for:
- Creating distributed tables that mirror local tables
- Creating backup tables with identical structure
- Quickly duplicating table schemas with different engines
Basic Syntax¶
-- Copy schema from existing table
CREATE TABLE copy AS source ENGINE = MergeTree() ORDER BY id;
-- With database qualifiers
CREATE TABLE db1.table_copy AS db2.source_table ENGINE = Memory;
-- With CREATE OR REPLACE (OR REPLACE cannot be combined with IF NOT EXISTS)
CREATE OR REPLACE TABLE backup_users AS users
ENGINE = MergeTree()
ORDER BY user_id;
Distributed Table Pattern¶
The most common use case is creating distributed tables that mirror local table schemas:
-- Define local table
CREATE TABLE events_local (
id UInt64,
timestamp DateTime,
event_type LowCardinality(String),
user_id UInt64,
data Map(String, String)
) ENGINE = MergeTree()
ORDER BY (timestamp, user_id)
PARTITION BY toYYYYMM(timestamp);
-- Create distributed table with same schema
CREATE TABLE events_all ON CLUSTER production AS events_local
ENGINE = Distributed(production, currentDatabase(), events_local, rand());
-- Create distributed table with specific sharding
CREATE TABLE events_distributed ON CLUSTER analytics AS events_local
ENGINE = Distributed(analytics, currentDatabase(), events_local, cityHash64(user_id));
Backup Table Pattern¶
Create backup tables with identical structure but different engines or settings:
-- Original table
CREATE TABLE users (
id UInt64,
name String,
email String,
created_at DateTime DEFAULT now()
) ENGINE = MergeTree() ORDER BY id;
-- Create backup with same schema
CREATE TABLE users_backup AS users
ENGINE = MergeTree()
ORDER BY id
SETTINGS index_granularity = 1024; -- Different settings
-- Create memory-based staging table
CREATE TABLE users_staging AS users ENGINE = Memory;
Migration Behavior¶
When using CREATE TABLE AS with Housekeeper, understand these key behaviors:
- Schema Resolution: The AS reference is resolved at schema processing time, copying all column definitions from the source table.
- Independence: Once created, AS tables are independent entities. They can be dropped, renamed, or modified without affecting the source.
- Column Propagation: When the source table gets column changes (ADD/DROP/MODIFY), Housekeeper automatically propagates them to AS dependents:
  - MergeTree family: uses ALTER TABLE to preserve data
  - Distributed/Memory engines: uses DROP+CREATE (safe, as they store no local data)
- Structural Operations Don't Propagate: DROP, DETACH, ATTACH, or RENAME of the source table has NO effect on AS dependents.
Example migration scenario:
-- Current schema
CREATE TABLE metrics_local (
timestamp DateTime,
value Float64
) ENGINE = MergeTree() ORDER BY timestamp;
CREATE TABLE metrics_all AS metrics_local
ENGINE = Distributed(cluster, currentDatabase(), metrics_local, rand());
-- When metrics_local gets new column, migration generates:
-- 1. ALTER TABLE metrics_local ADD COLUMN metric_name String;
-- 2. DROP TABLE metrics_all;
-- 3. CREATE TABLE metrics_all (...with new column...) ENGINE = Distributed(...);
Best Practices¶
- Use CREATE TABLE AS for distributed tables that should mirror local table schemas
- Use it for creating backup or staging tables with the same structure
- Remember that AS tables are independent after creation - the source can be dropped
- Place AS table definitions after their source tables in schema files (see the sketch below)
- Consider using AS for tables that should maintain schema consistency with a source
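For example, keep a source table and its AS dependent in the same file, source first (the file path, cluster, and table names here are illustrative):
-- File: schemas/tables/page_views.sql
CREATE TABLE page_views_local (
timestamp DateTime,
url String
) ENGINE = MergeTree() ORDER BY timestamp;
-- AS dependent defined after its source
CREATE TABLE page_views_all AS page_views_local
ENGINE = Distributed(my_cluster, currentDatabase(), page_views_local, rand());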
Dictionary Design¶
Basic Dictionaries¶
-- Simple lookup dictionary
CREATE DICTIONARY reference.countries (
code String,
name String,
continent String
)
PRIMARY KEY code
SOURCE(FILE(path '/data/countries.csv' format 'CSVWithNames'))
LAYOUT(HASHED())
LIFETIME(86400); -- Reload daily
Advanced Dictionary Features¶
-- Complex dictionary with multiple attributes
CREATE DICTIONARY analytics.user_segments (
user_id UInt64 IS_OBJECT_ID, -- Object identifier
parent_user_id UInt64 DEFAULT 0 HIERARCHICAL, -- Hierarchy support
segment String INJECTIVE, -- One-to-one mapping
score Float32,
created_at DateTime,
-- Computed attributes
segment_level UInt8 EXPRESSION
CASE segment
WHEN 'premium' THEN 3
WHEN 'standard' THEN 2
ELSE 1
END
)
PRIMARY KEY user_id, parent_user_id
SOURCE(CLICKHOUSE(
host 'localhost' port 9000
user 'default' password ''
db 'ml_models'
query 'SELECT user_id, parent_user_id, segment, score, created_at FROM user_segments WHERE {condition}'
update_field 'updated_at' -- Incremental reloads; ClickHouse substitutes {condition}
))
LAYOUT(COMPLEX_KEY_HASHED()) -- Composite keys require a COMPLEX_KEY_* layout
LIFETIME(MIN 300 MAX 600) -- Random refresh between 5-10 minutes
SETTINGS(max_threads = 4)
COMMENT 'ML-generated user segmentation data';
Dictionary Sources¶
-- HTTP source with authentication
CREATE DICTIONARY external.api_data (
id UInt64,
value String
)
PRIMARY KEY id
SOURCE(HTTP(
url 'https://api.example.com/data'
format 'JSONEachRow'
headers(
header(name 'Authorization' value 'Bearer YOUR_TOKEN')
header(name 'Content-Type' value 'application/json')
)
))
LAYOUT(HASHED())
LIFETIME(3600);
-- MySQL source
CREATE DICTIONARY external.mysql_lookup (
id UInt64,
name String
)
PRIMARY KEY id
SOURCE(MYSQL(
host 'mysql-server' port 3306
user 'readonly' password 'secret'
db 'reference' table 'lookup_data'
update_field 'updated_at' -- Incremental updates
))
LAYOUT(FLAT()) -- Fastest layout; requires dense UInt64 keys
LIFETIME(600);
-- File source
CREATE DICTIONARY reference.static_data (
code String,
description String
)
PRIMARY KEY code
SOURCE(FILE(path '/opt/data/reference.tsv' format 'TabSeparated'))
LAYOUT(HASHED())
LIFETIME(0); -- Never reload (static data)
View Design¶
Simple Views¶
-- Basic view for common queries
CREATE VIEW analytics.active_users AS
SELECT
user_id,
max(timestamp) as last_activity,
count() as event_count
FROM analytics.events
WHERE timestamp >= now() - INTERVAL 30 DAY
GROUP BY user_id
HAVING event_count > 10;
-- View with complex joins
CREATE VIEW ecommerce.order_summary AS
SELECT
o.id as order_id,
o.user_id,
u.email,
o.total_amount,
o.created_at,
groupArray(p.name) as product_names
FROM ecommerce.orders o
JOIN ecommerce.users u ON o.user_id = u.id
JOIN ecommerce.order_items oi ON o.id = oi.order_id
JOIN ecommerce.products p ON oi.product_id = p.id
GROUP BY o.id, o.user_id, u.email, o.total_amount, o.created_at;
Materialized Views¶
-- Real-time aggregation
CREATE MATERIALIZED VIEW analytics.hourly_stats
ENGINE = SummingMergeTree((event_count, unique_users)) -- Note: summing uniq() values only approximates true uniques
ORDER BY (date, hour, event_type)
POPULATE -- Backfill existing data (rows inserted during population may be missed)
AS SELECT
toDate(timestamp) as date,
toHour(timestamp) as hour,
event_type,
count() as event_count,
uniq(user_id) as unique_users
FROM analytics.events
GROUP BY date, hour, event_type;
-- Materialized view with target table
CREATE TABLE analytics.user_stats_target (
date Date,
user_id UInt64,
event_count UInt32,
last_activity DateTime
) ENGINE = ReplacingMergeTree(last_activity) ORDER BY (date, user_id);
CREATE MATERIALIZED VIEW analytics.user_stats_mv
TO analytics.user_stats_target
AS SELECT
toDate(timestamp) as date,
user_id,
count() as event_count,
max(timestamp) as last_activity
FROM analytics.events
GROUP BY date, user_id;
Advanced Patterns¶
Polymorphic Tables¶
-- Single table for multiple entity types
CREATE TABLE analytics.entity_events (
timestamp DateTime,
entity_type LowCardinality(String), -- 'user', 'product', 'order'
entity_id UInt64,
event_type LowCardinality(String),
properties Map(String, String)
)
ENGINE = MergeTree()
PARTITION BY (toYYYYMM(timestamp), entity_type)
ORDER BY (timestamp, entity_type, entity_id);
Event Sourcing Pattern¶
-- Event store table
CREATE TABLE event_store.events (
event_id UUID DEFAULT generateUUIDv4(),
aggregate_id UInt64,
aggregate_type LowCardinality(String),
event_type LowCardinality(String),
event_version UInt32,
event_data String, -- JSON payload
metadata Map(String, String),
timestamp DateTime DEFAULT now()
)
ENGINE = MergeTree()
PARTITION BY (toYYYYMM(timestamp), aggregate_type)
ORDER BY (aggregate_id, event_version, timestamp);
-- Materialized view for current state
CREATE MATERIALIZED VIEW event_store.user_current_state
ENGINE = ReplacingMergeTree(event_version)
ORDER BY aggregate_id
AS SELECT
aggregate_id as user_id,
event_version,
JSONExtractString(event_data, 'name') as name,
JSONExtractString(event_data, 'email') as email,
JSONExtractString(event_data, 'status') as status,
timestamp as updated_at
FROM event_store.events
WHERE aggregate_type = 'user';
Time Series with Retention¶
-- Multi-resolution time series
CREATE TABLE metrics.raw_metrics (
timestamp DateTime,
metric_name LowCardinality(String),
value Float64,
tags Map(String, String)
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(timestamp)
ORDER BY (metric_name, timestamp)
TTL timestamp + INTERVAL 7 DAY; -- Keep raw data for 7 days
-- 1-minute aggregates
CREATE MATERIALIZED VIEW metrics.minute_aggregates
ENGINE = MergeTree()
ORDER BY (metric_name, timestamp)
TTL timestamp + INTERVAL 30 DAY -- Keep for 30 days
AS SELECT
toStartOfMinute(timestamp) as timestamp,
metric_name,
avg(value) as avg_value,
min(value) as min_value,
max(value) as max_value,
count() as sample_count,
tags
FROM metrics.raw_metrics
GROUP BY timestamp, metric_name, tags;
-- 1-hour aggregates
CREATE MATERIALIZED VIEW metrics.hour_aggregates
ENGINE = MergeTree()
ORDER BY (metric_name, timestamp)
TTL timestamp + INTERVAL 365 DAY -- Keep for 1 year
AS SELECT
toStartOfHour(timestamp) as timestamp,
metric_name,
avg(avg_value) as avg_value,
min(min_value) as min_value,
max(max_value) as max_value,
sum(sample_count) as sample_count,
tags
FROM metrics.minute_aggregates
GROUP BY timestamp, metric_name, tags;
Best Practices¶
Naming Conventions¶
-- Use clear, descriptive names
CREATE DATABASE user_analytics; -- Not: db1, analytics_db
CREATE TABLE user_analytics.page_views (...); -- Not: pv, page_view_table
-- Use consistent prefixes
CREATE MATERIALIZED VIEW mv_daily_stats (...); -- Prefix: mv_
CREATE DICTIONARY dict_countries (...); -- Prefix: dict_
Documentation¶
-- Use comments extensively
CREATE DATABASE ecommerce
ENGINE = Atomic
COMMENT 'E-commerce platform database - contains users, products, and orders';
CREATE TABLE ecommerce.users (
id UInt64 COMMENT 'Unique user identifier',
email String COMMENT 'User email address (unique)',
name String COMMENT 'Full user name',
created_at DateTime DEFAULT now() COMMENT 'Account creation timestamp',
status LowCardinality(String) DEFAULT 'active' COMMENT 'User status: active, inactive, suspended'
)
ENGINE = ReplacingMergeTree(created_at)
ORDER BY id
COMMENT 'User profiles and account information';
Performance Considerations¶
-- Optimize for your query patterns
CREATE TABLE analytics.events (
timestamp DateTime,
user_id UInt64,
event_type LowCardinality(String),
session_id String
)
ENGINE = MergeTree()
-- Order by most selective columns first for your queries
ORDER BY (user_id, timestamp, event_type) -- If querying by user_id most often
-- ORDER BY (timestamp, event_type, user_id) -- If querying by time ranges most often
SETTINGS index_granularity = 8192;
Next Steps¶
- Migration Process - Learn how to apply schema changes
- Schema Management - Advanced schema design patterns
- Configuration - Configure Housekeeper for your environment
- Examples - See complete schema examples