-- =============================================================================
-- ClickHouse storage/query optimization script for the logs_ingest and
-- dns_logs_ingest tables.
--
-- Notes:
--   - Every ALTER below is an online operation; no service downtime is needed.
--   - Run the stages in order and inspect system.parts after each stage to
--     confirm it took effect.
--   - Compression codec changes only apply to newly written parts; existing
--     parts pick them up when they are merged or after a manual OPTIMIZE.
--   - Changing a column's type (the LowCardinality conversions below) is
--     different: it rewrites the column data of existing parts and can take a
--     long time on large tables.
--
-- Usage:
--   clickhouse-client --host 127.0.0.1 --port 9000 --user default --password 'xxx' < optimize_schema.sql
--   (older clickhouse-client releases may additionally need --multiquery to
--   accept several statements from stdin)
-- =============================================================================


-- =============================================
-- Stage 1: column compression (largest expected win)
-- =============================================

-- Large text columns: ZSTD(3) compresses JSON / HTTP text much better than the
-- default LZ4. The original author's estimate is a 40%-60% disk reduction.
-- Only the codec is specified (no type) so these statements can never convert
-- the column to a different type by accident.
ALTER TABLE logs_ingest MODIFY COLUMN request_headers  CODEC(ZSTD(3));
ALTER TABLE logs_ingest MODIFY COLUMN request_body     CODEC(ZSTD(3));
ALTER TABLE logs_ingest MODIFY COLUMN response_headers CODEC(ZSTD(3));
ALTER TABLE logs_ingest MODIFY COLUMN response_body    CODEC(ZSTD(3));

-- Medium-length text columns: ZSTD(1) trades a little ratio for less CPU.
ALTER TABLE logs_ingest MODIFY COLUMN ua      CODEC(ZSTD(1));
ALTER TABLE logs_ingest MODIFY COLUMN path    CODEC(ZSTD(1));
ALTER TABLE logs_ingest MODIFY COLUMN referer CODEC(ZSTD(1));

-- Low-cardinality columns: LowCardinality reduces both memory and disk usage.
-- method has very few distinct values (GET/POST/PUT/DELETE ...); the
-- cardinality of host depends on the number of sites served.
-- These are type changes, so existing data for these columns is rewritten.
ALTER TABLE logs_ingest MODIFY COLUMN method   LowCardinality(String);
ALTER TABLE logs_ingest MODIFY COLUMN log_type LowCardinality(String);
ALTER TABLE logs_ingest MODIFY COLUMN host     LowCardinality(String);

-- Numeric columns: Delta + ZSTD, relying on neighbouring rows having similar
-- values. NOTE(review): Delta only pays off when adjacent values are close;
-- confirm the gain per column with the system.columns query at the bottom.
ALTER TABLE logs_ingest MODIFY COLUMN bytes_in  CODEC(Delta, ZSTD(1));
ALTER TABLE logs_ingest MODIFY COLUMN bytes_out CODEC(Delta, ZSTD(1));
ALTER TABLE logs_ingest MODIFY COLUMN cost_ms   CODEC(Delta, ZSTD(1));


-- =============================================
-- Stage 2: data skipping indexes (speed up frequent filters)
-- =============================================

-- trace_id exact lookup (log detail view, FindByTraceId).
-- bloom_filter(0.01) = 1% false-positive rate; GRANULARITY 4 = one bloom
-- block per 4 granules.
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_trace_id trace_id TYPE bloom_filter(0.01) GRANULARITY 4;

-- ip exact lookup.
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_ip ip TYPE bloom_filter(0.01) GRANULARITY 4;

-- host substring search (LIKE '%xxx%').
-- ngrambf_v1 is used instead of tokenbf_v1: a token filter only stores whole
-- tokens delimited by non-alphanumeric characters, so a fragment sitting
-- directly next to a % wildcard cannot be matched against it. An n-gram
-- filter can serve any substring of at least n characters.
-- Arguments: (ngram size, bloom filter size in bytes, hash functions, seed).
-- Patterns shorter than 3 characters cannot use this index.
-- If idx_host already exists with another type, IF NOT EXISTS leaves it
-- untouched; drop it first to switch types.
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_host host TYPE ngrambf_v1(3, 10240, 3, 0) GRANULARITY 4;

-- firewall_policy_id filter (HasFirewallPolicy: firewall_policy_id > 0).
-- NOTE(review): minmax can only skip a block when no row in it matches, so
-- the benefit depends on how the values cluster on disk; verify with EXPLAIN.
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_fw_policy firewall_policy_id TYPE minmax GRANULARITY 4;

-- status range filter (HasError: status >= 400). Same minmax caveat as above.
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_status status TYPE minmax GRANULARITY 4;


-- =============================================
-- Stage 3: build the indexes for existing data
-- =============================================
-- MATERIALIZE INDEX starts a background mutation; large tables can take a
-- while. Monitor with:
--   SELECT * FROM system.mutations WHERE is_done = 0
ALTER TABLE logs_ingest MATERIALIZE INDEX idx_trace_id;
ALTER TABLE logs_ingest MATERIALIZE INDEX idx_ip;
ALTER TABLE logs_ingest MATERIALIZE INDEX idx_host;
ALTER TABLE logs_ingest MATERIALIZE INDEX idx_fw_policy;
ALTER TABLE logs_ingest MATERIALIZE INDEX idx_status;


-- =============================================================================
-- dns_logs_ingest (DNS log table)
-- =============================================================================

-- Large text columns (codec only, type left unchanged).
ALTER TABLE dns_logs_ingest MODIFY COLUMN content_json CODEC(ZSTD(3));
ALTER TABLE dns_logs_ingest MODIFY COLUMN error        CODEC(ZSTD(1));

-- Low-cardinality columns (type change: existing data is rewritten).
ALTER TABLE dns_logs_ingest MODIFY COLUMN question_type LowCardinality(String);
ALTER TABLE dns_logs_ingest MODIFY COLUMN record_type   LowCardinality(String);
ALTER TABLE dns_logs_ingest MODIFY COLUMN networking    LowCardinality(String);

-- request_id exact lookup.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_request_id request_id TYPE bloom_filter(0.01) GRANULARITY 4;

-- remote_addr exact lookup.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_remote_addr remote_addr TYPE bloom_filter(0.01) GRANULARITY 4;

-- question_name substring search; ngrambf_v1 for the same reason as idx_host.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_question_name question_name TYPE ngrambf_v1(3, 10240, 3, 0) GRANULARITY 4;

-- domain_id filter.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_domain_id domain_id TYPE minmax GRANULARITY 4;

-- Build the indexes for existing data (background mutations).
ALTER TABLE dns_logs_ingest MATERIALIZE INDEX idx_request_id;
ALTER TABLE dns_logs_ingest MATERIALIZE INDEX idx_remote_addr;
ALTER TABLE dns_logs_ingest MATERIALIZE INDEX idx_question_name;
ALTER TABLE dns_logs_ingest MATERIALIZE INDEX idx_domain_id;


-- =============================================================================
-- Verification (run manually after the ALTERs above)
-- =============================================================================

-- Compression codec per column:
--   SELECT name, type, compression_codec
--   FROM system.columns
--   WHERE table = 'logs_ingest' AND database = currentDatabase();

-- Compression ratio per table:
--   SELECT
--       table,
--       formatReadableSize(sum(data_compressed_bytes))   AS compressed,
--       formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed,
--       round(sum(data_uncompressed_bytes) / sum(data_compressed_bytes), 2) AS ratio
--   FROM system.columns
--   WHERE table IN ('logs_ingest', 'dns_logs_ingest') AND database = currentDatabase()
--   GROUP BY table;

-- Disk usage per column, largest first:
--   SELECT
--       name,
--       formatReadableSize(sum(data_compressed_bytes))   AS compressed,
--       formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed
--   FROM system.columns
--   WHERE table = 'logs_ingest' AND database = currentDatabase()
--   GROUP BY name
--   ORDER BY sum(data_compressed_bytes) DESC;

-- Mutation progress:
--   SELECT database, table, mutation_id, command, is_done, parts_to_do
--   FROM system.mutations
--   WHERE is_done = 0;

-- Optional: force a merge so the new codecs are applied to existing data.
-- OPTIMIZE ... FINAL rewrites the parts it merges and is I/O heavy on large
-- tables; run it off-peak.
--   OPTIMIZE TABLE logs_ingest FINAL;
--   OPTIMIZE TABLE dns_logs_ingest FINAL;