Files
waf-platform/deploy/clickhouse/optimize_schema.sql
2026-02-13 22:36:17 +08:00

124 lines
6.4 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

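-- =============================================================================
-- ClickHouse optimization script for the logs_ingest and dns_logs_ingest tables
--
-- Notes:
-- - All ALTER operations are online; no service downtime is required.
-- - Run the stages in order; after each stage, check system.parts to confirm
--   that the change has taken effect.
-- - Compression codec changes only affect newly written parts; existing data
--   keeps its old codec until the parts are merged or a manual OPTIMIZE is run.
--
-- How to run (older clickhouse-client versions may additionally need --multiquery):
-- clickhouse-client --host 127.0.0.1 --port 9000 --user default --password 'xxx' < optimize_schema.sql
-- =============================================================================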
-- =============================================
-- Stage 1: compression of large columns (expected to give the biggest gain)
-- =============================================

-- Large text columns: switch to ZSTD(3). For JSON / HTTP text it is expected to
-- compress far better than the default LZ4.
-- Expected effect: 40%-60% less disk usage.
ALTER TABLE logs_ingest
    MODIFY COLUMN request_headers String CODEC(ZSTD(3));
ALTER TABLE logs_ingest
    MODIFY COLUMN request_body String CODEC(ZSTD(3));
ALTER TABLE logs_ingest
    MODIFY COLUMN response_headers String CODEC(ZSTD(3));
ALTER TABLE logs_ingest
    MODIFY COLUMN response_body String CODEC(ZSTD(3));

-- Medium-length text columns: ZSTD(1) as a balance between compression ratio
-- and CPU cost.
ALTER TABLE logs_ingest
    MODIFY COLUMN ua String CODEC(ZSTD(1));
ALTER TABLE logs_ingest
    MODIFY COLUMN path String CODEC(ZSTD(1));
ALTER TABLE logs_ingest
    MODIFY COLUMN referer String CODEC(ZSTD(1));

-- Low-cardinality columns: switch to LowCardinality (saves both memory and disk).
-- method has very low cardinality (GET/POST/PUT/DELETE, ...); the cardinality of
-- host depends on the number of sites.
-- NOTE(review): these statements change the column type, not only the codec;
-- confirm none of these columns is part of the table's sorting key.
ALTER TABLE logs_ingest
    MODIFY COLUMN method LowCardinality(String);
ALTER TABLE logs_ingest
    MODIFY COLUMN log_type LowCardinality(String);
ALTER TABLE logs_ingest
    MODIFY COLUMN host LowCardinality(String);

-- Numeric columns: Delta + ZSTD encoding (relies on neighbouring rows having
-- correlated time/size values).
-- NOTE(review): Delta pays off when consecutive values are close to each other;
-- verify the ratio on real data, as these counters may not be monotonic.
ALTER TABLE logs_ingest
    MODIFY COLUMN bytes_in UInt64 CODEC(Delta, ZSTD(1));
ALTER TABLE logs_ingest
    MODIFY COLUMN bytes_out UInt64 CODEC(Delta, ZSTD(1));
ALTER TABLE logs_ingest
    MODIFY COLUMN cost_ms UInt32 CODEC(Delta, ZSTD(1));
-- =============================================
-- Stage 2: add data-skipping indexes to speed up frequent filter queries
-- =============================================

-- Exact lookup by trace_id (log detail view, FindByTraceId).
-- bloom_filter(0.01) = 1% false-positive rate; GRANULARITY 4 = one bloom block
-- per 4 granules.
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_trace_id trace_id TYPE bloom_filter(0.01) GRANULARITY 4;

-- Exact lookup by ip.
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_ip ip TYPE bloom_filter(0.01) GRANULARITY 4;

-- Substring ("fuzzy") search on host, e.g. host LIKE '%xxx%'.
-- An n-gram bloom filter is used instead of tokenbf_v1: a token filter can only
-- match whole tokens, and a pattern enclosed in % wildcards contains no complete
-- token, so such a query could not use a token index to skip any granules.
-- Parameters: n-gram size 3, filter size 10240 bytes, 3 hash functions, seed 0.
-- Search strings shorter than the n-gram size (3 characters) cannot be pruned.
-- NOTE: if idx_host already exists from an earlier run of this script,
-- IF NOT EXISTS leaves the old definition in place; drop it first with
--   ALTER TABLE logs_ingest DROP INDEX idx_host
-- and then re-run this statement and MATERIALIZE INDEX idx_host.
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_host host TYPE ngrambf_v1(3, 10240, 3, 0) GRANULARITY 4;

-- Filter on firewall_policy_id (HasFirewallPolicy: > 0).
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_fw_policy firewall_policy_id TYPE minmax GRANULARITY 4;

-- Range filter on status (HasError: status >= 400).
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_status status TYPE minmax GRANULARITY 4;
-- =============================================
-- Stage 3: materialize the indexes so they also cover existing data
-- =============================================

-- Note: MATERIALIZE INDEX starts a background mutation; on a large table this
-- can take a while. Progress can be monitored with
--   SELECT * FROM system.mutations WHERE is_done = 0
ALTER TABLE logs_ingest
    MATERIALIZE INDEX idx_trace_id;
ALTER TABLE logs_ingest
    MATERIALIZE INDEX idx_ip;
ALTER TABLE logs_ingest
    MATERIALIZE INDEX idx_host;
ALTER TABLE logs_ingest
    MATERIALIZE INDEX idx_fw_policy;
ALTER TABLE logs_ingest
    MATERIALIZE INDEX idx_status;
-- =============================================================================
-- dns_logs_ingest table optimization (DNS log table)
-- =============================================================================

-- Large text columns: stronger compression.
ALTER TABLE dns_logs_ingest
    MODIFY COLUMN content_json String CODEC(ZSTD(3));
ALTER TABLE dns_logs_ingest
    MODIFY COLUMN error String CODEC(ZSTD(1));

-- Low-cardinality columns.
ALTER TABLE dns_logs_ingest
    MODIFY COLUMN question_type LowCardinality(String);
ALTER TABLE dns_logs_ingest
    MODIFY COLUMN record_type LowCardinality(String);
ALTER TABLE dns_logs_ingest
    MODIFY COLUMN networking LowCardinality(String);

-- Exact lookup by request_id.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_request_id request_id TYPE bloom_filter(0.01) GRANULARITY 4;

-- Exact lookup by remote_addr.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_remote_addr remote_addr TYPE bloom_filter(0.01) GRANULARITY 4;

-- Substring ("fuzzy") search on question_name.
-- An n-gram bloom filter is used instead of tokenbf_v1: a token filter can only
-- match whole tokens, so a LIKE pattern enclosed in % wildcards (which contains
-- no complete token) could not use it to skip any granules.
-- Parameters: n-gram size 3, filter size 10240 bytes, 3 hash functions, seed 0.
-- NOTE(review): 10240 bytes is carried over from the previous definition; tune
-- it if the false-positive rate on real DNS names turns out to be high.
-- NOTE: if idx_question_name already exists from an earlier run of this script,
-- IF NOT EXISTS leaves the old definition in place; drop it first with
--   ALTER TABLE dns_logs_ingest DROP INDEX idx_question_name
-- and then re-run this statement and the matching MATERIALIZE INDEX.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_question_name question_name TYPE ngrambf_v1(3, 10240, 3, 0) GRANULARITY 4;

-- Filter on domain_id.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_domain_id domain_id TYPE minmax GRANULARITY 4;

-- Materialize the indexes so they also cover existing data (each statement
-- starts a background mutation).
ALTER TABLE dns_logs_ingest
    MATERIALIZE INDEX idx_request_id;
ALTER TABLE dns_logs_ingest
    MATERIALIZE INDEX idx_remote_addr;
ALTER TABLE dns_logs_ingest
    MATERIALIZE INDEX idx_question_name;
ALTER TABLE dns_logs_ingest
    MATERIALIZE INDEX idx_domain_id;
-- =============================================================================
-- Verification commands (run after the ALTER statements above have been applied)
-- =============================================================================
-- Show the compression codec of each column
-- SELECT name, type, compression_codec FROM system.columns WHERE table = 'logs_ingest' AND database = currentDatabase();
-- Show the compression ratio of each table
-- SELECT table, formatReadableSize(sum(data_compressed_bytes)) AS compressed, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed, round(sum(data_uncompressed_bytes) / sum(data_compressed_bytes), 2) AS ratio FROM system.columns WHERE database = currentDatabase() AND table IN ('logs_ingest', 'dns_logs_ingest') GROUP BY table;
-- Show the disk space used by each column (to find the largest columns)
-- SELECT name, formatReadableSize(sum(data_compressed_bytes)) AS compressed, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed FROM system.columns WHERE database = currentDatabase() AND table = 'logs_ingest' GROUP BY name ORDER BY sum(data_compressed_bytes) DESC;
-- Show mutation progress
-- SELECT database, table, mutation_id, command, is_done, parts_to_do FROM system.mutations WHERE is_done = 0;
-- Force a merge (optional) so that the codec changes also take effect for existing data
-- OPTIMIZE TABLE logs_ingest FINAL;
-- OPTIMIZE TABLE dns_logs_ingest FINAL;