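-- =============================================================================
-- ClickHouse optimization script for the logs_ingest and dns_logs_ingest tables
--
-- Notes:
--   - Every ALTER below runs online; no service downtime is required.
--   - Run the stages in order and check system.parts after each stage to
--     confirm that it took effect.
--   - Compression codec changes apply only to newly written parts; existing
--     data is re-compressed when its parts are merged or after a manual OPTIMIZE.
--   - Column type changes (e.g. String -> LowCardinality(String)) and
--     MATERIALIZE INDEX are different: they rewrite existing parts through
--     background mutations, so expect extra I/O on large tables.
--
-- Usage:
--   clickhouse-client --host 127.0.0.1 --port 9000 --user default --password 'xxx' < optimize_schema.sql
-- =============================================================================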
|
||
|
||
-- =============================================
-- Stage 1: large-column compression (the most significant gain)
-- =============================================

-- Large text columns switch to ZSTD(3); for JSON / HTTP text it compresses far
-- better than the default LZ4.
-- Expected effect: 40%-60% less disk usage.
-- The four codec changes are submitted together as one multi-action ALTER.
ALTER TABLE logs_ingest
    MODIFY COLUMN request_headers  String CODEC(ZSTD(3)),
    MODIFY COLUMN request_body     String CODEC(ZSTD(3)),
    MODIFY COLUMN response_headers String CODEC(ZSTD(3)),
    MODIFY COLUMN response_body    String CODEC(ZSTD(3));
|
||
|
||
-- Medium-length text columns use ZSTD(1) to balance compression ratio against
-- CPU cost.
ALTER TABLE logs_ingest
    MODIFY COLUMN ua      String CODEC(ZSTD(1)),
    MODIFY COLUMN path    String CODEC(ZSTD(1)),
    MODIFY COLUMN referer String CODEC(ZSTD(1));
|
||
|
||
-- Low-cardinality columns become LowCardinality(String), which lowers both
-- memory and disk usage.
-- method has very few distinct values (GET/POST/PUT/DELETE, ...); the
-- cardinality of host depends on the number of sites.
-- NOTE(review): unlike a codec-only change this alters the column type, which
-- presumably rewrites the existing column data — confirm that is acceptable on
-- large tables, and that host really stays low-cardinality in production.
ALTER TABLE logs_ingest
    MODIFY COLUMN method   LowCardinality(String),
    MODIFY COLUMN log_type LowCardinality(String),
    MODIFY COLUMN host     LowCardinality(String);
|
||
|
||
-- Numeric columns are encoded with Delta and then compressed with ZSTD(1); the
-- intent is to exploit time/size correlation between neighbouring rows.
-- NOTE(review): Delta mainly pays off on monotonic or slowly changing values;
-- byte counts and latencies may not behave that way — measure against T64 or
-- plain ZSTD before relying on it.
ALTER TABLE logs_ingest
    MODIFY COLUMN bytes_in  UInt64 CODEC(Delta, ZSTD(1)),
    MODIFY COLUMN bytes_out UInt64 CODEC(Delta, ZSTD(1)),
    MODIFY COLUMN cost_ms   UInt32 CODEC(Delta, ZSTD(1));
|
||
|
||
-- =============================================
-- Stage 2: add skipping indexes (speed up frequently used filters)
-- =============================================

-- Exact-match lookups:
--   idx_trace_id - trace_id lookup used when viewing log details (FindByTraceId)
--   idx_ip       - ip lookup
-- bloom_filter(0.01) = 1% false-positive rate; GRANULARITY 4 = one bloom block
-- per 4 granules.
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_trace_id trace_id TYPE bloom_filter(0.01) GRANULARITY 4,
    ADD INDEX IF NOT EXISTS idx_ip       ip       TYPE bloom_filter(0.01) GRANULARITY 4;
|
||
|
||
-- Substring ("fuzzy") search on host, i.e. host LIKE '%xxx%'.
-- An n-gram bloom filter is used instead of tokenbf_v1: a token bloom filter
-- only matches whole tokens delimited by non-alphanumeric characters, so a
-- pattern whose text touches the % wildcards gives it nothing to look up.
-- ngrambf_v1 indexes every 3-character substring, which is what LIKE '%xxx%'
-- needs.
-- Parameters: n-gram size 3, filter size 10240 bytes, 3 hash functions, seed 0.
-- Search terms shorter than 3 characters cannot use this index.
-- If idx_host already exists with the old tokenbf_v1 definition, IF NOT EXISTS
-- turns this statement into a no-op; drop the index first
-- (ALTER TABLE logs_ingest DROP INDEX idx_host) to pick up the new type.
ALTER TABLE logs_ingest ADD INDEX IF NOT EXISTS idx_host host TYPE ngrambf_v1(3, 10240, 3, 0) GRANULARITY 4;
|
||
|
||
-- Range filters served by minmax indexes:
--   idx_fw_policy - firewall_policy_id filter (HasFirewallPolicy: > 0)
--   idx_status    - status range filter (HasError: status >= 400)
ALTER TABLE logs_ingest
    ADD INDEX IF NOT EXISTS idx_fw_policy firewall_policy_id TYPE minmax GRANULARITY 4,
    ADD INDEX IF NOT EXISTS idx_status    status             TYPE minmax GRANULARITY 4;
|
||
|
||
-- =============================================
-- Stage 3: materialize the indexes for existing data
-- =============================================
-- Note: MATERIALIZE INDEX triggers a background mutation; on a large table it
-- may take some time.
-- Progress can be monitored with:
--   SELECT * FROM system.mutations WHERE is_done = 0

-- All five indexes are materialized through one multi-action ALTER.
ALTER TABLE logs_ingest
    MATERIALIZE INDEX idx_trace_id,
    MATERIALIZE INDEX idx_ip,
    MATERIALIZE INDEX idx_host,
    MATERIALIZE INDEX idx_fw_policy,
    MATERIALIZE INDEX idx_status;
|
||
|
||
|
||
-- =============================================================================
-- dns_logs_ingest table optimization (DNS log table)
-- =============================================================================

-- Text column compression: ZSTD(3) for content_json, ZSTD(1) for error.
ALTER TABLE dns_logs_ingest
    MODIFY COLUMN content_json String CODEC(ZSTD(3)),
    MODIFY COLUMN error        String CODEC(ZSTD(1));
|
||
|
||
-- Low-cardinality columns become LowCardinality(String).
-- NOTE(review): this changes the column type, which presumably rewrites the
-- existing column data — confirm that is acceptable on large tables.
ALTER TABLE dns_logs_ingest
    MODIFY COLUMN question_type LowCardinality(String),
    MODIFY COLUMN record_type   LowCardinality(String),
    MODIFY COLUMN networking    LowCardinality(String);
|
||
|
||
-- Exact-match lookups on request_id and remote_addr, each backed by a bloom
-- filter with a 0.01 false-positive rate and GRANULARITY 4.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_request_id  request_id  TYPE bloom_filter(0.01) GRANULARITY 4,
    ADD INDEX IF NOT EXISTS idx_remote_addr remote_addr TYPE bloom_filter(0.01) GRANULARITY 4;
|
||
|
||
-- idx_question_name - fuzzy search on question_name (token bloom filter)
-- idx_domain_id     - domain_id filter (minmax)
-- NOTE(review): tokenbf_v1 matches whole tokens only; if the fuzzy search is an
-- arbitrary substring match such as LIKE '%xxx%', confirm this index is
-- actually used — ngrambf_v1 may be needed instead.
ALTER TABLE dns_logs_ingest
    ADD INDEX IF NOT EXISTS idx_question_name question_name TYPE tokenbf_v1(10240, 3, 0) GRANULARITY 4,
    ADD INDEX IF NOT EXISTS idx_domain_id     domain_id     TYPE minmax GRANULARITY 4;
|
||
|
||
-- Materialize the new indexes for data that already exists in dns_logs_ingest,
-- using one multi-action ALTER.
ALTER TABLE dns_logs_ingest
    MATERIALIZE INDEX idx_request_id,
    MATERIALIZE INDEX idx_remote_addr,
    MATERIALIZE INDEX idx_question_name,
    MATERIALIZE INDEX idx_domain_id;
|
||
|
||
|
||
-- =============================================================================
|
||
-- 验证命令(执行完上述 ALTER 后运行)
|
||
-- =============================================================================
|
||
|
||
-- 查看列的压缩编解码器
|
||
-- SELECT name, type, compression_codec FROM system.columns WHERE table = 'logs_ingest' AND database = currentDatabase();
|
||
|
||
-- Compression ratio per table (restricted to the current database so that
-- same-named tables in other databases are not summed in)
-- SELECT table, formatReadableSize(sum(data_compressed_bytes)) AS compressed, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed, round(sum(data_uncompressed_bytes) / sum(data_compressed_bytes), 2) AS ratio FROM system.columns WHERE database = currentDatabase() AND table IN ('logs_ingest', 'dns_logs_ingest') GROUP BY table;
|
||
|
||
-- Disk space used by each column of logs_ingest in the current database
-- (largest first)
-- SELECT name, formatReadableSize(sum(data_compressed_bytes)) AS compressed, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed FROM system.columns WHERE database = currentDatabase() AND table = 'logs_ingest' GROUP BY name ORDER BY sum(data_compressed_bytes) DESC;
|
||
|
||
-- 查看 mutation 进度
|
||
-- SELECT database, table, mutation_id, command, is_done, parts_to_do FROM system.mutations WHERE is_done = 0;
|
||
|
||
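-- Force a merge (optional) so that the codec changes also apply to existing data.
-- Warning: OPTIMIZE ... FINAL rewrites the data of the whole table and is very
-- I/O- and CPU-intensive on large tables; run it off-peak, or optimize one
-- partition at a time with OPTIMIZE TABLE ... PARTITION ... FINAL.
-- OPTIMIZE TABLE logs_ingest FINAL;
-- OPTIMIZE TABLE dns_logs_ingest FINAL;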
|