diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eb9f4ce --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +# Local runtime/state artifacts (do not commit) +default.etcd/ +**/.DS_Store +deploy/fluent-bit/logs.db +deploy/fluent-bit/logs.db-shm +deploy/fluent-bit/logs.db-wal +deploy/fluent-bit/storage/ diff --git a/EdgeAPI/internal/installers/fluent_bit.go b/EdgeAPI/internal/installers/fluent_bit.go index be009dd..2272230 100644 --- a/EdgeAPI/internal/installers/fluent_bit.go +++ b/EdgeAPI/internal/installers/fluent_bit.go @@ -28,7 +28,6 @@ const ( fluentBitParsersFile = "/etc/fluent-bit/parsers.conf" fluentBitManagedMetaFile = "/etc/fluent-bit/.edge-managed.json" fluentBitManagedEnvFile = "/etc/fluent-bit/.edge-managed.env" - fluentBitLogrotateFile = "/etc/logrotate.d/edge-goedge" fluentBitDropInDir = "/etc/systemd/system/fluent-bit.service.d" fluentBitDropInFile = "/etc/systemd/system/fluent-bit.service.d/edge-managed.conf" fluentBitServiceName = "fluent-bit" @@ -85,7 +84,7 @@ func (this *BaseInstaller) SetupFluentBit(role nodeconfigs.NodeRole) error { return err } - _, stderr, err := this.client.Exec("mkdir -p " + shQuote(fluentBitConfigDir) + " " + shQuote(fluentBitStorageDir) + " /etc/logrotate.d") + _, stderr, err := this.client.Exec("mkdir -p " + shQuote(fluentBitConfigDir) + " " + shQuote(fluentBitStorageDir)) if err != nil { return fmt.Errorf("prepare fluent-bit directories failed: %w, stderr: %s", err, stderr) } @@ -536,13 +535,6 @@ func (this *BaseInstaller) applyManagedConfig(tempDir string, desired *fluentBit return false, err } - localLogrotate := filepath.Join(Tea.Root, "deploy", "fluent-bit", "logrotate.conf") - if _, err := os.Stat(localLogrotate); err == nil { - if err := this.copyLocalFileToRemote(tempDir, localLogrotate, fluentBitLogrotateFile, 0644); err != nil { - return false, err - } - } - return true, nil } @@ -566,13 +558,13 @@ func renderManagedConfig(desired *fluentBitDesiredConfig) (string, error) { lines := []string{ 
"# " + fluentBitManagedMarker, "[SERVICE]", - " Flush 2", + " Flush 1", " Log_Level info", " Parsers_File " + fluentBitParsersFile, " storage.path " + fluentBitStorageDir, " storage.sync normal", " storage.checksum off", - " storage.backlog.mem_limit 256MB", + " storage.backlog.mem_limit 512MB", "", } @@ -587,7 +579,7 @@ func renderManagedConfig(desired *fluentBitDesiredConfig) (string, error) { " Read_from_Head false", " DB /var/lib/fluent-bit/http-logs.db", " storage.type filesystem", - " Mem_Buf_Limit 128MB", + " Mem_Buf_Limit 256MB", " Skip_Long_Lines On", "", ) @@ -604,7 +596,7 @@ func renderManagedConfig(desired *fluentBitDesiredConfig) (string, error) { " Read_from_Head false", " DB /var/lib/fluent-bit/dns-logs.db", " storage.type filesystem", - " Mem_Buf_Limit 128MB", + " Mem_Buf_Limit 256MB", " Skip_Long_Lines On", "", ) @@ -623,7 +615,7 @@ func renderManagedConfig(desired *fluentBitDesiredConfig) (string, error) { " http_passwd ${CH_PASSWORD}", " json_date_key timestamp", " json_date_format epoch", - " workers 1", + " workers 2", " net.keepalive On", " Retry_Limit False", ) @@ -654,7 +646,7 @@ func renderManagedConfig(desired *fluentBitDesiredConfig) (string, error) { " http_passwd ${CH_PASSWORD}", " json_date_key timestamp", " json_date_format epoch", - " workers 1", + " workers 2", " net.keepalive On", " Retry_Limit False", ) diff --git a/EdgeAdmin/.DS_Store b/EdgeAdmin/.DS_Store deleted file mode 100644 index 6067cdb..0000000 Binary files a/EdgeAdmin/.DS_Store and /dev/null differ diff --git a/EdgeAdmin/build/build.sh b/EdgeAdmin/build/build.sh index a09eca2..56c1c96 100644 --- a/EdgeAdmin/build/build.sh +++ b/EdgeAdmin/build/build.sh @@ -108,7 +108,7 @@ function build() { unzip -q "$(basename "$EDGE_API_ZIP_FILE")" rm -f "$(basename "$EDGE_API_ZIP_FILE")" - # ensure edge-api package always contains fluent-bit templates/packages + # ensure edge-api package always contains fluent-bit runtime assets/packages FLUENT_ROOT="$ROOT/../../deploy/fluent-bit" 
FLUENT_DIST="$DIST/edge-api/deploy/fluent-bit" if [ -d "$FLUENT_ROOT" ]; then @@ -117,17 +117,7 @@ function build() { mkdir -p "$FLUENT_DIST" FLUENT_FILES=( - "fluent-bit.conf" - "fluent-bit-dns.conf" - "fluent-bit-https.conf" - "fluent-bit-dns-https.conf" - "fluent-bit-windows.conf" - "fluent-bit-windows-https.conf" "parsers.conf" - "clickhouse-upstream.conf" - "clickhouse-upstream-windows.conf" - "logrotate.conf" - "README.md" ) for file in "${FLUENT_FILES[@]}"; do if [ -f "$FLUENT_ROOT/$file" ]; then diff --git a/EdgeCommon/build/.DS_Store b/EdgeCommon/build/.DS_Store deleted file mode 100644 index 57c9f7f..0000000 Binary files a/EdgeCommon/build/.DS_Store and /dev/null differ diff --git a/EdgeDNS/.DS_Store b/EdgeDNS/.DS_Store deleted file mode 100644 index 0086f2a..0000000 Binary files a/EdgeDNS/.DS_Store and /dev/null differ diff --git a/EdgeDNS/build/build.sh b/EdgeDNS/build/build.sh index 4d5ea3c..6da3934 100644 --- a/EdgeDNS/build/build.sh +++ b/EdgeDNS/build/build.sh @@ -112,7 +112,7 @@ function copy_fluent_bit_assets() { rm -rf "$FLUENT_DIST" mkdir -p "$FLUENT_DIST" - for file in fluent-bit.conf fluent-bit-dns.conf fluent-bit-https.conf fluent-bit-dns-https.conf fluent-bit-windows.conf fluent-bit-windows-https.conf parsers.conf clickhouse-upstream.conf clickhouse-upstream-windows.conf logrotate.conf README.md; do + for file in fluent-bit.conf fluent-bit-dns.conf fluent-bit-https.conf fluent-bit-dns-https.conf fluent-bit-windows.conf fluent-bit-windows-https.conf parsers.conf clickhouse-upstream.conf clickhouse-upstream-windows.conf README.md; do if [ -f "$FLUENT_ROOT/$file" ]; then cp "$FLUENT_ROOT/$file" "$FLUENT_DIST/" fi diff --git a/EdgeNode/.DS_Store b/EdgeNode/.DS_Store deleted file mode 100644 index 3bc3844..0000000 Binary files a/EdgeNode/.DS_Store and /dev/null differ diff --git a/EdgeNode/build/build.sh b/EdgeNode/build/build.sh index f648399..bf7991f 100644 --- a/EdgeNode/build/build.sh +++ b/EdgeNode/build/build.sh @@ -186,7 +186,7 @@ function 
copy_fluent_bit_assets() { rm -rf "$FLUENT_DIST" mkdir -p "$FLUENT_DIST" - for file in fluent-bit.conf fluent-bit-dns.conf fluent-bit-https.conf fluent-bit-dns-https.conf fluent-bit-windows.conf fluent-bit-windows-https.conf parsers.conf clickhouse-upstream.conf clickhouse-upstream-windows.conf logrotate.conf README.md; do + for file in fluent-bit.conf fluent-bit-dns.conf fluent-bit-https.conf fluent-bit-dns-https.conf fluent-bit-windows.conf fluent-bit-windows-https.conf parsers.conf clickhouse-upstream.conf clickhouse-upstream-windows.conf README.md; do if [ -f "$FLUENT_ROOT/$file" ]; then cp "$FLUENT_ROOT/$file" "$FLUENT_DIST/" fi diff --git a/EdgeNode/dist/.DS_Store b/EdgeNode/dist/.DS_Store deleted file mode 100644 index 47271f8..0000000 Binary files a/EdgeNode/dist/.DS_Store and /dev/null differ diff --git a/EdgeUser/.DS_Store b/EdgeUser/.DS_Store deleted file mode 100644 index d1ff0b4..0000000 Binary files a/EdgeUser/.DS_Store and /dev/null differ diff --git a/EdgeUser/dist/.DS_Store b/EdgeUser/dist/.DS_Store deleted file mode 100644 index 712681e..0000000 Binary files a/EdgeUser/dist/.DS_Store and /dev/null differ diff --git a/GoEdge HTTPDNS 需求文档 v2.0.docx b/GoEdge HTTPDNS 需求文档 v2.0.docx deleted file mode 100644 index 2b583de..0000000 Binary files a/GoEdge HTTPDNS 需求文档 v2.0.docx and /dev/null differ diff --git a/HTTPDNS_技术实施方案.md b/HTTPDNS_技术实施方案.md deleted file mode 100644 index dbc44b3..0000000 --- a/HTTPDNS_技术实施方案.md +++ /dev/null @@ -1,1290 +0,0 @@ -# GoEdge HTTPDNS 技术实施方案 - -> 版本: 1.0 | 作者: AI Assistant | 日期: 2026-02-09 - ---- - -## 一、项目概述 - -### 1.1 目标 -在 GoEdge 平台实现完整的 HTTPDNS 服务,包括: -- 基于 HTTPS 的 DNS 解析接口 -- 动态指纹校验(WAF) -- App 管理后台 -- 移动端 SDK 示例 - -### 1.2 设计决策 -| 决策项 | 选择 | 理由 | -|--------|------|------| -| WAF 指纹校验 | 必须 | 防止非法请求绕过解析直接攻击源站 | -| App 管理界面 | 必须 | 标准产品功能,支持多租户 | -| SDK 示例 | 提供 | 降低客户接入成本 | -| 部署位置 | 复用 Edge-DNS | 减少运维复杂度 | -| 接口路径 | 新增 `/httpdns/resolve` | 保持向后兼容 | - ---- - -## 二、系统架构 - -### 2.1 整体架构图 - -``` 
-┌─────────────────────────────────────────────────────────────────────────┐ -│ 移动端 App │ -│ ┌───────────────────────────────────────────────────────────────────┐ │ -│ │ HTTPDNS SDK │ │ -│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ -│ │ │ Resolver │ │CacheManager │ │NetworkMonitor│ │ │ -│ │ │ 解析引擎 │ │ 缓存管理 │ │ 网络感知 │ │ │ -│ │ │ -多节点容错 │ │ -内存LRU │ │ -切换监听 │ │ │ -│ │ │ -超时降级 │ │ -持久化 │ │ -自动清缓存 │ │ │ -│ │ └─────────────┘ │ -软过期 │ │ -IPv6检测 │ │ │ -│ │ └─────────────┘ └─────────────┘ │ │ -│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ -│ │ │ Signer │ │ Reporter │ │ Prefetch │ │ │ -│ │ │ 签名模块 │ │ 监控上报 │ │ 预解析 │ │ │ -│ │ │ -HMAC-SHA256│ │ -成功率 │ │ -冷启动优化 │ │ │ -│ │ │ -防重放 │ │ -耗时统计 │ │ -批量解析 │ │ │ -│ │ └─────────────┘ │ -缓存命中率│ └─────────────┘ │ │ -│ │ └─────────────┘ │ │ -│ └───────────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────────────┘ - │ - │ HTTPS - ▼ -┌─────────────────────────────────────────────────────────────────────────┐ -│ GoEdge 平台 │ -│ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ Edge-DNS │ │ Edge-Node │ │ Edge-Admin │ │ -│ │ │ │ │ │ │ │ -│ │ /httpdns/ │ │ WAF 校验 │ │ App 管理 │ │ -│ │ resolve │ │ 指纹验证 │ │ AppID/Secret│ │ -│ │ │ │ │ │ │ │ -│ │ 智能调度 │ │ 流量转发 │ │ SDK统计接收 │ │ -│ └──────────────┘ └──────────────┘ └──────────────┘ │ -│ │ │ │ │ -│ └───────────────────┴───────────────────┘ │ -│ │ │ -│ ┌──────────────┐ │ -│ │ Edge-API │ │ -│ │ 数据服务 │ │ -│ └──────────────┘ │ -│ │ │ -│ ┌──────────────┐ │ -│ │ MySQL │ │ -│ └──────────────┘ │ -└─────────────────────────────────────────────────────────────────────────┘ - │ - │ HTTPS (IP 直连) - ▼ -┌─────────────────────────────────────────────────────────────────────────┐ -│ 源站服务器 │ -└─────────────────────────────────────────────────────────────────────────┘ -``` - -### 2.2 请求流程(带缓存与降级) - -```mermaid -sequenceDiagram - participant App as 移动 App - participant Cache as SDK缓存 - participant Resolver as 
SDK解析器 - participant DNS as Edge-DNS - participant SysDNS as 系统DNS - participant Node as Edge-Node - participant API as Edge-API - - Note over App,API: 阶段一:DNS 解析(含缓存与降级) - - App->>Resolver: resolve("api.example.com") - - Resolver->>Cache: 查询缓存 - alt 缓存有效 - Cache-->>Resolver: 返回 IP (命中) - Resolver-->>App: ["1.2.3.4"] - else 缓存软过期 - Cache-->>Resolver: 返回 IP + 标记需刷新 - Resolver-->>App: ["1.2.3.4"] (先返回) - Resolver->>DNS: 后台异步刷新 - DNS-->>Resolver: 新 IP - Resolver->>Cache: 更新缓存 - else 缓存完全过期或不存在 - Resolver->>DNS: GET /httpdns/resolve - alt HTTPDNS 正常 - DNS-->>Resolver: {"ips": ["1.2.3.4"], "ttl": 600} - Resolver->>Cache: 写入缓存 - Resolver-->>App: ["1.2.3.4"] - else HTTPDNS 超时/失败 - Resolver->>SysDNS: 降级到系统 DNS - SysDNS-->>Resolver: 1.2.3.4 - Resolver-->>App: ["1.2.3.4"] (降级) - end - end - - Note over App,API: 阶段二:业务请求(带签名) - App->>Node: HTTPS://1.2.3.4/v1/user + 签名Header - Node->>API: 查询 AppSecret - API-->>Node: AppSecret - Node->>Node: HMAC-SHA256 验证 - alt 验证失败 - Node-->>App: 403 Forbidden - else 验证成功 - Node-->>App: 200 OK + 响应数据 - end - - Note over App,API: 阶段三:监控上报(异步) - Resolver-->>API: 定时上报统计数据 -``` - -### 2.3 网络切换处理流程 - -```mermaid -flowchart LR - A[WiFi] -->|切换| B[4G/5G] - B --> C{NetworkMonitor检测} - C --> D[清空所有缓存] - D --> E[下次请求重新解析] - E --> F[获取新网络最优IP] -``` - ---- - -## 三、模块详细设计 - -### 3.1 Edge-DNS: HTTPDNS 解析接口 - -#### 3.1.1 接口定义 - -| 项目 | 说明 | -|------|------| -| **Endpoint** | `GET /httpdns/resolve` | -| **协议** | HTTPS (443) | -| **认证** | 无(解析接口公开,业务请求才需要签名) | - -#### 3.1.2 请求参数 - -| 参数 | 类型 | 必填 | 说明 | -|------|------|------|------| -| `host` | string | 是 | 待解析的域名 | -| `type` | string | 否 | 记录类型,默认 `A,AAAA`(同时返回) | -| `ip` | string | 否 | 客户端 IP(用于调试/代理场景) | - -#### 3.1.3 响应格式 - -```json -{ - "status": "ok", - "dns_server_time": 1700000000, - "client_ip": "114.114.114.114", - "data": [ - { - "host": "api.example.com", - "type": "A", - "ips": ["1.1.1.1", "1.1.1.2"], - "ips_v6": ["240e:xxx::1"], - "ttl": 600 - } - ] -} -``` - -#### 3.1.4 代码实现 - -**文件**: 
`EdgeDNS/internal/nodes/httpdns.go` (新建) - -```go -package nodes - -import ( - "encoding/json" - "net" - "net/http" - "strings" - "time" -) - -// HTTPDNSResponse HTTPDNS 响应结构 -type HTTPDNSResponse struct { - Status string `json:"status"` - DNSServerTime int64 `json:"dns_server_time"` - ClientIP string `json:"client_ip"` - Data []HTTPDNSRecord `json:"data"` - Error string `json:"error,omitempty"` -} - -// HTTPDNSRecord 单条解析记录 -type HTTPDNSRecord struct { - Host string `json:"host"` - Type string `json:"type"` - IPs []string `json:"ips"` - IPsV6 []string `json:"ips_v6"` - TTL int `json:"ttl"` -} - -// handleHTTPDNSResolve 处理 HTTPDNS 解析请求 -func (this *Server) handleHTTPDNSResolve(writer http.ResponseWriter, req *http.Request) { - writer.Header().Set("Content-Type", "application/json") - writer.Header().Set("Access-Control-Allow-Origin", "*") - - // 1. 解析参数 - query := req.URL.Query() - host := strings.TrimSpace(query.Get("host")) - if host == "" { - this.writeHTTPDNSError(writer, "missing 'host' parameter") - return - } - - // 2. 获取客户端 IP - clientIP := query.Get("ip") - if clientIP == "" { - clientIP = this.extractClientIP(req) - } - - // 3. 查询 A 记录 - ipsV4 := this.resolveRecords(host, "A", clientIP) - - // 4. 查询 AAAA 记录 - ipsV6 := this.resolveRecords(host, "AAAA", clientIP) - - // 5. 获取 TTL - ttl := this.getRecordTTL(host, clientIP) - if ttl == 0 { - ttl = 600 - } - - // 6. 
构造响应 - resp := HTTPDNSResponse{ - Status: "ok", - DNSServerTime: time.Now().Unix(), - ClientIP: clientIP, - Data: []HTTPDNSRecord{{ - Host: host, - Type: "A", - IPs: ipsV4, - IPsV6: ipsV6, - TTL: ttl, - }}, - } - - json.NewEncoder(writer).Encode(resp) -} - -// extractClientIP 从请求中提取客户端真实 IP -func (this *Server) extractClientIP(req *http.Request) string { - xff := req.Header.Get("X-Forwarded-For") - if xff != "" { - parts := strings.Split(xff, ",") - return strings.TrimSpace(parts[0]) - } - xri := req.Header.Get("X-Real-IP") - if xri != "" { - return xri - } - host, _, _ := net.SplitHostPort(req.RemoteAddr) - return host -} - -// resolveRecords 解析指定类型的记录 -func (this *Server) resolveRecords(host, recordType, clientIP string) []string { - var result []string - if !strings.HasSuffix(host, ".") { - host += "." - } - domain, recordName := sharedDomainManager.SplitDomain(host) - if domain == nil { - return result - } - routeCodes := sharedRouteManager.FindRouteCodes(clientIP, domain.UserId) - records, _ := sharedRecordManager.FindRecords(domain.Id, routeCodes, recordName, recordType, false) - for _, record := range records { - if record.Value != "" { - result = append(result, record.Value) - } - } - return result -} - -// getRecordTTL 获取记录 TTL -func (this *Server) getRecordTTL(host, clientIP string) int { - if !strings.HasSuffix(host, ".") { - host += "." 
- } - domain, recordName := sharedDomainManager.SplitDomain(host) - if domain == nil { - return 0 - } - routeCodes := sharedRouteManager.FindRouteCodes(clientIP, domain.UserId) - records, _ := sharedRecordManager.FindRecords(domain.Id, routeCodes, recordName, "A", false) - if len(records) > 0 { - return int(records[0].Ttl) - } - return 0 -} - -// writeHTTPDNSError 写入错误响应 -func (this *Server) writeHTTPDNSError(writer http.ResponseWriter, errMsg string) { - writer.WriteHeader(http.StatusBadRequest) - resp := HTTPDNSResponse{ - Status: "error", - DNSServerTime: time.Now().Unix(), - Error: errMsg, - } - json.NewEncoder(writer).Encode(resp) -} -``` - -**修改**: `EdgeDNS/internal/nodes/server.go` 第735行附近 - -```go -func (this *Server) handleHTTP(writer http.ResponseWriter, req *http.Request) { - if req.URL.Path == "/dns-query" { - this.handleHTTPDNSMessage(writer, req) - return - } - // 新增 HTTPDNS JSON API - if req.URL.Path == "/httpdns/resolve" { - this.handleHTTPDNSResolve(writer, req) - return - } - if req.URL.Path == "/resolve" { - this.handleHTTPJSONAPI(writer, req) - return - } - writer.WriteHeader(http.StatusNotFound) -} -``` - ---- - -### 3.2 Edge-Node: WAF 指纹校验 - -#### 3.2.1 校验逻辑 - -| 步骤 | 说明 | -|------|------| -| 1 | 提取 `X-GE-AppID`, `X-GE-Timestamp`, `X-GE-Token` | -| 2 | 根据 AppID 查询 AppSecret | -| 3 | 校验时间戳(±300 秒内有效) | -| 4 | 计算 HMAC-SHA256 并比对 | - -#### 3.2.2 代码实现 - -**文件**: `EdgeNode/internal/waf/checkpoints/checkpoint_httpdns_fingerprint.go` (新建) - -```go -package checkpoints - -import ( - "crypto/hmac" - "crypto/sha256" - "encoding/hex" - "math" - "net/http" - "strconv" - "time" - - "github.com/TeaOSLab/EdgeCommon/pkg/rpc/pb" - "github.com/TeaOSLab/EdgeNode/internal/rpc" -) - -type CheckpointHTTPDNSFingerprint struct { - Checkpoint -} - -func (this *CheckpointHTTPDNSFingerprint) RequestValue( - req CheckpointRequest, param string, options map[string]string, ruleId int64, -) (value any, hasRequestBody bool, sysErr error, userErr error) { - httpReq, ok := 
req.WAFRaw().(*http.Request) - if !ok { - return "INVALID_REQUEST", false, nil, nil - } - - appID := httpReq.Header.Get("X-GE-AppID") - token := httpReq.Header.Get("X-GE-Token") - tsStr := httpReq.Header.Get("X-GE-Timestamp") - - if appID == "" && token == "" && tsStr == "" { - return "", false, nil, nil - } - if appID == "" { - return "MISSING_APPID", false, nil, nil - } - if token == "" { - return "MISSING_TOKEN", false, nil, nil - } - if tsStr == "" { - return "MISSING_TIMESTAMP", false, nil, nil - } - - appSecret, err := this.getAppSecret(appID) - if err != nil || appSecret == "" { - return "INVALID_APPID", false, nil, nil - } - - ts, _ := strconv.ParseInt(tsStr, 10, 64) - if math.Abs(float64(time.Now().Unix()-ts)) > 300 { - return "TIMESTAMP_EXPIRED", false, nil, nil - } - - mac := hmac.New(sha256.New, []byte(appSecret)) - mac.Write([]byte(appID + tsStr + httpReq.URL.Path)) - expected := hex.EncodeToString(mac.Sum(nil)) - - if token != expected { - return "SIGNATURE_MISMATCH", false, nil, nil - } - return "OK", false, nil, nil -} - -func (this *CheckpointHTTPDNSFingerprint) ResponseValue( - req CheckpointRequest, param string, options map[string]string, ruleId int64, -) (value any, hasRequestBody bool, sysErr error, userErr error) { - return "", false, nil, nil -} - -func (this *CheckpointHTTPDNSFingerprint) getAppSecret(appID string) (string, error) { - client, err := rpc.SharedRPC() - if err != nil { - return "", err - } - resp, err := client.HTTPDNSAppRPC.FindHTTPDNSAppSecret( - client.Context(), &pb.FindHTTPDNSAppSecretRequest{AppId: appID}, - ) - if err != nil { - return "", err - } - return resp.AppSecret, nil -} -``` - ---- - -### 3.3 Edge-API: App 管理服务 - -#### 3.3.1 数据库表 - -```sql -CREATE TABLE IF NOT EXISTS edgeHTTPDNSApps ( - id BIGINT UNSIGNED PRIMARY KEY AUTO_INCREMENT, - appId VARCHAR(64) NOT NULL UNIQUE COMMENT 'App标识', - appSecret VARCHAR(128) NOT NULL COMMENT 'App密钥', - name VARCHAR(255) NOT NULL DEFAULT '' COMMENT '应用名称', - description TEXT 
COMMENT '描述', - userId BIGINT UNSIGNED DEFAULT 0 COMMENT '关联用户ID', - isOn TINYINT(1) DEFAULT 1 COMMENT '是否启用', - createdAt INT UNSIGNED DEFAULT 0, - state TINYINT(1) DEFAULT 1, - UNIQUE KEY uk_appId (appId), - KEY idx_userId (userId) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -``` - -#### 3.3.2 gRPC Proto - -**文件**: `EdgeCommon/pkg/rpc/protos/service_httpdns_app.proto` (新建) - -```protobuf -syntax = "proto3"; -option go_package = "./pb"; -package pb; - -service HTTPDNSAppService { - rpc createHTTPDNSApp(CreateHTTPDNSAppRequest) returns (CreateHTTPDNSAppResponse); - rpc findHTTPDNSAppSecret(FindHTTPDNSAppSecretRequest) returns (FindHTTPDNSAppSecretResponse); - rpc listHTTPDNSApps(ListHTTPDNSAppsRequest) returns (ListHTTPDNSAppsResponse); - rpc deleteHTTPDNSApp(DeleteHTTPDNSAppRequest) returns (RPCSuccess); -} - -message CreateHTTPDNSAppRequest { - string name = 1; - string description = 2; - int64 userId = 3; -} -message CreateHTTPDNSAppResponse { - int64 httpdnsAppId = 1; - string appId = 2; - string appSecret = 3; -} -message FindHTTPDNSAppSecretRequest { - string appId = 1; -} -message FindHTTPDNSAppSecretResponse { - string appSecret = 1; -} -message ListHTTPDNSAppsRequest { - int64 userId = 1; - int64 offset = 2; - int64 size = 3; -} -message ListHTTPDNSAppsResponse { - repeated HTTPDNSApp httpdnsApps = 1; -} -message DeleteHTTPDNSAppRequest { - int64 httpdnsAppId = 1; -} -message HTTPDNSApp { - int64 id = 1; - string appId = 2; - string appSecret = 3; - string name = 4; - int64 userId = 5; - bool isOn = 6; - int64 createdAt = 7; -} -``` - ---- - -### 3.4 SDK 完整设计 - -#### 3.4.1 SDK 架构概览 - -``` -┌─────────────────────────────────────────────────────────────┐ -│ HTTPDNS SDK │ -├─────────────────────────────────────────────────────────────┤ -│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ -│ │ 解析引擎 │ │ 缓存管理 │ │ 网络感知 │ │ -│ │ Resolver │ │ CacheManager │ │ NetworkMonitor│ │ -│ └───────────────┘ └───────────────┘ └───────────────┘ │ -│ ┌───────────────┐ 
┌───────────────┐ ┌───────────────┐ │ -│ │ 容错降级 │ │ 签名模块 │ │ 监控上报 │ │ -│ │ Failover │ │ Signer │ │ Reporter │ │ -│ └───────────────┘ └───────────────┘ └───────────────┘ │ -└─────────────────────────────────────────────────────────────┘ -``` - -#### 3.4.2 缓存管理模块 (CacheManager) - -**设计原则**: -- 两级缓存:内存 + 持久化 -- 软过期策略:过期后先返回旧数据,后台异步更新 -- LRU 淘汰:防止内存溢出 - -**Android 实现**: - -```kotlin -class HTTPDNSCacheManager(private val context: Context) { - // 内存缓存 (LRU) - private val memoryCache = object : LinkedHashMap(100, 0.75f, true) { - override fun removeEldestEntry(eldest: Map.Entry): Boolean { - return size > MAX_CACHE_SIZE - } - } - - // 持久化缓存 (MMKV) - private val mmkv = MMKV.mmkvWithID("httpdns_cache", MMKV.MULTI_PROCESS_MODE) - - companion object { - const val MAX_CACHE_SIZE = 100 - const val SOFT_EXPIRE_SECONDS = 60 // 软过期:TTL 到期后仍可用 60s - } - - data class CacheEntry( - val ips: List, - val ipsV6: List, - val expireAt: Long, // 硬过期时间 - val softExpireAt: Long, // 软过期时间 = expireAt + SOFT_EXPIRE_SECONDS - val createAt: Long = System.currentTimeMillis() - ) { - fun isExpired(): Boolean = System.currentTimeMillis() > expireAt - fun isSoftExpired(): Boolean = System.currentTimeMillis() > softExpireAt - fun toJson(): String = Gson().toJson(this) - } - - /** - * 获取缓存(支持软过期) - * @return Pair<缓存结果, 是否需要后台刷新> - */ - @Synchronized - fun get(host: String): Pair { - // 1. 先查内存 - memoryCache[host]?.let { entry -> - return when { - !entry.isExpired() -> Pair(entry, false) // 未过期 - !entry.isSoftExpired() -> Pair(entry, true) // 软过期,需刷新 - else -> Pair(null, true) // 完全过期 - } - } - - // 2. 
查持久化 - val json = mmkv.decodeString(host) ?: return Pair(null, true) - val entry = Gson().fromJson(json, CacheEntry::class.java) - - // 回填内存 - memoryCache[host] = entry - - return when { - !entry.isExpired() -> Pair(entry, false) - !entry.isSoftExpired() -> Pair(entry, true) - else -> Pair(null, true) - } - } - - /** - * 写入缓存 - */ - @Synchronized - fun put(host: String, ips: List, ipsV6: List, ttl: Int) { - val now = System.currentTimeMillis() - val entry = CacheEntry( - ips = ips, - ipsV6 = ipsV6, - expireAt = now + ttl * 1000L, - softExpireAt = now + (ttl + SOFT_EXPIRE_SECONDS) * 1000L - ) - memoryCache[host] = entry - mmkv.encode(host, entry.toJson()) - } - - /** - * 清空所有缓存(网络切换时调用) - */ - @Synchronized - fun clear() { - memoryCache.clear() - mmkv.clearAll() - } -} -``` - -**iOS 实现**: - -```swift -class HTTPDNSCacheManager { - static let shared = HTTPDNSCacheManager() - - private var memoryCache: [String: CacheEntry] = [:] - private let defaults = UserDefaults(suiteName: "com.goedge.httpdns")! - private let queue = DispatchQueue(label: "httpdns.cache", attributes: .concurrent) - - private let maxCacheSize = 100 - private let softExpireSeconds: TimeInterval = 60 - - struct CacheEntry: Codable { - let ips: [String] - let ipsV6: [String] - let expireAt: Date - let softExpireAt: Date - let createAt: Date - - func isExpired() -> Bool { Date() > expireAt } - func isSoftExpired() -> Bool { Date() > softExpireAt } - } - - func get(host: String) -> (entry: CacheEntry?, needRefresh: Bool) { - return queue.sync { - // 查内存 - if let entry = memoryCache[host] { - if !entry.isExpired() { return (entry, false) } - if !entry.isSoftExpired() { return (entry, true) } - } - - // 查持久化 - guard let data = defaults.data(forKey: host), - let entry = try? 
JSONDecoder().decode(CacheEntry.self, from: data) else { - return (nil, true) - } - - // 回填内存 - memoryCache[host] = entry - - if !entry.isExpired() { return (entry, false) } - if !entry.isSoftExpired() { return (entry, true) } - return (nil, true) - } - } - - func put(host: String, ips: [String], ipsV6: [String], ttl: Int) { - queue.async(flags: .barrier) { - let entry = CacheEntry( - ips: ips, - ipsV6: ipsV6, - expireAt: Date().addingTimeInterval(TimeInterval(ttl)), - softExpireAt: Date().addingTimeInterval(TimeInterval(ttl) + self.softExpireSeconds), - createAt: Date() - ) - self.memoryCache[host] = entry - - // LRU 淘汰 - if self.memoryCache.count > self.maxCacheSize { - let oldest = self.memoryCache.min { $0.value.createAt < $1.value.createAt } - if let key = oldest?.key { self.memoryCache.removeValue(forKey: key) } - } - - // 持久化 - if let data = try? JSONEncoder().encode(entry) { - self.defaults.set(data, forKey: host) - } - } - } - - func clear() { - queue.async(flags: .barrier) { - self.memoryCache.removeAll() - // 清理持久化 - for key in self.defaults.dictionaryRepresentation().keys { - self.defaults.removeObject(forKey: key) - } - } - } -} -``` - -#### 3.4.3 网络感知模块 (NetworkMonitor) - -**功能**: -- 监听网络切换(WiFi ↔ 4G/5G) -- 网络切换时清空缓存 -- 检测 IPv6 可用性 - -**Android 实现**: - -```kotlin -class NetworkMonitor(private val context: Context) { - private val connectivityManager = context.getSystemService(Context.CONNECTIVITY_SERVICE) as ConnectivityManager - private var lastNetworkType: String? = null - var onNetworkChanged: (() -> Unit)? 
= null - - private val networkCallback = object : ConnectivityManager.NetworkCallback() { - override fun onAvailable(network: Network) { - checkNetworkChange() - } - - override fun onLost(network: Network) { - checkNetworkChange() - } - - override fun onCapabilitiesChanged(network: Network, caps: NetworkCapabilities) { - checkNetworkChange() - } - } - - fun start() { - val request = NetworkRequest.Builder() - .addCapability(NetworkCapabilities.NET_CAPABILITY_INTERNET) - .build() - connectivityManager.registerNetworkCallback(request, networkCallback) - lastNetworkType = getCurrentNetworkType() - } - - fun stop() { - connectivityManager.unregisterNetworkCallback(networkCallback) - } - - private fun checkNetworkChange() { - val currentType = getCurrentNetworkType() - if (currentType != lastNetworkType) { - Log.d("HTTPDNS", "Network changed: $lastNetworkType -> $currentType") - lastNetworkType = currentType - onNetworkChanged?.invoke() - } - } - - private fun getCurrentNetworkType(): String { - val network = connectivityManager.activeNetwork ?: return "NONE" - val caps = connectivityManager.getNetworkCapabilities(network) ?: return "UNKNOWN" - return when { - caps.hasTransport(NetworkCapabilities.TRANSPORT_WIFI) -> "WIFI" - caps.hasTransport(NetworkCapabilities.TRANSPORT_CELLULAR) -> "CELLULAR" - else -> "OTHER" - } - } - - /** - * 检测当前网络是否支持 IPv6 - */ - fun isIPv6Supported(): Boolean { - return try { - val addresses = NetworkInterface.getNetworkInterfaces().toList() - .flatMap { it.inetAddresses.toList() } - addresses.any { it is Inet6Address && !it.isLoopbackAddress && !it.isLinkLocalAddress } - } catch (e: Exception) { - false - } - } -} -``` - -**iOS 实现**: - -```swift -import Network - -class NetworkMonitor { - static let shared = NetworkMonitor() - - private let monitor = NWPathMonitor() - private let queue = DispatchQueue(label: "httpdns.network") - private var lastInterfaceType: NWInterface.InterfaceType? - - var onNetworkChanged: (() -> Void)? 
- - func start() { - monitor.pathUpdateHandler = { [weak self] path in - let currentType = path.availableInterfaces.first?.type - if currentType != self?.lastInterfaceType { - print("HTTPDNS: Network changed \(self?.lastInterfaceType?.description ?? "nil") -> \(currentType?.description ?? "nil")") - self?.lastInterfaceType = currentType - DispatchQueue.main.async { - self?.onNetworkChanged?() - } - } - } - monitor.start(queue: queue) - } - - func stop() { - monitor.cancel() - } - - func isIPv6Supported() -> Bool { - let path = monitor.currentPath - return path.supportsIPv6 - } -} -``` - -#### 3.4.4 容错降级模块 (Failover) - -**策略**: -1. 多节点轮询:主节点失败自动切备用 -2. 超时降级:HTTPDNS 超时自动降级到系统 DNS -3. 黑名单机制:特定域名强制走系统 DNS - -**Android 实现**: - -```kotlin -class HTTPDNSResolver( - private val serverUrls: List, // 多个服务节点 - private val cacheManager: HTTPDNSCacheManager, - private val timeout: Long = 3000L -) { - private val blacklist = setOf("localhost", "*.local", "*.internal") - private var currentServerIndex = 0 - - suspend fun resolve(host: String): ResolveResult { - // 1. 黑名单检查 - if (isBlacklisted(host)) { - return ResolveResult(systemResolve(host), ResolveSource.SYSTEM_DNS) - } - - // 2. 查缓存 - val (cached, needRefresh) = cacheManager.get(host) - if (cached != null) { - if (needRefresh) { - // 后台异步刷新 - CoroutineScope(Dispatchers.IO).launch { fetchFromServer(host) } - } - return ResolveResult(cached.ips, ResolveSource.CACHE) - } - - // 3. 请求服务器(带重试) - return try { - withTimeout(timeout) { - val ips = fetchFromServerWithRetry(host) - ResolveResult(ips, ResolveSource.HTTPDNS) - } - } catch (e: TimeoutCancellationException) { - // 4. 降级到系统 DNS - Log.w("HTTPDNS", "Timeout, fallback to system DNS") - ResolveResult(systemResolve(host), ResolveSource.SYSTEM_DNS) - } - } - - private suspend fun fetchFromServerWithRetry(host: String): List { - var lastError: Exception? 
= null - repeat(serverUrls.size) { attempt -> - try { - return fetchFromServer(host) - } catch (e: Exception) { - lastError = e - currentServerIndex = (currentServerIndex + 1) % serverUrls.size - Log.w("HTTPDNS", "Server ${serverUrls[currentServerIndex]} failed, trying next") - } - } - throw lastError ?: Exception("All servers failed") - } - - private suspend fun fetchFromServer(host: String): List { - val url = "${serverUrls[currentServerIndex]}/httpdns/resolve?host=$host" - val response = httpClient.get(url) - val result = parseResponse(response.body) - cacheManager.put(host, result.ips, result.ipsV6, result.ttl) - return result.ips - } - - private fun systemResolve(host: String): List { - return try { - InetAddress.getAllByName(host).map { it.hostAddress } - } catch (e: Exception) { - emptyList() - } - } - - private fun isBlacklisted(host: String): Boolean { - return blacklist.any { pattern -> - if (pattern.startsWith("*")) { - host.endsWith(pattern.substring(1)) - } else { - host == pattern - } - } - } - - enum class ResolveSource { CACHE, HTTPDNS, SYSTEM_DNS } - data class ResolveResult(val ips: List, val source: ResolveSource) -} -``` - -#### 3.4.5 监控上报模块 (Reporter) - -**上报指标**: -- 解析成功率 -- 解析耗时 -- 缓存命中率 -- 降级次数 - -**数据结构**: - -```kotlin -data class HTTPDNSStats( - val host: String, - val resolveCount: Int, - val successCount: Int, - val cacheHitCount: Int, - val fallbackCount: Int, - val avgLatencyMs: Long, - val timestamp: Long -) -``` - -**Android 实现**: - -```kotlin -class HTTPDNSReporter(private val reportUrl: String) { - private val stats = mutableMapOf() - private val reportInterval = 60_000L // 60秒上报一次 - - data class MutableStats( - var resolveCount: Int = 0, - var successCount: Int = 0, - var cacheHitCount: Int = 0, - var fallbackCount: Int = 0, - var totalLatencyMs: Long = 0 - ) - - fun recordResolve(host: String, source: ResolveSource, latencyMs: Long, success: Boolean) { - synchronized(stats) { - val s = stats.getOrPut(host) { MutableStats() } - 
s.resolveCount++ - if (success) s.successCount++ - s.totalLatencyMs += latencyMs - when (source) { - ResolveSource.CACHE -> s.cacheHitCount++ - ResolveSource.SYSTEM_DNS -> s.fallbackCount++ - else -> {} - } - } - } - - fun startPeriodicReport() { - CoroutineScope(Dispatchers.IO).launch { - while (true) { - delay(reportInterval) - report() - } - } - } - - private suspend fun report() { - val snapshot = synchronized(stats) { - val copy = stats.toMap() - stats.clear() - copy - } - - if (snapshot.isEmpty()) return - - val reports = snapshot.map { (host, s) -> - HTTPDNSStats( - host = host, - resolveCount = s.resolveCount, - successCount = s.successCount, - cacheHitCount = s.cacheHitCount, - fallbackCount = s.fallbackCount, - avgLatencyMs = if (s.resolveCount > 0) s.totalLatencyMs / s.resolveCount else 0, - timestamp = System.currentTimeMillis() - ) - } - - try { - httpClient.post(reportUrl) { - contentType(ContentType.Application.Json) - setBody(reports) - } - } catch (e: Exception) { - Log.e("HTTPDNS", "Report failed: ${e.message}") - } - } -} -``` - -#### 3.4.6 完整 SDK 集成示例 - -**Android 初始化**: - -```kotlin -class HTTPDNSManager private constructor(context: Context) { - private val cacheManager = HTTPDNSCacheManager(context) - private val networkMonitor = NetworkMonitor(context) - private val resolver = HTTPDNSResolver( - serverUrls = listOf( - "https://httpdns1.goedge.cn", - "https://httpdns2.goedge.cn" - ), - cacheManager = cacheManager - ) - private val signer = HTTPDNSSigner(appId = "ge_xxx", appSecret = "xxx") - private val reporter = HTTPDNSReporter("https://api.goedge.cn/httpdns/report") - - companion object { - @Volatile - private var instance: HTTPDNSManager? 
= null - - fun init(context: Context): HTTPDNSManager { - return instance ?: synchronized(this) { - instance ?: HTTPDNSManager(context.applicationContext).also { instance = it } - } - } - - fun get(): HTTPDNSManager = instance ?: throw IllegalStateException("Must call init first") - } - - init { - // 监听网络变化 - networkMonitor.onNetworkChanged = { - Log.d("HTTPDNS", "Network changed, clearing cache") - cacheManager.clear() - } - networkMonitor.start() - - // 启动监控上报 - reporter.startPeriodicReport() - } - - suspend fun resolve(host: String): List { - val startTime = System.currentTimeMillis() - val result = resolver.resolve(host) - val latency = System.currentTimeMillis() - startTime - reporter.recordResolve(host, result.source, latency, result.ips.isNotEmpty()) - return result.ips - } - - fun signRequest(request: Request): Request = signer.sign(request) - - /** - * 预解析核心域名(App 启动时调用) - */ - suspend fun prefetch(hosts: List) { - hosts.forEach { host -> - launch { resolve(host) } - } - } -} -``` - -**使用示例**: - -```kotlin -// Application.onCreate -HTTPDNSManager.init(this) - -// 预解析 -lifecycleScope.launch { - HTTPDNSManager.get().prefetch(listOf( - "api.example.com", - "cdn.example.com", - "img.example.com" - )) -} - -// 业务请求 -lifecycleScope.launch { - val ips = HTTPDNSManager.get().resolve("api.example.com") - if (ips.isNotEmpty()) { - val request = Request.Builder() - .url("https://${ips[0]}/v1/user") - .header("Host", "api.example.com") - .build() - val signedRequest = HTTPDNSManager.get().signRequest(request) - // 发起请求... 
- } -} -``` - ---- - -## 四、实施计划 - -### 4.1 任务清单 - -| 阶段 | 任务 | 工时 | -|------|------|------| -| **Phase 1** | 数据库 + DAO + gRPC | 3天 | -| **Phase 2** | Edge-DNS 接口 | 1天 | -| **Phase 3** | Edge-Node WAF | 1天 | -| **Phase 4** | Edge-Admin UI | 2天 | -| **Phase 5** | SDK 核心 (解析+签名) | 1天 | -| **Phase 6** | SDK 缓存模块 (内存+持久化+LRU+软过期) | 1天 | -| **Phase 7** | SDK 网络感知 (切换监听+IPv6检测) | 0.5天 | -| **Phase 8** | SDK 容错降级 (多节点重试+系统DNS降级) | 0.5天 | -| **Phase 9** | SDK 监控上报 | 0.5天 | -| **Phase 10** | SDK 集成测试 + 文档 | 0.5天 | -| **总计** | | **11天** | - -### 4.2 文件变更清单 - -| 操作 | 文件路径 | -|------|----------| -| 新建 | `EdgeDNS/internal/nodes/httpdns.go` | -| 修改 | `EdgeDNS/internal/nodes/server.go` | -| 新建 | `EdgeNode/internal/waf/checkpoints/checkpoint_httpdns_fingerprint.go` | -| 修改 | `EdgeNode/internal/waf/checkpoints/init.go` | -| 新建 | `EdgeAPI/internal/db/models/httpdns_app_dao.go` | -| 新建 | `EdgeAPI/internal/rpc/services/service_httpdns_app.go` | -| 新建 | `EdgeCommon/pkg/rpc/protos/service_httpdns_app.proto` | -| 新建 | `EdgeAdmin/internal/web/actions/httpdns/*.go` | -| 新建 | `EdgeAdmin/web/views/httpdns/apps/*.html` | - ---- - -## 五、测试验证 - -### 5.1 服务端测试 - -```bash -# 1. 测试 HTTPDNS 解析 -curl "https://httpdns.example.com/httpdns/resolve?host=api.example.com" - -# 2. 测试无签名访问业务(应被拦截) -curl "https://1.2.3.4/v1/user" -H "Host: api.example.com" - -# 3. 
测试带签名访问 -curl "https://1.2.3.4/v1/user" \ - -H "Host: api.example.com" \ - -H "X-GE-AppID: ge_abc123" \ - -H "X-GE-Timestamp: 1700000000" \ - -H "X-GE-Token: " -``` - -### 5.2 SDK 测试矩阵 - -| 模块 | 测试场景 | 预期结果 | -|------|----------|----------| -| **缓存** | 首次解析 | 请求服务器,写入缓存 | -| **缓存** | 缓存有效期内再次解析 | 直接返回缓存,不发请求 | -| **缓存** | 缓存软过期 | 返回旧数据,后台异步刷新 | -| **缓存** | 缓存完全过期 | 重新请求服务器 | -| **缓存** | 缓存超过100条 | LRU 淘汰最旧条目 | -| **网络感知** | WiFi → 4G 切换 | 清空所有缓存 | -| **网络感知** | 4G → WiFi 切换 | 清空所有缓存 | -| **网络感知** | IPv6 检测(双栈网络) | 返回 true | -| **容错** | 主节点超时 | 自动切换备用节点 | -| **容错** | 所有节点失败 | 降级到系统 DNS | -| **容错** | 黑名单域名 | 直接走系统 DNS | -| **监控** | 60 秒内多次解析 | 统计聚合后上报 | -| **监控** | 上报失败 | 静默失败,不影响解析 | - -### 5.3 Android 单元测试示例 - -```kotlin -@Test -fun `cache hit returns immediately without network request`() = runTest { - // Given - cacheManager.put("api.example.com", listOf("1.2.3.4"), emptyList(), 600) - - // When - val result = resolver.resolve("api.example.com") - - // Then - assertEquals(listOf("1.2.3.4"), result.ips) - assertEquals(ResolveSource.CACHE, result.source) - verify(httpClient, never()).get(any()) -} - -@Test -fun `soft expired cache triggers background refresh`() = runTest { - // Given: cache with TTL=1s, soft expire = TTL+60s - cacheManager.put("api.example.com", listOf("1.2.3.4"), emptyList(), 1) - advanceTimeBy(2000) // TTL expired but within soft expire - - // When - val result = resolver.resolve("api.example.com") - - // Then - assertEquals(listOf("1.2.3.4"), result.ips) // Returns stale data - advanceUntilIdle() - verify(httpClient).get(any()) // Background refresh triggered -} - -@Test -fun `network change clears cache`() = runTest { - // Given - cacheManager.put("api.example.com", listOf("1.2.3.4"), emptyList(), 600) - - // When - networkMonitor.simulateNetworkChange() - - // Then - val (cached, _) = cacheManager.get("api.example.com") - assertNull(cached) -} - -@Test -fun `fallback to system DNS on timeout`() = runTest { - // Given - 
whenever(httpClient.get(any())).thenThrow(TimeoutException()) - - // When - val result = resolver.resolve("api.example.com") - - // Then - assertEquals(ResolveSource.SYSTEM_DNS, result.source) -} -``` - ---- - -## 六、上线清单 - -### 6.1 服务端部署 - -- [ ] 数据库迁移(edgeHTTPDNSApps 表) -- [ ] Edge-API 部署 -- [ ] Edge-DNS 部署 -- [ ] Edge-Node 部署(含 WAF 指纹校验) -- [ ] Edge-Admin 部署(App 管理 UI) -- [ ] 创建测试 App(获取 AppID/Secret) - -### 6.2 SDK 发布 - -- [ ] Android SDK 单元测试通过 -- [ ] iOS SDK 单元测试通过 -- [ ] Android SDK 集成测试(真机) -- [ ] iOS SDK 集成测试(真机) -- [ ] SDK 打包发布(Maven/CocoaPods) -- [ ] SDK 接入文档发布 - -### 6.3 SDK 功能验收 - -- [ ] 缓存命中验证 -- [ ] 软过期刷新验证 -- [ ] LRU 淘汰验证 -- [ ] 网络切换清缓存验证 -- [ ] 多节点切换验证 -- [ ] 系统 DNS 降级验证 -- [ ] 监控数据上报验证 -- [ ] 预解析功能验证 - diff --git a/HttpDNS SDK 功能设计规范.pdf b/HttpDNS SDK 功能设计规范.pdf deleted file mode 100644 index 99aa321..0000000 Binary files a/HttpDNS SDK 功能设计规范.pdf and /dev/null differ diff --git a/default.etcd/.DS_Store b/default.etcd/.DS_Store deleted file mode 100644 index e2c7bea..0000000 Binary files a/default.etcd/.DS_Store and /dev/null differ diff --git a/default.etcd/member/.DS_Store b/default.etcd/member/.DS_Store deleted file mode 100644 index c830ffe..0000000 Binary files a/default.etcd/member/.DS_Store and /dev/null differ diff --git a/default.etcd/member/snap/db b/default.etcd/member/snap/db deleted file mode 100644 index 8034dd3..0000000 Binary files a/default.etcd/member/snap/db and /dev/null differ diff --git a/default.etcd/member/wal/0.tmp b/default.etcd/member/wal/0.tmp deleted file mode 100644 index ac8a519..0000000 Binary files a/default.etcd/member/wal/0.tmp and /dev/null differ diff --git a/default.etcd/member/wal/0000000000000000-0000000000000000.wal b/default.etcd/member/wal/0000000000000000-0000000000000000.wal deleted file mode 100644 index 60e7fea..0000000 Binary files a/default.etcd/member/wal/0000000000000000-0000000000000000.wal and /dev/null differ diff --git a/deploy/clickhouse/README.md b/deploy/clickhouse/README.md index 
0767a56..d2b76af 100644 --- a/deploy/clickhouse/README.md +++ b/deploy/clickhouse/README.md @@ -1,111 +1,197 @@ -# ClickHouse + Fluent Bit 使用手册(Ubuntu 22.04 / Amazon Linux 2023) +# ClickHouse + Fluent Bit 快速部署(Ubuntu 22.04 / Amazon Linux 2023) -## 1. 支持范围 +## 1. 脚本说明 -- Ubuntu 22.04 -- Amazon Linux 2023(AWS) - -安装脚本:`install_clickhouse_linux.sh`(自动识别上述系统)。 - -## 2. 安装 ClickHouse +- `setup_clickhouse.sh`:一键入口(推荐),默认顺序执行 安装 ClickHouse -> 配置 HTTPS -> 应用运行参数 -> 初始化日志表。 +- `install_clickhouse_linux.sh`:安装 `clickhouse-server`、`clickhouse-client`,并启动服务。 +- `configure_clickhouse_https.sh`:生成自签名 `server.crt + server.key`,写入 HTTPS 配置并重启服务。 +- `configure_clickhouse_runtime.sh`:默认将日志级别设为 `warning`,并禁用高开销系统日志表(`text_log`、`part_log`、`metric_log`、`asynchronous_metric_log`、`trace_log`)。 +- `init_waf_logs_tables.sh`:执行建表脚本。 +- `init_waf_logs_tables.sql`:`logs_ingest`、`dns_logs_ingest` 表结构定义。 +进入脚本所在目录 ```bash -cd /path/to/waf-platform/deploy/clickhouse -chmod +x install_clickhouse_linux.sh -sudo ./install_clickhouse_linux.sh +cd /opt/waf-platform/deploy/clickhouse +chmod +x setup_clickhouse.sh ``` -可选:安装时初始化 `default` 用户密码: +## 2. 一键部署 + +### 2.1 方式A:不设置 ClickHouse 密码(用户名固定 `default`) ```bash -sudo CLICKHOUSE_DEFAULT_PASSWORD='YourStrongPassword' ./install_clickhouse_linux.sh -``` - -## 3. 开启 HTTPS(默认仅 crt+key) - -脚本默认生成 `server.crt + server.key`(带 SAN)并启用 8443: - -```bash -cd /path/to/waf-platform/deploy/clickhouse -chmod +x configure_clickhouse_https.sh -sudo CH_HTTPS_PORT=8443 \ - CH_CERT_CN=clickhouse.example.com \ - CH_CERT_DNS=clickhouse.example.com \ - CH_CERT_IP= \ - ./configure_clickhouse_https.sh -``` - -使用已有证书: - -```bash -sudo SRC_CERT=/path/to/server.crt \ - SRC_KEY=/path/to/server.key \ - CH_HTTPS_PORT=8443 \ - ./configure_clickhouse_https.sh -``` - -## 4. 
初始化日志表(含优化) - -```bash -cd /path/to/waf-platform/deploy/clickhouse -chmod +x init_waf_logs_tables.sh -sudo CH_HOST=127.0.0.1 \ - CH_PORT=9000 \ - CH_USER=default \ - CH_PASSWORD='YourStrongPassword' \ - CH_DATABASE=default \ - ./init_waf_logs_tables.sh +sudo ./setup_clickhouse.sh ``` 说明: -- `init_waf_logs_tables.sql` 已内置主要优化(`CODEC`、`LowCardinality`、跳数索引)。 -- `optimize_schema.sql` 主要用于历史表补齐优化,不是首次建表必需步骤。 +- ClickHouse 连接用户是 `default` +- 未设置密码时,后续平台连接密码留空 -## 5. 平台侧配置(EdgeAdmin) +### 2.2 方式B:设置用户名/密码(示例使用 `default`) -在 ClickHouse 设置页配置: +```bash +sudo CH_USER='default' \ + CH_PASSWORD='YourStrongPassword' \ + CH_DATABASE='default' \ + ./setup_clickhouse.sh +``` -- Host:ClickHouse 地址 -- Port:`8443` -- Database:`default` -- Scheme:`https` +说明: +- `CH_USER`/`CH_PASSWORD`:初始化日志表时用于连接 ClickHouse +- 如果你使用自定义用户,把 `CH_USER` 改为你的用户名,并保证该用户已有对应数据库权限 -当前实现说明: -- 前端不再提供 `TLS跳过校验` 和 `TLS Server Name` 配置项。 -- 后端固定 `TLSSkipVerify=true`(默认不校验证书)。 +可选:单独应用运行参数(日志级别/系统日志表开关): -保存后点击“测试连接”。 +```bash +sudo CH_LOG_LEVEL=warning ./setup_clickhouse.sh runtime +``` -## 6. Fluent Bit 配置方式 +## 3. ClickHouse 安装后关键目录 -推荐平台托管模式(在线安装/升级 Node、DNS 时自动下发): +- 配置目录:`/etc/clickhouse-server/` +- 客户端配置目录:`/etc/clickhouse-client/` +- 数据目录:`/var/lib/clickhouse/` +- 日志目录:`/var/log/clickhouse-server/` +- HTTPS 覆盖配置:`/etc/clickhouse-server/config.d/waf-https.xml` +- 运行参数覆盖配置:`/etc/clickhouse-server/config.d/waf-runtime.xml` +- HTTPS 证书和私钥:`/etc/clickhouse-server/server.crt`、`/etc/clickhouse-server/server.key` +- 证书生成中间文件目录:`/etc/clickhouse-server/pki/` -- `/etc/fluent-bit/fluent-bit.conf` -- `/etc/fluent-bit/.edge-managed.env` -- `/etc/fluent-bit/.edge-managed.json` +## 4. 管理平台配置(EdgeAdmin) -检查状态: +页面路径: +- 左侧菜单:`系统设置` -> `高级设置` +- 顶部标签:`日志数据库(ClickHouse)` + +表单填写: +- `连接地址(Host)`:ClickHouse 地址(IP 或域名),如 `10.0.0.8` 或 `clickhouse.example.com` +- `协议(Scheme)`:`https` +- `端口(Port)`:`8443` +- `用户名(User)`:`default`(或你自定义的用户名) +- `密码(Password)`:对应用户密码 +- `数据库(Database)`:`default`(或你初始化日志表时使用的库名) + +提交顺序: +1. 
点“测试连接” +2. 连接成功后点“保存” + +## 5. Fluent Bit(两种方式) + +### 5.1 跟随节点在线自动安装(推荐) + +说明: +- Node / DNS 在线安装或升级时,平台会自动安装/升级 Fluent Bit 并下发配置。 +- 默认由平台托管,不需要逐台手改配置文件。 + +安装后所在节点关键文件: +- `/etc/fluent-bit/fluent-bit.conf`:Fluent Bit 主配置(输入日志路径、输出 ClickHouse、性能参数)。 +- `/etc/fluent-bit/parsers.conf`:日志解析器定义(当前主要使用 JSON parser)。 +- `/etc/fluent-bit/.edge-managed.env`:平台下发的 ClickHouse 认证环境变量(`CH_USER`/`CH_PASSWORD`)。 +- `/etc/fluent-bit/.edge-managed.json`:平台下发的元数据(角色、配置哈希、版本、更新时间)。 + + +说明: +- 在线安装时,节点上的 `/etc/fluent-bit/fluent-bit.conf` 会被平台下发覆盖。 + +fluent-bit中ClickHouse 账号密码下发与更新逻辑: +- 下发来源:管理平台 -日志数据库(ClickHouse)中保存的账号密码。 +- 落地文件:平台在线安装或升级时写入节点 `/etc/fluent-bit/.edge-managed.env`,内容为 `CH_USER`、`CH_PASSWORD`。 +- 更新触发:当平台里的 ClickHouse 账号或密码变更后,需触发一次节点安装/升级任务以下发新凭证。 + +- 常见问题:只在 ClickHouse 侧改密码、未同步更新平台配置时,Fluent Bit 会出现认证失败(401/unauthorized)。 + +高配机器调优(当前默认按 4C8G 参数): +- 当前默认参数:`Flush=1`、`storage.backlog.mem_limit=512MB`、`Mem_Buf_Limit=256MB`、`workers=2`。 +- 机器升配后优先调这 4 个参数: + - `storage.backlog.mem_limit`:总缓冲上限(先增大,降低突发堆积丢日志风险)。 + - `Mem_Buf_Limit`:每个 tail input 的内存缓冲(HTTP 与 DNS 两段都要改)。 + - `workers`:输出并发写入线程数(HTTP 与 DNS 两段都要改)。 + - `Flush`:刷盘/发送间隔(值越小越实时,CPU/网络开销更高)。 +- 8C16G 参考值可按 `deploy/fluent-bit/fluent-bit-sample-8c16g.conf`: + - `storage.backlog.mem_limit=1024MB` + - `Mem_Buf_Limit=512MB` + - `workers=4` + - `Refresh_Interval=1` +- 修改方法: + 1. 编辑 `EdgeAPI/internal/installers/fluent_bit.go` 的 `renderManagedConfig()`。 + 2. 按上面参数同步修改 Node/DNS 两段 `[INPUT]` 和 `[OUTPUT]`。 + 3. 重新发布 API 并触发节点安装/升级任务,下发新配置。 + +检查: ```bash sudo systemctl status fluent-bit --no-pager sudo cat /etc/fluent-bit/.edge-managed.json +sudo journalctl -u fluent-bit -n 100 --no-pager ``` -## 7. 验证与排障 +### 5.2 手动安装(自动安装失败时) -查看 Fluent Bit 日志: +说明: +- 适合节点在线自动安装 Fluent Bit 失败的场景。 +- 采用在线安装方式,由你手动安装并维护配置。 + +步骤: + +1. 
在线安装 Fluent Bit。 + +Ubuntu 22.04: + +```bash +curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh +sudo apt-get update -y +sudo apt-get install -y fluent-bit +``` + +Amazon Linux 2023: + +```bash +curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh +sudo dnf makecache -y +sudo dnf install -y fluent-bit +``` + +2. 放置配置文件: + +```bash +sudo mkdir -p /etc/fluent-bit +sudo cp /opt/waf-platform/deploy/fluent-bit/fluent-bit.conf /etc/fluent-bit/ +sudo cp /opt/waf-platform/deploy/fluent-bit/clickhouse-upstream.conf /etc/fluent-bit/ +sudo cp /opt/waf-platform/deploy/fluent-bit/parsers.conf /etc/fluent-bit/ +``` + +3. 修改 `/etc/fluent-bit/clickhouse-upstream.conf` 的 ClickHouse `Host`、`Port`(如 `8443`)。 +4. 配置认证环境变量(按需): + +```bash +sudo tee /etc/fluent-bit/fluent-bit.env >/dev/null <<'EOF' +CH_USER=default +CH_PASSWORD=YourStrongPassword +EOF +``` + +5. 让 systemd 读取环境变量: + +```bash +sudo mkdir -p /etc/systemd/system/fluent-bit.service.d +sudo tee /etc/systemd/system/fluent-bit.service.d/override.conf >/dev/null <<'EOF' +[Service] +EnvironmentFile=/etc/fluent-bit/fluent-bit.env +EOF +``` + +6. 启动并检查: + +```bash +sudo systemctl daemon-reload +sudo systemctl enable fluent-bit +sudo systemctl restart fluent-bit +sudo systemctl status fluent-bit --no-pager +sudo journalctl -u fluent-bit -n 100 --no-pager +``` + +## 6. 
验证 ```bash sudo journalctl -u fluent-bit -f ``` - -查看写入: - -```sql -SELECT count() FROM default.logs_ingest; -SELECT count() FROM default.dns_logs_ingest; -``` - -常见错误: -- `connection refused`:8443 未监听或网络未放行。 -- `legacy Common Name`:证书缺 SAN,需重签。 diff --git a/deploy/clickhouse/configure_clickhouse_https.sh b/deploy/clickhouse/configure_clickhouse_https.sh index 783cc37..ee120ea 100644 --- a/deploy/clickhouse/configure_clickhouse_https.sh +++ b/deploy/clickhouse/configure_clickhouse_https.sh @@ -46,18 +46,12 @@ CH_CERT_CN="${CH_CERT_CN:-$(hostname -f 2>/dev/null || hostname)}" CH_CERT_DNS="${CH_CERT_DNS:-}" CH_CERT_IP="${CH_CERT_IP:-}" CH_CERT_DAYS="${CH_CERT_DAYS:-825}" -CH_GENERATE_CA="${CH_GENERATE_CA:-false}" - -SRC_CERT="${SRC_CERT:-}" -SRC_KEY="${SRC_KEY:-}" -SRC_CA="${SRC_CA:-}" CH_DIR="/etc/clickhouse-server" CH_CONFIG_D_DIR="${CH_DIR}/config.d" PKI_DIR="${CH_DIR}/pki" SERVER_CERT="${CH_DIR}/server.crt" SERVER_KEY="${CH_DIR}/server.key" -CA_CERT="${CH_DIR}/ca.crt" OVERRIDE_FILE="${CH_CONFIG_D_DIR}/waf-https.xml" mkdir -p "${CH_CONFIG_D_DIR}" "${PKI_DIR}" @@ -117,72 +111,13 @@ EOF cp -f "${server_crt}" "${SERVER_CERT}" cp -f "${server_key}" "${SERVER_KEY}" - rm -f "${CA_CERT}" } -generate_cert_with_ca() { - echo "[INFO] generating local CA and server certificate ..." 
- local ca_key="${PKI_DIR}/ca.key" - local ca_crt="${PKI_DIR}/ca.crt" - local server_key="${PKI_DIR}/server.key" - local server_csr="${PKI_DIR}/server.csr" - local server_crt="${PKI_DIR}/server.crt" - local ext_file="${PKI_DIR}/server.ext" - local san_line - san_line="$(build_san_line)" - - openssl genrsa -out "${ca_key}" 4096 - openssl req -x509 -new -nodes -key "${ca_key}" -sha256 -days 3650 \ - -out "${ca_crt}" -subj "/CN=ClickHouse Local CA" - - openssl genrsa -out "${server_key}" 2048 - openssl req -new -key "${server_key}" -out "${server_csr}" -subj "/CN=${CH_CERT_CN}" - - cat >"${ext_file}" <"${OVERRIDE_FILE}" <"${OVERRIDE_FILE}" < + + ${CH_LOG_LEVEL} + + + + + + + + +EOF + +echo "[INFO] restarting clickhouse-server ..." +systemctl restart clickhouse-server +sleep 2 + +echo "[INFO] service status ..." +systemctl --no-pager -l status clickhouse-server | sed -n '1,15p' + +echo "[OK] ClickHouse runtime config applied" +echo " file : ${OVERRIDE_FILE}" +echo " logger level: ${CH_LOG_LEVEL}" diff --git a/deploy/clickhouse/optimize_schema.sql b/deploy/clickhouse/optimize_schema.sql deleted file mode 100644 index 2ad8586..0000000 --- a/deploy/clickhouse/optimize_schema.sql +++ /dev/null @@ -1,123 +0,0 @@ --- ============================================================================= --- ClickHouse logs_ingest 表优化脚本 --- --- 说明: --- - 所有 ALTER 操作均为在线操作,无需停服 --- - 建议按阶段顺序执行,每阶段执行后观察 system.parts 确认生效 --- - 压缩编解码器变更仅影响新写入的 part,存量数据需等 merge 或手动 OPTIMIZE --- --- 执行方式: --- clickhouse-client --host 127.0.0.1 --port 9000 --user default --password 'xxx' < optimize_schema.sql --- ============================================================================= - --- ============================================= --- 阶段 1:大字段压缩优化(效果最显著) --- ============================================= - --- 大文本字段改用 ZSTD(3),对 JSON / HTTP 文本压缩率远优于默认 LZ4 --- 预期效果:磁盘占用减少 40%-60% -ALTER TABLE logs_ingest MODIFY COLUMN request_headers String CODEC(ZSTD(3)); -ALTER TABLE logs_ingest MODIFY COLUMN 
request_body String CODEC(ZSTD(3)); -ALTER TABLE logs_ingest MODIFY COLUMN response_headers String CODEC(ZSTD(3)); -ALTER TABLE logs_ingest MODIFY COLUMN response_body String CODEC(ZSTD(3)); - --- 中等长度文本字段用 ZSTD(1),平衡压缩率与 CPU 开销 -ALTER TABLE logs_ingest MODIFY COLUMN ua String CODEC(ZSTD(1)); -ALTER TABLE logs_ingest MODIFY COLUMN path String CODEC(ZSTD(1)); -ALTER TABLE logs_ingest MODIFY COLUMN referer String CODEC(ZSTD(1)); - --- 低基数字段改用 LowCardinality(内存+磁盘双降) --- method 的基数极低(GET/POST/PUT/DELETE 等),host 基数取决于站点数量 -ALTER TABLE logs_ingest MODIFY COLUMN method LowCardinality(String); -ALTER TABLE logs_ingest MODIFY COLUMN log_type LowCardinality(String); -ALTER TABLE logs_ingest MODIFY COLUMN host LowCardinality(String); - --- 数值字段使用 Delta + ZSTD 编码(利用相邻行的时间/大小相关性) -ALTER TABLE logs_ingest MODIFY COLUMN bytes_in UInt64 CODEC(Delta, ZSTD(1)); -ALTER TABLE logs_ingest MODIFY COLUMN bytes_out UInt64 CODEC(Delta, ZSTD(1)); -ALTER TABLE logs_ingest MODIFY COLUMN cost_ms UInt32 CODEC(Delta, ZSTD(1)); - --- ============================================= --- 阶段 2:添加 Skipping Index(加速高频过滤查询) --- ============================================= - --- trace_id 精确查找(查看日志详情 FindByTraceId) --- bloom_filter(0.01) = 1% 误判率,GRANULARITY 4 = 每 4 个 granule 一个 bloom block -ALTER TABLE logs_ingest ADD INDEX IF NOT EXISTS idx_trace_id trace_id TYPE bloom_filter(0.01) GRANULARITY 4; - --- IP 精确查找 -ALTER TABLE logs_ingest ADD INDEX IF NOT EXISTS idx_ip ip TYPE bloom_filter(0.01) GRANULARITY 4; - --- host 模糊查询支持(tokenbf_v1 对 LIKE '%xxx%' 有效) -ALTER TABLE logs_ingest ADD INDEX IF NOT EXISTS idx_host host TYPE tokenbf_v1(10240, 3, 0) GRANULARITY 4; - --- firewall_policy_id 过滤(HasFirewallPolicy: > 0) -ALTER TABLE logs_ingest ADD INDEX IF NOT EXISTS idx_fw_policy firewall_policy_id TYPE minmax GRANULARITY 4; - --- status 范围过滤(HasError: status >= 400) -ALTER TABLE logs_ingest ADD INDEX IF NOT EXISTS idx_status status TYPE minmax GRANULARITY 4; - --- ============================================= 
--- 阶段 3:物化索引到现有数据(对存量数据生效) --- ============================================= --- 注意:MATERIALIZE INDEX 会触发后台 mutation,大表可能需要一定时间 --- 可通过 SELECT * FROM system.mutations WHERE is_done = 0 监控进度 - -ALTER TABLE logs_ingest MATERIALIZE INDEX idx_trace_id; -ALTER TABLE logs_ingest MATERIALIZE INDEX idx_ip; -ALTER TABLE logs_ingest MATERIALIZE INDEX idx_host; -ALTER TABLE logs_ingest MATERIALIZE INDEX idx_fw_policy; -ALTER TABLE logs_ingest MATERIALIZE INDEX idx_status; - - --- ============================================================================= --- dns_logs_ingest 表优化(DNS 日志表) --- ============================================================================= - --- 大文本字段压缩 -ALTER TABLE dns_logs_ingest MODIFY COLUMN content_json String CODEC(ZSTD(3)); -ALTER TABLE dns_logs_ingest MODIFY COLUMN error String CODEC(ZSTD(1)); - --- 低基数字段 -ALTER TABLE dns_logs_ingest MODIFY COLUMN question_type LowCardinality(String); -ALTER TABLE dns_logs_ingest MODIFY COLUMN record_type LowCardinality(String); -ALTER TABLE dns_logs_ingest MODIFY COLUMN networking LowCardinality(String); - --- request_id 精确查找 -ALTER TABLE dns_logs_ingest ADD INDEX IF NOT EXISTS idx_request_id request_id TYPE bloom_filter(0.01) GRANULARITY 4; - --- remote_addr 精确查找 -ALTER TABLE dns_logs_ingest ADD INDEX IF NOT EXISTS idx_remote_addr remote_addr TYPE bloom_filter(0.01) GRANULARITY 4; - --- question_name 模糊查询 -ALTER TABLE dns_logs_ingest ADD INDEX IF NOT EXISTS idx_question_name question_name TYPE tokenbf_v1(10240, 3, 0) GRANULARITY 4; - --- domain_id 过滤 -ALTER TABLE dns_logs_ingest ADD INDEX IF NOT EXISTS idx_domain_id domain_id TYPE minmax GRANULARITY 4; - --- 物化索引到现有数据 -ALTER TABLE dns_logs_ingest MATERIALIZE INDEX idx_request_id; -ALTER TABLE dns_logs_ingest MATERIALIZE INDEX idx_remote_addr; -ALTER TABLE dns_logs_ingest MATERIALIZE INDEX idx_question_name; -ALTER TABLE dns_logs_ingest MATERIALIZE INDEX idx_domain_id; - - --- ============================================================================= 
--- 验证命令(执行完上述 ALTER 后运行) --- ============================================================================= - --- 查看列的压缩编解码器 --- SELECT name, type, compression_codec FROM system.columns WHERE table = 'logs_ingest' AND database = currentDatabase(); - --- 查看表的压缩率 --- SELECT table, formatReadableSize(sum(data_compressed_bytes)) AS compressed, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed, round(sum(data_uncompressed_bytes) / sum(data_compressed_bytes), 2) AS ratio FROM system.columns WHERE table IN ('logs_ingest', 'dns_logs_ingest') GROUP BY table; - --- 查看各列占用的磁盘空间(找出最大的列) --- SELECT name, formatReadableSize(sum(data_compressed_bytes)) AS compressed, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed FROM system.columns WHERE table = 'logs_ingest' GROUP BY name ORDER BY sum(data_compressed_bytes) DESC; - --- 查看 mutation 进度 --- SELECT database, table, mutation_id, command, is_done, parts_to_do FROM system.mutations WHERE is_done = 0; - --- 强制触发 merge(可选,让压缩编解码器变更对存量数据生效) --- OPTIMIZE TABLE logs_ingest FINAL; --- OPTIMIZE TABLE dns_logs_ingest FINAL; diff --git a/deploy/clickhouse/setup_clickhouse.sh b/deploy/clickhouse/setup_clickhouse.sh new file mode 100644 index 0000000..9af368d --- /dev/null +++ b/deploy/clickhouse/setup_clickhouse.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INSTALL_SCRIPT="${SCRIPT_DIR}/install_clickhouse_linux.sh" +HTTPS_SCRIPT="${SCRIPT_DIR}/configure_clickhouse_https.sh" +RUNTIME_SCRIPT="${SCRIPT_DIR}/configure_clickhouse_runtime.sh" +TABLES_SCRIPT="${SCRIPT_DIR}/init_waf_logs_tables.sh" + +usage() { + cat <<'EOF' +Usage: + sudo ./setup_clickhouse.sh [all|install|https|runtime|tables] + +Modes: + all Install ClickHouse, configure HTTPS, apply runtime config, init ingest tables (default) + install Install ClickHouse only + https Configure HTTPS only + runtime Apply ClickHouse runtime config only + tables Initialize ingest tables only 
+ +Common env vars: + CLICKHOUSE_DEFAULT_PASSWORD Default user password set during install + CH_HTTPS_PORT HTTPS port (default: 8443) + CH_CERT_CN Certificate CN + CH_CERT_DNS Certificate SAN DNS list (comma-separated) + CH_CERT_IP Certificate SAN IP list (comma-separated) + CH_CERT_DAYS Certificate validity days (default: 825) + CH_LOG_LEVEL ClickHouse logger level (default: warning) + CH_HOST ClickHouse host for table init (default: 127.0.0.1) + CH_PORT ClickHouse port for table init (default: 9000) + CH_USER ClickHouse user for table init (default: default) + CH_PASSWORD ClickHouse password for table init + CH_DATABASE Database for table init (default: default) +EOF +} + +require_script() { + local script="$1" + if [[ ! -f "${script}" ]]; then + echo "[ERROR] required file not found: ${script}" + exit 1 + fi +} + +run_install() { + echo "[INFO] step 1/4: install ClickHouse ..." + bash "${INSTALL_SCRIPT}" +} + +run_https() { + echo "[INFO] step 2/4: configure ClickHouse HTTPS ..." + bash "${HTTPS_SCRIPT}" +} + +run_runtime() { + echo "[INFO] step 3/4: apply ClickHouse runtime config ..." + bash "${RUNTIME_SCRIPT}" +} + +run_tables() { + echo "[INFO] step 4/4: initialize ingest tables ..." 
+ bash "${TABLES_SCRIPT}" +} + +MODE="${1:-all}" + +case "${MODE}" in + -h|--help|help) + usage + exit 0 + ;; + all|install|https|runtime|tables) + ;; + *) + echo "[ERROR] invalid mode: ${MODE}" + usage + exit 1 + ;; +esac + +require_script "${INSTALL_SCRIPT}" +require_script "${HTTPS_SCRIPT}" +require_script "${RUNTIME_SCRIPT}" +require_script "${TABLES_SCRIPT}" + +case "${MODE}" in + all) + run_install + run_https + run_runtime + run_tables + ;; + install) + run_install + ;; + https) + run_https + ;; + runtime) + run_runtime + ;; + tables) + run_tables + ;; +esac + +echo "[OK] setup completed: mode=${MODE}" diff --git a/deploy/fluent-bit/.gitignore b/deploy/fluent-bit/.gitignore deleted file mode 100644 index 7401f92..0000000 --- a/deploy/fluent-bit/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -fluent-bit-windows.conf -clickhouse-upstream-windows.conf diff --git a/deploy/fluent-bit/README.md b/deploy/fluent-bit/README.md deleted file mode 100644 index 3706623..0000000 --- a/deploy/fluent-bit/README.md +++ /dev/null @@ -1,471 +0,0 @@ -# 边缘节点日志链路部署(Fluent Bit + ClickHouse) - -与 [日志链路调整方案](../../log-pipeline-migration-plan.md) 配套的配置与部署说明。本文档为 **Fluent Bit 部署教程**,按步骤即可在边缘节点或日志采集机上跑通采集 → ClickHouse 写入。 - ---- - -## Fluent Bit 跑在哪台机器上? 
- -**Fluent Bit 应部署在写日志文件的节点机器上**(EdgeNode / EdgeDNS 同机),不要部署在 EdgeAPI 机器上。 - -- HTTP 日志文件默认在 `/var/log/edge/edge-node/*.log`,由 **EdgeNode** 本机写入;若配置了公用访问日志策略的文件 `path`,节点会优先复用该 `path` 所在目录。 -- DNS 日志文件默认在 `/var/log/edge/edge-dns/*.log`,由 **EdgeDNS** 本机写入;若配置了公用访问日志策略的文件 `path`,节点会优先复用该 `path` 所在目录。 -- Fluent Bit 使用 **tail** 读取本机路径,因此必须运行在这些日志文件所在机器上。 -- EdgeAPI 机器主要负责查询 ClickHouse/MySQL,不需要承担日志采集。 -- 多机部署时,每台写日志节点都跑一份 Fluent Bit,上报到同一 ClickHouse 集群。 - ---- - -## 一、前置条件 - -- **边缘节点(EdgeNode)** 已开启本地日志落盘,目录优先取“公用访问日志策略”的文件 `path`(取目录),为空时回退 `EDGE_LOG_DIR`,再回退默认 `/var/log/edge/edge-node`;生成 `access.log`、`waf.log`、`error.log`(JSON Lines)。 -- **DNS 节点(EdgeDNS)** 已开启本地日志落盘,目录优先取“公用访问日志策略”的文件 `path`(取目录),为空时回退 `EDGE_DNS_LOG_DIR`,再回退默认 `/var/log/edge/edge-dns`;生成 `access.log`(JSON Lines)。 -- **ClickHouse** 已安装并可访问(单机或集群),且已创建好 `logs_ingest` 表(见下文「五、ClickHouse 建表」)。 -- 若 Fluent Bit 与 ClickHouse 不在同一台机,需保证网络可达(默认 HTTPS 端口 8443)。 -- 日志轮转默认由 Node/DNS 内建 `lumberjack` 执行: - - `maxSizeMB=256` - - `maxBackups=14` - - `maxAgeDays=7` - - `compress=false` - - `localTime=true` - 可通过公用日志策略 `file.rotate` 调整。 - ---- - -## 二、安装 Fluent Bit - -### 2.1 Ubuntu / Debian - -```bash -# 添加 Fluent Bit 官方源并安装(以 Ubuntu 22.04 为例) -curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh -sudo apt-get install -y fluent-bit - -# 或使用 TD Agent Bit 源(若需 ClickHouse 等扩展) -# 见:https://docs.fluentbit.io/manual/installation/linux/ubuntu -``` - -### 2.2 CentOS / RHEL / Amazon Linux - -```bash -# 使用官方 install 脚本 -curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh - -# 或 yum/dnf 安装(以提供的仓库为准) -# sudo yum install -y fluent-bit -``` - -### 2.3 使用二进制包 - -从 [Fluent Bit 官方 Release](https://github.com/fluent/fluent-bit/releases) 下载对应架构的 tarball,解压后将 `bin/fluent-bit` 放到 PATH,并确保其 **Output 插件支持 ClickHouse**(部分发行版或自编译需启用 `out_clickhouse`)。 - ---- - -## 三、部署配置文件 - -### 3.1 放置配置 - -将本目录下配置文件放到同一目录,例如 `/etc/fluent-bit/` 或 `/opt/edge/fluent-bit/`: - -```bash -sudo mkdir -p 
/etc/fluent-bit -sudo cp fluent-bit.conf clickhouse-upstream.conf /etc/fluent-bit/ -``` - -两文件需在同一目录,因 `fluent-bit.conf` 中有 `@INCLUDE clickhouse-upstream.conf`。 - -### 3.2 修改 ClickHouse 地址(必做) - -编辑 `clickhouse-upstream.conf`,按实际环境填写 ClickHouse 的 Host/Port: - -- **单机**:保留一个 `[NODE]`,改 `Host`、`Port`(默认 8443)。 -- **集群**:复制多段 `[NODE]`,每段一个节点,例如: - -```ini -[UPSTREAM] - Name ch_backends - -[NODE] - Name node-01 - Host 192.168.1.10 - Port 8443 - -[NODE] - Name node-02 - Host 192.168.1.11 - Port 8443 -``` - -### 3.3 ClickHouse 账号密码(有密码时必做) - -不在 `clickhouse-upstream.conf` 里配置密码,而是通过 **环境变量** 传给 Fluent Bit: - -- `CH_USER`:ClickHouse 用户名(如 `default`)。 -- `CH_PASSWORD`:对应用户的密码。 - -在 systemd 或启动脚本中设置(见下文「四、以 systemd 方式运行」)。 - -### 3.4 日志路径与 parsers.conf - -- **日志路径**:`fluent-bit.conf` 里已同时配置 HTTP 与 DNS 两类路径: - - HTTP:`/var/log/edge/edge-node/*.log` - - DNS:`/var/log/edge/edge-dns/*.log` - 若你配置了公用访问日志策略的文件 `path`,或改了 `EDGE_LOG_DIR` / `EDGE_DNS_LOG_DIR`,请同步修改对应 `Path`。 -- **Parsers_File**:主配置引用了 `parsers.conf`。若安装包自带(如 `/etc/fluent-bit/parsers.conf`),无需改动;若启动报错找不到文件,可: - - 从 Fluent Bit 官方仓库复制 [conf/parsers.conf](https://github.com/fluent/fluent-bit/blob/master/conf/parsers.conf) 到同一目录,或 - - 在同一目录新建空文件 `parsers.conf`(仅当不使用任何 parser 时)。 - -### 3.5 数据与状态目录 - -Fluent Bit 会使用配置里的 `storage.path` 和 DB 路径,需保证进程有写权限: - -```bash -sudo mkdir -p /var/lib/fluent-bit/storage -sudo chown -R <运行 fluent-bit 的用户>:<同组> /var/lib/fluent-bit -``` - ---- - -## 四、以 systemd 方式运行 - -### 4.1 使用自带服务(若安装包已提供) - -若通过 apt/yum 安装,通常已有 `fluent-bit.service`。先改配置路径和环境变量: - -```bash -# 编辑服务文件(路径以实际为准,如 /lib/systemd/system/fluent-bit.service) -sudo systemctl edit fluent-bit --full -``` - -在 `[Service]` 中增加或修改: - -- `EnvironmentFile` 指向你的环境变量文件,或直接写: - - `Environment="CH_USER=default"` - - `Environment="CH_PASSWORD=你的密码"` -- `ExecStart` 中的配置文件路径改为你的 `fluent-bit.conf`,例如: - - `ExecStart=/opt/fluent-bit/bin/fluent-bit -c /etc/fluent-bit/fluent-bit.conf` - -然后: - -```bash -sudo systemctl daemon-reload -sudo systemctl 
enable fluent-bit -sudo systemctl start fluent-bit -sudo systemctl status fluent-bit -``` - -### 4.2 自定义 systemd 单元(无自带服务时) - -新建 `/etc/systemd/system/fluent-bit-edge.service`: - -```ini -[Unit] -Description=Fluent Bit - Edge Node Logs to ClickHouse -After=network.target - -[Service] -Type=simple -ExecStart=/usr/bin/fluent-bit -c /etc/fluent-bit/fluent-bit.conf -Restart=always -RestartSec=5 -# ClickHouse 认证(按需修改) -Environment="CH_USER=default" -Environment="CH_PASSWORD=your_clickhouse_password" - -[Install] -WantedBy=multi-user.target -``` - -若密码含特殊字符,建议用 `EnvironmentFile=/etc/fluent-bit/fluent-bit.env`,并在该文件中写: - -```bash -CH_USER=default -CH_PASSWORD=your_clickhouse_password -``` - -然后: - -```bash -sudo systemctl daemon-reload -sudo systemctl enable fluent-bit-edge -sudo systemctl start fluent-bit-edge -sudo systemctl status fluent-bit-edge -``` - -### 4.3 前台调试 - -不依赖 systemd 时可直接前台跑(便于看日志): - -```bash -export CH_USER=default -export CH_PASSWORD=your_clickhouse_password -fluent-bit -c /etc/fluent-bit/fluent-bit.conf -``` - ---- - -## 五、ClickHouse 建表 - -平台(EdgeAPI)会查询两张表: -- HTTP:`logs_ingest` -- DNS:`dns_logs_ingest` - -需在 ClickHouse 中先建表。库名默认为 `default`,若使用其它库,需与 EdgeAPI 的 `CLICKHOUSE_DATABASE` 一致。 - -在 ClickHouse 中执行(按需改库名或引擎): - -```sql -CREATE TABLE IF NOT EXISTS default.logs_ingest -( - timestamp DateTime, - node_id UInt64, - cluster_id UInt64, - server_id UInt64, - host String, - ip String, - method String, - path String, - status UInt16, - bytes_in UInt64, - bytes_out UInt64, - cost_ms UInt32, - ua String, - referer String, - log_type String, - trace_id String, - firewall_policy_id UInt64 DEFAULT 0, - firewall_rule_group_id UInt64 DEFAULT 0, - firewall_rule_set_id UInt64 DEFAULT 0, - firewall_rule_id UInt64 DEFAULT 0, - request_headers String DEFAULT '', - request_body String DEFAULT '', - response_headers String DEFAULT '', - response_body String DEFAULT '' -) -ENGINE = MergeTree() -ORDER BY (timestamp, node_id, server_id, trace_id) -SETTINGS 
index_granularity = 8192; -``` - -DNS 日志建表: - -```sql -CREATE TABLE IF NOT EXISTS default.dns_logs_ingest -( - timestamp DateTime, - request_id String, - node_id UInt64, - cluster_id UInt64, - domain_id UInt64, - record_id UInt64, - remote_addr String, - question_name String, - question_type String, - record_name String, - record_type String, - record_value String, - networking String, - is_recursive UInt8, - error String, - ns_route_codes Array(String), - content_json String DEFAULT '' -) -ENGINE = MergeTree() -ORDER BY (timestamp, request_id, node_id) -SETTINGS index_granularity = 8192; -``` - -- **log_type**:`access` / `waf` / `error`;攻击日志同时看 **firewall_rule_id** 或 **firewall_policy_id** 是否大于 0(与原有 MySQL 通过规则 ID 判断攻击日志一致)。 -- **request_headers / response_headers**:JSON 字符串;**request_body / response_body**:请求/响应体(单条建议限制长度,如 512KB)。 -- **request_body 为空**:需在管理端为该站点/服务的「访问日志」策略中勾选「请求Body」后才会落盘;默认未勾选。路径大致为:站点/服务 → 访问日志 → 策略 → 记录字段 → 勾选「请求Body」。WAF 拦截且策略开启「记录请求Body」时也会记录。 -- **response_body 为空**:当前版本未实现(proto 与节点均未支持响应体落盘),表中已预留字段,后续可扩展。 -- **原有 MySQL 日志同步到 ClickHouse**:见 [mysql-to-clickhouse-migration.md](mysql-to-clickhouse-migration.md)。 - -若表已存在且缺少新字段,可执行: - -```sql -ALTER TABLE default.logs_ingest ADD COLUMN IF NOT EXISTS firewall_policy_id UInt64 DEFAULT 0; -ALTER TABLE default.logs_ingest ADD COLUMN IF NOT EXISTS firewall_rule_group_id UInt64 DEFAULT 0; -ALTER TABLE default.logs_ingest ADD COLUMN IF NOT EXISTS firewall_rule_set_id UInt64 DEFAULT 0; -ALTER TABLE default.logs_ingest ADD COLUMN IF NOT EXISTS firewall_rule_id UInt64 DEFAULT 0; -ALTER TABLE default.logs_ingest ADD COLUMN IF NOT EXISTS request_headers String DEFAULT ''; -ALTER TABLE default.logs_ingest ADD COLUMN IF NOT EXISTS request_body String DEFAULT ''; -ALTER TABLE default.logs_ingest ADD COLUMN IF NOT EXISTS response_headers String DEFAULT ''; -ALTER TABLE default.logs_ingest ADD COLUMN IF NOT EXISTS response_body String DEFAULT ''; -ALTER TABLE default.dns_logs_ingest ADD COLUMN IF NOT 
EXISTS content_json String DEFAULT ''; -``` - -Fluent Bit 写入时使用 `json_date_key timestamp` 和 `json_date_format epoch`,会将 JSON 中的 `timestamp`(Unix 秒)转为 DateTime。 - ---- - -## 六、验证与排错 - -1. **看 Fluent Bit 日志** - - systemd:`journalctl -u fluent-bit-edge -f`(或你的服务名) - - 前台:直接看终端输出。 - -2. **看 ClickHouse 是否有数据** - ```sql - SELECT count() FROM default.logs_ingest; - SELECT * FROM default.logs_ingest LIMIT 5; - SELECT count() FROM default.dns_logs_ingest; - SELECT * FROM default.dns_logs_ingest LIMIT 5; - ``` - -3. **常见问题** - - **连接被拒**:检查 `clickhouse-upstream.conf` 的 Host/Port、防火墙、ClickHouse 的 `listen_host`。 - - **认证失败**:检查 `CH_USER`、`CH_PASSWORD` 是否与 ClickHouse 用户一致,环境变量是否被 systemd 正确加载。 - - **找不到 parsers.conf**:见上文 3.4。 - - **没有新数据**:确认 EdgeNode/EdgeDNS 已写日志到 `Path` 下,且 Fluent Bit 对目录有读权限;可分别执行 `tail -f /var/log/edge/edge-node/access.log` 与 `tail -f /var/log/edge/edge-dns/access.log`。 - - **Node 上没有 `/var/log/edge/edge-node/access.log`**:见下文「八、Node 上找不到日志文件」。 - ---- - -## 七、与其它组件的关系(简要) - -| 组件 | 说明 | -|------|------| -| **EdgeNode** | 日志落盘路径优先复用公用访问日志策略文件 `path`(取目录);若为空回退 `EDGE_LOG_DIR`,再回退默认 `/var/log/edge/edge-node`;生成 `access.log`、`waf.log`、`error.log`;内建 lumberjack 轮转(默认 256MB/14份/7天,可按策略调整),仍支持 SIGHUP 重建 writer。 | -| **EdgeDNS** | DNS 访问日志落盘路径优先复用公用访问日志策略文件 `path`(取目录);若为空回退 `EDGE_DNS_LOG_DIR`,再回退默认 `/var/log/edge/edge-dns`;生成 `access.log`(JSON Lines),由 Fluent Bit 采集写入 `dns_logs_ingest`。 | -| **logrotate** | 可选的历史兼容方案(已非必需);默认建议使用节点内建 lumberjack 轮转。 | -| **平台(EdgeAPI)** | 配置 ClickHouse 只读连接(`CLICKHOUSE_HOST`、`CLICKHOUSE_PORT`、`CLICKHOUSE_USER`、`CLICKHOUSE_PASSWORD`、`CLICKHOUSE_DATABASE`);当请求带 `Day` 且已配置 ClickHouse 时,访问日志列表查询走 ClickHouse。 | - ---- - -## 八、Node 上找不到日志文件 - -若在 EdgeNode 机器上执行 `tail -f /var/log/edge/edge-node/access.log` 报 **No such file or directory**,按下面检查: - -1. **EdgeNode 版本** - 本地日志落盘是较新功能,需使用**包含该功能的 EdgeNode 构建**(当前仓库版本在首次加载配置时会预创建目录和三个空日志文件)。 - -2. 
**预创建目录(可选)** - 若进程以非 root 运行,可先手动建目录并赋权,避免无权限创建 `/var/log/edge`: - ```bash - sudo mkdir -p /var/log/edge/edge-node - sudo chown <运行 edge-node 的用户>:<同组> /var/log/edge/edge-node - ``` - -3. **重启 EdgeNode** - 新版本在**首次成功加载节点配置后**会调用 `EnsureInit()`,自动创建 `/var/log/edge/edge-node` 及 `access.log`、`waf.log`、`error.log`。重启一次 edge-node 后再看目录下是否已有文件。 - -4. **自定义路径** - 若在管理端设置了公用访问日志策略的文件 `path`,节点会优先使用该目录;否则才使用 `EDGE_LOG_DIR`。Fluent Bit 的 `Path` 需与实际目录一致。 - -以上完成即完成 Fluent Bit 的部署与验证。 - ---- - -## 九、HTTPS 模式(ClickHouse) - -当 ClickHouse 只开放 HTTPS(如 8443)或链路必须加密时,使用本目录新增模板: - -- `fluent-bit-https.conf`:Node+DNS 同机采集(HTTP+DNS 双输入) -- `fluent-bit-dns-https.conf`:仅 DNS 节点采集 -- `fluent-bit-windows-https.conf`:Windows 节点 HTTPS 采集 - -### 9.1 什么时候用 HTTPS 模板 - -- ClickHouse 仅开放 HTTPS 端口; -- 节点到 ClickHouse 跨公网或需要传输加密; -- 你希望启用证书校验和 SNI。 - -### 9.2 最小切换步骤(Linux) - -1. 备份当前配置: -```bash -sudo cp /etc/fluent-bit/fluent-bit.conf /etc/fluent-bit/fluent-bit.conf.bak -``` - -2. 切换为 HTTPS 模板(Node+DNS 同机示例): -```bash -sudo cp /path/to/fluent-bit-https.conf /etc/fluent-bit/fluent-bit.conf -``` - -3. 设置账号密码(按你的服务文件方式设置): -```bash -export CH_USER=default -export CH_PASSWORD='your_password' -``` - -4. 修改模板中的关键项: -- `Host` / `Port`(HTTPS 常见端口 `8443`) -- `tls.verify`:`On`/`Off` -- `tls.ca_file`:自签名证书建议配置 CA 文件 -- `tls.vhost`:证书 CN/SAN 对应主机名(SNI) - -5. 
重启并检查: -```bash -sudo systemctl restart fluent-bit -sudo systemctl status fluent-bit -journalctl -u fluent-bit -f -``` - -### 9.3 验证点 - -- `default.logs_ingest` 有新增数据(HTTP) -- `default.dns_logs_ingest` 有新增数据(DNS) -- Fluent Bit 日志中无 TLS 握手失败(`certificate`, `x509`, `tls`) - -### 9.4 回滚 - -TLS 配置错误导致中断时,快速回滚: - -```bash -sudo cp /etc/fluent-bit/fluent-bit.conf.bak /etc/fluent-bit/fluent-bit.conf -sudo systemctl restart fluent-bit -``` - -回滚后恢复原 HTTP 模式,不影响平台 API/管理端配置。 - ---- - -## 十、平台托管模式(推荐) - -从 `v1.4.7` 开始,Node/DNS 在线安装流程会由平台托管 Fluent Bit,默认不再要求逐台手改 `/etc/fluent-bit/fluent-bit.conf`。 - -### 10.1 托管行为 - -- 安装器优先使用发布包内置离线包(不走 `curl | sh`)。 -- 首次安装后写入: - - `/etc/fluent-bit/fluent-bit.conf` - - `/etc/fluent-bit/parsers.conf` - - `/etc/fluent-bit/.edge-managed.env` - - `/etc/fluent-bit/.edge-managed.json` -- 配置发生变化时按 `hash` 幂等更新,仅在内容变化时重启服务。 -- Node 与 DNS 同机安装时会自动合并角色,输出单份配置。 - -### 10.2 托管元数据 - -平台会维护 `/etc/fluent-bit/.edge-managed.json`,核心字段: - -- `roles`: 当前机器启用角色(`node`/`dns`) -- `hash`: 当前托管配置摘要 -- `sourceVersion`: 平台版本号 -- `updatedAt`: 最近更新时间戳 - -### 10.3 支持矩阵(离线包) - -当前固定支持以下平台键: - -- `ubuntu22.04-amd64` -- `ubuntu22.04-arm64` -- `amzn2023-amd64` -- `amzn2023-arm64` - -构建阶段会校验矩阵包是否齐全,缺失会直接失败并打印期望文件路径。 - -### 10.4 手工配置兼容 - -- 若现有 `fluent-bit.conf` 不是平台托管文件(不含 `managed-by-edgeapi` 标记),安装器不会强制覆盖,会返回明确错误提示。 -- 需要切到托管模式时,先备份旧配置,再由平台触发一次安装/更新任务。 - -### 10.5 Resource Profile Notes (New) - -- Managed default is now tuned for `2C4G` nodes (conservative and stable). -- Additional sample profiles are provided for larger nodes: - - `deploy/fluent-bit/fluent-bit-sample-4c8g.conf` - - `deploy/fluent-bit/fluent-bit-sample-8c16g.conf` -- These sample files are for benchmark/reference only and are not auto-applied by installer. -- To use higher profiles in managed mode, sync those parameters into `EdgeAPI/internal/installers/fluent_bit.go` and then trigger node reinstall/upgrade. 
diff --git a/deploy/fluent-bit/fluent-bit-sample-4c8g.conf b/deploy/fluent-bit/fluent-bit-sample-4c8g.conf deleted file mode 100644 index 56b2f6a..0000000 --- a/deploy/fluent-bit/fluent-bit-sample-4c8g.conf +++ /dev/null @@ -1,69 +0,0 @@ -# Sample profile for 4C8G nodes (Node + DNS on same host). -# Replace Host/Port/URI and credentials according to your ClickHouse deployment. - -[SERVICE] - Flush 1 - Log_Level info - Parsers_File parsers.conf - storage.path /var/lib/fluent-bit/storage - storage.sync normal - storage.checksum off - storage.backlog.mem_limit 512MB - -[INPUT] - Name tail - Path /var/log/edge/edge-node/*.log - Tag app.http.logs - Parser json - Refresh_Interval 2 - Read_from_Head false - DB /var/lib/fluent-bit/http-logs.db - storage.type filesystem - Mem_Buf_Limit 256MB - Skip_Long_Lines On - -[INPUT] - Name tail - Path /var/log/edge/edge-dns/*.log - Tag app.dns.logs - Parser json - Refresh_Interval 2 - Read_from_Head false - DB /var/lib/fluent-bit/dns-logs.db - storage.type filesystem - Mem_Buf_Limit 256MB - Skip_Long_Lines On - -[OUTPUT] - Name http - Match app.http.logs - Host 127.0.0.1 - Port 8443 - URI /?query=INSERT%20INTO%20default.logs_ingest%20FORMAT%20JSONEachRow - Format json_lines - http_user ${CH_USER} - http_passwd ${CH_PASSWORD} - json_date_key timestamp - json_date_format epoch - workers 2 - net.keepalive On - Retry_Limit False - tls On - tls.verify On - -[OUTPUT] - Name http - Match app.dns.logs - Host 127.0.0.1 - Port 8443 - URI /?query=INSERT%20INTO%20default.dns_logs_ingest%20FORMAT%20JSONEachRow - Format json_lines - http_user ${CH_USER} - http_passwd ${CH_PASSWORD} - json_date_key timestamp - json_date_format epoch - workers 2 - net.keepalive On - Retry_Limit False - tls On - tls.verify On diff --git a/deploy/fluent-bit/fluent-bit-sample-8c16g.conf b/deploy/fluent-bit/fluent-bit-sample-8c16g.conf deleted file mode 100644 index 36da86f..0000000 --- a/deploy/fluent-bit/fluent-bit-sample-8c16g.conf +++ /dev/null @@ -1,69 +0,0 @@ -# 
Sample profile for 8C16G nodes (Node + DNS on same host). -# Replace Host/Port/URI and credentials according to your ClickHouse deployment. - -[SERVICE] - Flush 1 - Log_Level info - Parsers_File parsers.conf - storage.path /var/lib/fluent-bit/storage - storage.sync normal - storage.checksum off - storage.backlog.mem_limit 1024MB - -[INPUT] - Name tail - Path /var/log/edge/edge-node/*.log - Tag app.http.logs - Parser json - Refresh_Interval 1 - Read_from_Head false - DB /var/lib/fluent-bit/http-logs.db - storage.type filesystem - Mem_Buf_Limit 512MB - Skip_Long_Lines On - -[INPUT] - Name tail - Path /var/log/edge/edge-dns/*.log - Tag app.dns.logs - Parser json - Refresh_Interval 1 - Read_from_Head false - DB /var/lib/fluent-bit/dns-logs.db - storage.type filesystem - Mem_Buf_Limit 512MB - Skip_Long_Lines On - -[OUTPUT] - Name http - Match app.http.logs - Host 127.0.0.1 - Port 8443 - URI /?query=INSERT%20INTO%20default.logs_ingest%20FORMAT%20JSONEachRow - Format json_lines - http_user ${CH_USER} - http_passwd ${CH_PASSWORD} - json_date_key timestamp - json_date_format epoch - workers 4 - net.keepalive On - Retry_Limit False - tls On - tls.verify On - -[OUTPUT] - Name http - Match app.dns.logs - Host 127.0.0.1 - Port 8443 - URI /?query=INSERT%20INTO%20default.dns_logs_ingest%20FORMAT%20JSONEachRow - Format json_lines - http_user ${CH_USER} - http_passwd ${CH_PASSWORD} - json_date_key timestamp - json_date_format epoch - workers 4 - net.keepalive On - Retry_Limit False - tls On - tls.verify On diff --git a/deploy/fluent-bit/fluent-bit-windows-https.conf b/deploy/fluent-bit/fluent-bit-windows-https.conf deleted file mode 100644 index b3a717a..0000000 --- a/deploy/fluent-bit/fluent-bit-windows-https.conf +++ /dev/null @@ -1,62 +0,0 @@ -[SERVICE] - Flush 1 - Log_Level info - Parsers_File parsers.conf - storage.path ./storage - storage.sync normal - -[INPUT] - Name tail - Path E:\var\log\edge\edge-node\*.log - Tag app.http.logs - Parser json - Refresh_Interval 1 - 
Read_from_Head true - DB ./http-logs.db - Mem_Buf_Limit 128MB - Skip_Long_Lines On - -[INPUT] - Name tail - Path E:\var\log\edge\edge-dns\*.log - Tag app.dns.logs - Parser json - Refresh_Interval 1 - Read_from_Head true - DB ./dns-logs.db - Mem_Buf_Limit 128MB - Skip_Long_Lines On - -[OUTPUT] - Name http - Match app.http.logs - Host 127.0.0.1 - Port 8443 - URI /?query=INSERT+INTO+logs_ingest+FORMAT+JSONEachRow - Format json_lines - http_user ${CH_USER} - http_passwd ${CH_PASSWORD} - tls On - tls.verify On - # tls.ca_file C:\\path\\to\\ca.pem - # tls.vhost clickhouse.example.com - Json_Date_Key timestamp - Json_Date_Format epoch - Retry_Limit 10 - -[OUTPUT] - Name http - Match app.dns.logs - Host 127.0.0.1 - Port 8443 - URI /?query=INSERT+INTO+dns_logs_ingest+FORMAT+JSONEachRow - Format json_lines - http_user ${CH_USER} - http_passwd ${CH_PASSWORD} - tls On - tls.verify On - # tls.ca_file C:\\path\\to\\ca.pem - # tls.vhost clickhouse.example.com - Json_Date_Key timestamp - Json_Date_Format epoch - Retry_Limit 10 diff --git a/deploy/fluent-bit/logrotate.conf b/deploy/fluent-bit/logrotate.conf deleted file mode 100644 index 8daedf9..0000000 --- a/deploy/fluent-bit/logrotate.conf +++ /dev/null @@ -1,20 +0,0 @@ -# logrotate 示例:边缘节点日志轮转 -# 安装:放入 /etc/logrotate.d/edge-node 或 include 到主配置 - -/var/log/edge/edge-node/*.log { - daily - rotate 14 - compress - missingok - notifempty - copytruncate -} - -/var/log/edge/edge-dns/*.log { - daily - rotate 14 - compress - missingok - notifempty - copytruncate -} diff --git a/deploy/fluent-bit/logs.db b/deploy/fluent-bit/logs.db deleted file mode 100644 index e06183c..0000000 Binary files a/deploy/fluent-bit/logs.db and /dev/null differ diff --git a/deploy/fluent-bit/logs.db-shm b/deploy/fluent-bit/logs.db-shm deleted file mode 100644 index 0850e15..0000000 Binary files a/deploy/fluent-bit/logs.db-shm and /dev/null differ diff --git a/deploy/fluent-bit/logs.db-wal b/deploy/fluent-bit/logs.db-wal deleted file mode 100644 index 
102e271..0000000 Binary files a/deploy/fluent-bit/logs.db-wal and /dev/null differ diff --git a/go1.21.6.linux-amd64.tar.gz b/go1.21.6.linux-amd64.tar.gz deleted file mode 100644 index e6567ae..0000000 Binary files a/go1.21.6.linux-amd64.tar.gz and /dev/null differ diff --git a/go1.25.7.linux-amd64.tar.gz b/go1.25.7.linux-amd64.tar.gz deleted file mode 100644 index 4a4ae60..0000000 Binary files a/go1.25.7.linux-amd64.tar.gz and /dev/null differ diff --git a/日志策略逻辑梳理与问题清单.md b/日志策略逻辑梳理与问题清单.md deleted file mode 100644 index 3f8c56e..0000000 --- a/日志策略逻辑梳理与问题清单.md +++ /dev/null @@ -1,148 +0,0 @@ -# 日志策略逻辑梳理与问题清单(当前基线:`E:\AI_PRODUCT\waf-platform`) - -## 1. 结论摘要 - -- 当前链路是 **`type` + `writeTargets` 双字段共同决定行为**。 -- 运行时真正用于读写判断的是 `writeTargets`(`ParseWriteTargetsFromPolicy` 解析结果)。 -- HTTP 与 DNS 都已接入“公用策略”下发,DNS 也已支持 ClickHouse 读取。 -- 目前存在多处逻辑不一致,核心风险是:**页面显示、数据库值、实际读写行为可能不同步**。 - -## 2. 关键入口文件 - -- 类型与组合映射:`EdgeCommon/pkg/serverconfigs/access_log_storages.go` -- 写入目标定义/解析:`EdgeCommon/pkg/serverconfigs/access_log_write_targets.go` -- 策略创建/更新(Admin):`EdgeAdmin/internal/web/actions/default/servers/accesslogs/createPopup.go`、`EdgeAdmin/internal/web/actions/default/servers/accesslogs/update.go` -- 策略保存(API):`EdgeAPI/internal/rpc/services/service_http_access_log_policy_plus.go` -- 策略落库(DAO):`EdgeAPI/internal/db/models/http_access_log_policy_dao.go` -- 公用策略运行时缓存:`EdgeAPI/internal/accesslogs/storage_manager.go` -- HTTP 节点队列:`EdgeNode/internal/nodes/http_access_log_queue.go` -- DNS 节点队列:`EdgeDNS/internal/nodes/ns_access_log_queue.go` -- 节点配置下发:`EdgeAPI/internal/db/models/node_dao.go`、`EdgeAPI/internal/db/models/ns_node_dao_plus.go` -- HTTP 查询服务:`EdgeAPI/internal/rpc/services/service_http_access_log.go` -- DNS 查询服务:`EdgeAPI/internal/rpc/services/nameservers/service_ns_access_log.go` -- CH 查询实现:`EdgeAPI/internal/clickhouse/logs_ingest_store.go`、`EdgeAPI/internal/clickhouse/ns_logs_ingest_store.go` - -## 3. 
数据模型与语义 - -`edgeHTTPAccessLogPolicies` 关键字段: - -- `type`:`file` / `file_mysql` / `file_clickhouse` / `file_mysql_clickhouse` / `es` / `tcp` / `syslog` / `command` -- `writeTargets`:JSON(`file/mysql/clickhouse` 三个布尔值) -- `disableDefaultDB`:停用默认数据库存储(兼容旧语义) - -当前实际规则: - -1. Admin 侧根据下拉 `type` 生成 `writeTargetsJSON`。 -2. API 原样落库(仅做少量历史 type 别名兼容)。 -3. 运行时使用 `ParseWriteTargetsFromPolicy(writeTargets, type, disableDefaultDB)` 得到最终写入目标。 - -## 4. 端到端链路(当前行为) - -### 4.1 策略创建/更新 - -- 创建与更新都会调用 `ParseStorageTypeAndWriteTargets`,并同时提交 `type` 与 `writeTargetsJSON`。 -- `file_clickhouse` / `file_mysql_clickhouse` 在 UI 上隐藏了手填路径输入,依赖旧值或默认目录回退。 -- DAO 更新时,只有 `writeTargetsJSON` 非空才会覆盖 `writeTargets` 字段。 - -### 4.2 HTTP 写入链路 - -- Node 侧: - - `needWriteFile = writeTargets == nil || writeTargets.NeedWriteFile()` - - `needReportAPI = writeTargets == nil || writeTargets.NeedReportToAPI()` -- API 侧: - - `CreateHTTPAccessLogs` 里是否写 MySQL 由 `canWriteAccessLogsToDB() -> WriteMySQL()` 决定。 - - 同时调用 `writeAccessLogsToPolicy()`,把日志再交给公用策略存储引擎处理(如 file/es/tcp/syslog/command)。 -- 查询侧: - - `shouldReadAccessLogsFromClickHouse()` 为真且 CH 配置可用时优先读 CH。 - - CH 失败后,按 `shouldReadAccessLogsFromMySQL()` 回退 MySQL。 - -### 4.3 DNS 写入链路 - -- DNS 节点: - - `needWriteFile = targets == nil || targets.File || targets.ClickHouse` - - `needReportAPI = targets == nil || targets.MySQL` - - 即 CH-only 下 DNS 只写本地文件,不上报 API。 -- DNS API 查询: - - 与 HTTP 一样优先 CH,再按策略回退 MySQL。 - -### 4.4 节点路径更新机制 - -- API 下发公用策略的 `AccessLogFilePath` 与 `AccessLogWriteTargets` 到 HTTP/DNS 节点配置。 -- Node/DNS 收到新配置后会 `SetDirByPolicyPath(...)` 并 `EnsureInit/Reopen/Close`,可自动切换目录。 -- 空路径时会回退到: - - HTTP:`EDGE_LOG_DIR` 或默认 `/var/log/edge/edge-node` - - DNS:`EDGE_DNS_LOG_DIR` 或默认 `/var/log/edge/edge-dns` - -## 5. 
行为矩阵(按当前代码) - -- `file` - - 写文件:是 - - 写 MySQL:否(仅当 `writeTargets.mysql=true` 才会写) - - 读:优先 CH(若开启),否则按 MySQL 开关 -- `file_mysql` - - 写文件:是 - - 写 MySQL:是 - - 读:MySQL 可读;若 CH 也开则优先 CH -- `file_clickhouse` - - 写文件:是 - - 写 MySQL:否(理论上) - - 读:优先 CH;若 CH 不可用且 mysql=false,则返回空 -- `file_mysql_clickhouse` - - 写文件:是 - - 写 MySQL:是 - - 读:优先 CH,失败回退 MySQL -- `es/tcp/syslog/command` - - 仍会由 `writeTargets` 决定是否 MySQL(当前解析默认给 MySQL=true) - - 另外会通过策略引擎输出到对应目标 - -## 6. 逻辑问题清单(按优先级) - -### P0:`type` 与 `writeTargets` 双真源,容易漂移 - -- 页面展示与回显会参考 `type`,实际写读判断优先看 `writeTargets`。 -- 一旦两者不一致,会出现“UI 看起来是 ClickHouse,实际还在写/读 MySQL”。 - -### P0:`disableDefaultDB` 在新链路中容易失效 - -- `WriteMySQL()` 优先看 `writeTargets.MySQL`,只有 `writeTargets` 为空才回退 `disableDefaultDB`。 -- 由于 Admin 基本总会提交 `writeTargetsJSON`,`disableDefaultDB` 常常不会真正生效。 - -### P1:HTTP 与 DNS 在 CH-only 场景上报 API 语义不一致 - -- HTTP:`NeedReportToAPI()` = `MySQL || ClickHouse`,CH-only 仍会上报 API。 -- DNS:CH-only 不上报 API,仅写文件给 Fluent Bit。 -- 高并发下会带来不必要的 API 压力与行为差异。 - -### P1:`file_clickhouse` 可能出现空路径,策略引擎会启动失败 - -- `FileStorage.Start()` 要求 `path` 非空。 -- 但 UI 在 clickhouse 组合类型隐藏路径输入,若 `options.path` 为空,策略引擎会报错(虽然节点本地写文件仍可回退目录工作)。 - -### P1:HTTP 可能出现“节点写文件 + API 再写文件”的重复路径 - -- `CreateHTTPAccessLogs` 无论是否写 MySQL,都会 `writeAccessLogsToPolicy()`。 -- 公用策略若为 file*,API 侧 `StorageManager.createStorage()` 会创建 `FileStorage` 并再次落文件。 -- 若目标是“仅节点写文件供 Fluent Bit 采集”,这会引入额外重复写入。 - -### P2:DNS `requestId` 生成算法有重复风险 - -- `ns_access_log_queue.go` 里 `timestamp/requestId` 为 `loop()` 局部变量,每轮 tick 重置。 -- 同秒跨批次可能冲突,影响游标分页与去重。 - -### P2:UI 文案分支存在不可达条件 - -- `createPopup.html` / `update.html` 在 `file|file_mysql` 区块内嵌了 clickhouse 条件文案分支,实际不会触发。 -- 不影响功能,但会增加理解成本。 - -## 7. 建议修复顺序 - -1. 先统一单一真源(建议 API 层统一按 `type` 规范化并覆盖 `writeTargets`)。 -2. 明确 `disableDefaultDB` 与 `writeTargets` 的优先级,避免“配置项在 UI 可选但不生效”。 -3. 统一 HTTP/DNS 在 CH-only 的上报语义(建议都走“节点文件 + Fluent Bit”,API 不再接收该流量)。 -4. 修复 file_clickhouse 空路径策略启动失败(要求路径 or 统一默认路径回填到 options)。 -5. 修复 DNS requestId 生成(全局原子递增或更高精度时间戳方案)。 - -## 8. 
当前可用性判断 - -- 系统“可运行”,但配置行为存在歧义,且在高并发下会放大成本和排障难度。 -- 若目标是稳定的高吞吐日志链路,建议优先处理 P0/P1 问题后再继续线上放量。 diff --git a/编译部署升级策略.md b/编译部署升级策略.md deleted file mode 100644 index e3e0eaf..0000000 --- a/编译部署升级策略.md +++ /dev/null @@ -1,232 +0,0 @@ -# waf-platform 编译、部署、升级策略(WSL Ubuntu 22.04) - -## 1. 适用范围 - -- 主基线:`E:\AI_PRODUCT\waf-platform`(不是 `waf-platform-1.4.5/1.4.6`)。 -- 本手册覆盖: - - `EdgeAdmin` / `EdgeAPI` / `EdgeNode` / `EdgeDNS` - - HTTP + DNS 访问日志策略 - - Fluent Bit + ClickHouse 日志链路 - ---- - -## 2. 关键结论(先看) - -1. 用 `EdgeAdmin/build/build.sh` 编译时,会联动编译 `EdgeAPI`,并由 `EdgeAPI` 联动编译 `EdgeNode`。 -2. `EdgeDNS` 只有在 `plus` 模式下才会被 `EdgeAPI/build/build.sh` 自动编译并放入 deploy。 -3. 当前脚本已临时关闭自动 `arm64` 编译,只保留 `amd64` 自动链路。 -3. 如果你要发布“本次所有改动”(含 DNS/ClickHouse),建议统一用: - ```bash - cd /mnt/e/AI_PRODUCT/waf-platform/EdgeAdmin/build - bash build.sh linux amd64 plus - ``` -4. DNS 节点与 Node 节点分离部署时,两边都要有 Fluent Bit(各自采集本机日志)。 - ---- - -## 3. 编译前检查 - -在 WSL Ubuntu 22.04 执行: - -```bash -cd /mnt/e/AI_PRODUCT/waf-platform -git rev-parse --short HEAD -go version -which zip unzip go find sed -``` - -建议: - -- 线上 Ubuntu 22.04,尽量也在 Ubuntu 22.04 编译,避免 `GLIBC`/`GLIBCXX` 不兼容。 -- 若 Node plus 使用 cgo/libpcap/libbrotli,请确保构建机依赖完整。 - ---- - -## 4. 一键编译(推荐) - -```bash -cd /mnt/e/AI_PRODUCT/waf-platform/EdgeAdmin/build -bash build.sh linux amd64 plus -``` - -### 4.1 此命令会做什么 - -- 编译 `EdgeAdmin` -- 自动调用 `EdgeAPI/build/build.sh` -- `EdgeAPI` 自动编译并打包 `EdgeNode`(当前仅 linux/amd64) -- `plus` 模式下,`EdgeAPI` 自动编译并打包 `EdgeDNS`(当前仅 linux/amd64) -- 把 node/dns 包放入 API 的 `deploy` 目录用于远程安装 - -### 4.2 主要产物位置 - -- Admin 包:`EdgeAdmin/dist/edge-admin-linux-amd64-v*.zip` -- API 包:`EdgeAPI/dist/edge-api-linux-amd64-v*.zip` -- Node 包:`EdgeNode/dist/edge-node-linux-*.zip` -- DNS 包:`EdgeDNS/dist/edge-dns-linux-*.zip`(plus 时) -- API deploy 安装包目录:`EdgeAPI/build/deploy/` - ---- - -## 5. 是否需要单独编译 API / DNS / Node - -### 5.1 不需要单独编译 API 的场景 - -- 你已经执行 `EdgeAdmin/build/build.sh ... 
plus`,且要发布整套改动。 - -### 5.2 需要单独编译的场景 - -- 只改了 API,不想重新打 Admin: - ```bash - cd /mnt/e/AI_PRODUCT/waf-platform/EdgeAPI/build - bash build.sh linux amd64 plus - ``` -- 只改了 Node: - ```bash - cd /mnt/e/AI_PRODUCT/waf-platform/EdgeNode/build - bash build.sh linux amd64 plus - ``` -- 只改了 DNS: - ```bash - cd /mnt/e/AI_PRODUCT/waf-platform/EdgeDNS/build - bash build.sh linux amd64 - ``` - ---- - -## 6. 升级顺序(生产建议) - -## 6.1 第一步:先改 ClickHouse(DDL) - -先在 ClickHouse 建/改表,至少包含: - -- `logs_ingest`(HTTP) -- `dns_logs_ingest`(DNS) - -先做 DDL 的原因:避免新版本写入时目标表不存在。 - -## 6.2 第二步:部署 Fluent Bit 配置 - -### Node 节点(HTTP) - -- 配置文件目录一般是 `/etc/fluent-bit/` -- 至少更新: - - `fluent-bit.conf`(或你实际启用的 `fluent-bit-http.conf`) - - `clickhouse-upstream.conf` - - `parsers.conf`(通常可复用) - -### DNS 节点(DNS) - -- DNS 节点若之前没装 Fluent Bit,需要先安装并创建 service。 - - `curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh` - - `sudo apt-get update` - - `sudo apt-get install -y fluent-bit` -- 建议同样用 `/etc/fluent-bit/`,放: - - `fluent-bit.conf`(DNS 版本或含 DNS INPUT/OUTPUT 的统一版本) - - `clickhouse-upstream.conf` - - `parsers.conf` - -重启: - -```bash -sudo systemctl restart fluent-bit -sudo systemctl status fluent-bit -``` - -## 6.3 第三步:升级管理面(API + Admin) - -在管理节点更新 `edge-api`、`edge-admin` 包并重启对应服务。 -./bin/edge-api status -./bin/edge-api restart - -## 6.4 第四步:升级数据面(Node / DNS) - -- 通过 API 的远程安装/升级流程分批升级 Node、DNS -- 或手工替换二进制后重启服务 - -## 6.5 第五步:最后切换日志策略 - -在页面启用目标策略(MySQL only / ClickHouse only / 双写),并验证读写链路。 - ---- - -## 7. 日志策略与读写行为(当前实现) - -## 7.1 HTTP / DNS 共用语义 - -- `WriteMySQL=true`:写 MySQL(通过 API) -- `WriteClickHouse=true`:写本地日志文件,由 Fluent Bit 异步采集进 CH -- 两者都开:双写 -- 两者都关:不写 - -## 7.2 查询侧优先级 - -- 优先读 ClickHouse(可用且策略允许) -- ClickHouse 异常时按策略回退 MySQL -- 若两边都不可读,返回空 - -## 7.3 关于“日志文件路径” - -- 现在前端已调整:当存储类型包含 ClickHouse 时,创建/编辑页隐藏“日志文件路径”输入。 -- 但 Fluent Bit 的 `Path` 必须匹配实际日志目录;若你改了日志目录,需要同步改 Fluent Bit 配置并重启。 - ---- - -## 8. 
服务检查与常用命令 - -## 8.1 检查 Fluent Bit 服务名 - -```bash -systemctl list-unit-files | grep -Ei 'fluent|td-agent-bit' -systemctl status fluent-bit.service -``` - -## 8.2 查看 Fluent Bit 实际使用的配置文件 - -```bash -systemctl status fluent-bit.service -``` - -重点看 `ExecStart`,例如: - -```text -/opt/fluent-bit/bin/fluent-bit -c /etc/fluent-bit/fluent-bit.conf -``` - -## 8.3 验证 ClickHouse 是否有数据 - -```sql -SELECT count() FROM default.logs_ingest; -SELECT count() FROM default.dns_logs_ingest; -``` - ---- - -## 9. 回滚策略(最小影响) - -1. 先把页面日志策略切回 MySQL-only。 -2. 回滚 API/Admin 到上一版本。 -3. Node/DNS 分批回滚。 -4. Fluent Bit 保留运行不影响主业务(只停止 CH 写入即可)。 - ---- - -## 10. 一次发布的最简执行清单 - -```bash -# 1) 构建 -cd /mnt/e/AI_PRODUCT/waf-platform/EdgeAdmin/build -bash build.sh linux amd64 plus - -# 2) 上传产物 -# EdgeAdmin/dist/*.zip -# EdgeAPI/dist/*.zip -# EdgeAPI/build/deploy/* (node/dns installer zip) - -# 3) 线上先执行 CH DDL -# 4) 更新 fluent-bit 配置并重启 -sudo systemctl restart fluent-bit - -# 5) 升级 edge-api / edge-admin 并重启 -# 6) 升级 edge-node / edge-dns -# 7) 切日志策略并验证 -``` diff --git a/访问日志策略配置手册.md b/访问日志策略配置手册.md deleted file mode 100644 index cecc4aa..0000000 --- a/访问日志策略配置手册.md +++ /dev/null @@ -1,215 +0,0 @@ -# 访问日志策略配置手册(默认安装 / 仅MySQL / 仅ClickHouse / 双写) - -## 1. 适用范围 -- 代码基线:`e:\AI_PRODUCT\waf-platform` -- 页面入口:`系统设置 -> 访问日志 -> 日志策略` -- 查询入口:`网站 -> 站点 -> 日志`(`/servers/server/log`) - ---- - -## 2. 默认安装后的行为(什么都不配) - -```mermaid -flowchart TD - A[EdgeNode 产生日志] --> B[写本地文件 /var/log/edge/edge-node/*.log] - A --> C[上报 EdgeAPI] - C --> D[写 MySQL 访问日志表] - E[日志查询页] --> D -``` - -- 默认即可写日志,不会因为没配 ClickHouse 就停写。 -- 查询默认走 MySQL。 -- 是否有“独立日志数据库节点”会影响写到哪个 MySQL: - - 有日志库节点:优先写日志库节点池。 - - 没有日志库节点:回退写默认数据库。 - ---- - -## 3. 必须设置项(上线最小集) - -### 3.1 基础必需(任何模式都建议) -1. `EdgeAPI` 数据库连接可用(`db.yaml` / `.db.yaml`)。 -2. `EdgeNode` 与 `EdgeAPI` 通信正常(节点在线,可上报日志)。 -3. 建议创建并启用一个**公用**访问日志策略(避免多环境行为不一致)。 - -### 3.2 仅 ClickHouse / MySQL+ClickHouse 额外必需 -1. 
`EdgeAPI` 配置 ClickHouse 读取: - - `EdgeAPI/configs/api.yaml`: - ```yaml - clickhouse: - host: 127.0.0.1 - port: 8123 - user: default - password: "xxxxxx" - database: default - ``` -2. Fluent Bit 已部署并运行,采集: - - `/var/log/edge/edge-node/*.log` -3. ClickHouse 已建表:`logs_ingest`(见 `deploy/fluent-bit/README.md`)。 - -### 3.3 本地日志轮转(默认开启) -从当前版本开始,EdgeNode / EdgeDNS 使用内建 `lumberjack` 轮转,不再依赖系统 `logrotate`。 - -默认值: -- `maxSizeMB=256` -- `maxBackups=14` -- `maxAgeDays=7` -- `compress=false` -- `localTime=true` - -可在策略 `file.rotate` 中配置,例如: - -```json -{ - "path": "/var/log/web-access-${date}.log", - "autoCreate": true, - "rotate": { - "maxSizeMB": 256, - "maxBackups": 14, - "maxAgeDays": 7, - "compress": false, - "localTime": true - } -} -``` - ---- - -## 4. 三种目标模式怎么配 - -## 4.1 只写入 MySQL - -在“日志策略”中: -1. 新建或修改策略,`存储类型` 选 **文件+MySQL**。 -2. 设为 **公用**,并确保 **启用**。 -3. `日志文件路径` 填一个 API 可写路径(必填校验项): - - 示例:`/var/log/edge/edge-api/http-access-${date}.log` - -结果: -- 写入:MySQL(主路径)+ Node 本地日志文件 -- 查询:MySQL -- 不依赖 ClickHouse - ---- - -## 4.2 只写入 ClickHouse - -在“日志策略”中: -1. `存储类型` 选 **文件+ClickHouse**。 -2. 设为 **公用**,并确保 **启用**。 -3. `日志文件路径` 仍需填写(策略校验要求): - - 示例:`/var/log/edge/edge-api/http-access-${date}.log` -4. 确保 Fluent Bit 正在采集 Node 目录并写入 ClickHouse。 -5. 确保 `EdgeAPI` 的 ClickHouse 连接已配置。 - -结果: -- 写入:Node 本地文件 -> Fluent Bit -> ClickHouse -- API 不写 MySQL -- 查询优先 ClickHouse(无 CH 时可能查不到数据) - ---- - -## 4.3 同时写入 MySQL + ClickHouse - -在“日志策略”中: -1. `存储类型` 选 **文件+MySQL+ClickHouse**。 -2. 设为 **公用**,并确保 **启用**。 -3. `日志文件路径` 填写有效路径(同上)。 -4. ClickHouse + Fluent Bit 同 4.2 要求。 - -结果: -- 写入:MySQL + ClickHouse(并行) -- 查询:优先 ClickHouse,失败可回退 MySQL - ---- - -## 5. 
配置生效链路图 - -```mermaid -flowchart LR - P[公用日志策略 type/writeTargets] --> C[EdgeAPI 解析 writeTargets] - C --> N[下发到 EdgeNode GlobalServerConfig.HTTPAccessLog.WriteTargets] - N --> W1[NeedWriteFile] - N --> W2[NeedReportToAPI] - W1 --> F[Node本地日志文件] - F --> FB[Fluent Bit] - FB --> CH[(ClickHouse.logs_ingest)] - W2 --> API[CreateHTTPAccessLogs] - API --> MYSQL[(MySQL访问日志表)] -``` - ---- - -## 6. 验证清单(建议上线前逐项过) - -1. 打开 `/servers/server/log`,持续压测 1~2 分钟。 -2. 检查最新日志是否持续上顶(不是停在旧时间段)。 -3. 错误日志筛选是否只显示 `status>=400`。 -4. 仅 CH 模式下,停掉 Fluent Bit 后确认告警和查询表现符合预期。 -5. MySQL+CH 模式下,临时断 CH,确认页面可回退 MySQL。 - ---- - -## 7. 常见问题 - -### Q1:策略里的“日志文件路径”是干嘛的? -- 这是策略 `file` 配置的必填项(API 侧校验)。 -- 即使你用 ClickHouse 链路,当前实现仍要求该字段有值。 -- 真正给 Fluent Bit 采集的是 **Node 目录**:`/var/log/edge/edge-node/*.log`。 - -### Q2:不勾“停用默认数据库存储”,会不会同时写默认库和独立日志库? -- 正常不会双写同一条。 -- 有独立日志库节点时优先写节点池;节点池不可用时才回退默认库。 - -### Q3:修改策略后要不要重启? -- 通常 1 分钟内自动刷新生效。 -- 若要立即生效:重启 `edge-api`,并在需要时重启 `edge-node`、`fluent-bit`。 - ---- - -## 8. DNS 日志与 HTTP 策略联动(新增) - -从当前版本开始,DNS 访问日志与 HTTP 访问日志共享同一套公用策略语义(`writeTargets`): - -- `WriteMySQL=true`:DNS 节点上报 API,API 写入 MySQL 分表。 -- `WriteClickHouse=true`:DNS 节点写本地 JSONL,Fluent Bit 采集写入 ClickHouse `dns_logs_ingest`。 -- 双开即双写;双关即不写(仅保留内存处理,不入库)。 - -### 8.1 DNS 写入链路 - -```mermaid -flowchart LR - A[EdgeDNS 产生日志] --> B{writeTargets} - B -->|MySQL=true| C[CreateNSAccessLogs] - C --> D[(MySQL edgeNSAccessLogs_YYYYMMDD)] - B -->|ClickHouse=true| E[/var/log/edge/edge-dns/access.log] - E --> F[Fluent Bit] - F --> G[(ClickHouse dns_logs_ingest)] -``` - -### 8.2 DNS 查询链路 - -```mermaid -flowchart TD - Q[/ns/clusters/accessLogs] --> R{策略是否启用ClickHouse且CH可用} - R -->|是| CH[(dns_logs_ingest)] - R -->|否| M{策略是否启用MySQL} - CH -->|查询失败| M - M -->|是| MY[(MySQL edgeNSAccessLogs_YYYYMMDD)] - M -->|否| E[返回空列表] -``` - -### 8.3 组合场景说明(DNS) - -| 策略 | 写入 | 读取 | -|------|------|------| -| 仅 MySQL | API -> MySQL | MySQL | -| 仅 ClickHouse | 本地文件 -> Fluent Bit -> ClickHouse | ClickHouse | -| MySQL + ClickHouse | API -> MySQL + 
本地文件 -> Fluent Bit -> ClickHouse | 优先 ClickHouse,失败回退 MySQL | - -### 8.4 DNS 相关必须配置 - -1. `EdgeAPI` 配置 ClickHouse 连接(仅读 CH 时必须)。 -2. `deploy/fluent-bit/fluent-bit.conf` 已包含 DNS 输入:`/var/log/edge/edge-dns/*.log`。 -3. ClickHouse 已创建 `dns_logs_ingest` 表。 -4. EdgeDNS 运行用户对 `EDGE_DNS_LOG_DIR`(默认 `/var/log/edge/edge-dns`)有写权限。