diff --git a/README.md b/README.md index ceca244..6517068 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ The exporter can be configured using env variables or command flags. | `SCRAPE_DELAY` | scrape delay in seconds, default `300` | | `SCRAPE_INTERVAL` | scrape interval in seconds (will query cloudflare every SCRAPE_INTERVAL seconds), default `60` | | `METRICS_DENYLIST` | (Optional) cloudflare-exporter metrics to not export, comma delimited list of cloudflare-exporter metrics. If not set, all metrics are exported | +| `KV_NAMESPACE_IDS` | (Optional) KV namespace IDs to track individually, comma delimited. Unlisted namespaces are aggregated as `other` | | `ENABLE_PPROF` | (Optional) enable pprof profiling endpoints at `/debug/pprof/`. Accepts `true` or `false`, default `false`. **Warning**: Only enable in development/debugging environments | | `ZONE_` | `DEPRECATED since 0.0.5` (optional) Zone ID. Add zones you want to scrape by adding env vars in this format. You can find the zone ids in Cloudflare dashboards. | | `LOG_LEVEL` | Set loglevel. Options are error, warn, info, debug. default `error` | @@ -84,6 +85,7 @@ Corresponding flags: -metrics_path="/metrics": path for metrics, default /metrics -scrape_delay=300: scrape delay in seconds, defaults to 300 -scrape_interval=60: scrape interval in seconds, defaults to 60 + -kv_namespace_ids="": KV namespace IDs to track individually, comma delimited -metrics_denylist="": cloudflare-exporter metrics to not export, comma delimited list -enable_pprof=false: enable pprof profiling endpoints at /debug/pprof/ -log_level="error": log level(error,warn,info,debug) @@ -127,6 +129,8 @@ Note: `ZONE_` configuration is not supported as flag. 
# HELP cloudflare_r2_operation_count Number of operations performed by R2 # HELP cloudflare_r2_storage_bytes Storage used by R2 # HELP cloudflare_r2_storage_total_bytes Total storage used by R2 +# HELP cloudflare_kv_requests_count Number of KV operations by namespace and action type +# HELP cloudflare_kv_latency KV operation latency quantiles (milliseconds) ``` ## Helm chart repository diff --git a/cloudflare.go b/cloudflare.go index ea5d23b..b4f1b20 100644 --- a/cloudflare.go +++ b/cloudflare.go @@ -299,6 +299,30 @@ type lbResp struct { ZoneTag string `json:"zoneTag"` } +type cloudflareResponseKV struct { + Viewer struct { + Accounts []kvAccountResp `json:"accounts"` + } `json:"viewer"` +} + +type kvAccountResp struct { + KvOperationsAdaptiveGroups []struct { + Dimensions struct { + NamespaceID string `json:"namespaceId"` + ActionType string `json:"actionType"` + } `json:"dimensions"` + Sum struct { + Requests uint64 `json:"requests"` + } `json:"sum"` + Quantiles struct { + LatencyMsP50 float32 `json:"latencyMsP50"` + LatencyMsP75 float32 `json:"latencyMsP75"` + LatencyMsP99 float32 `json:"latencyMsP99"` + LatencyMsP999 float32 `json:"latencyMsP999"` + } `json:"quantiles"` + } `json:"kvOperationsAdaptiveGroups"` +} + type cloudflareResponseDNSFirewall struct { Viewer struct { Accounts []dnsFirewallAccountResp `json:"accounts"` @@ -1085,6 +1109,51 @@ func filterNonFreePlanZones(zones []cfzones.Zone) (filteredZones []cfzones.Zone) return } +func fetchKVOperations(accountID string) (*cloudflareResponseKV, error) { + request := graphql.NewRequest(` + query ($accountID: String!, $mintime: Time!, $maxtime: Time!, $limit: Int!) 
{ + viewer { + accounts(filter: {accountTag: $accountID}) { + kvOperationsAdaptiveGroups(limit: $limit, filter: {datetime_geq: $mintime, datetime_lt: $maxtime}) { + dimensions { + namespaceId + actionType + } + sum { + requests + } + quantiles { + latencyMsP50 + latencyMsP75 + latencyMsP99 + latencyMsP999 + } + } + } + } + }`) + + now, now1mAgo := GetTimeRange() + request.Var("limit", gqlQueryLimit) + request.Var("maxtime", now) + request.Var("mintime", now1mAgo) + request.Var("accountID", accountID) + + gql.Mu.RLock() + defer gql.Mu.RUnlock() + + ctx, cancel := context.WithTimeout(context.Background(), cftimeout) + defer cancel() + + var resp cloudflareResponseKV + if err := gql.Client.Run(ctx, request, &resp); err != nil { + log.Errorf("error fetching KV operations, err:%v", err) + return nil, err + } + + return &resp, nil +} + func fetchDNSFirewallTotals(accountID string) (*cloudflareResponseDNSFirewall, error) { request := graphql.NewRequest(` query ($accountID: string, $mintime: Time!, $maxtime: Time!, $limit: Int!) { diff --git a/main.go b/main.go index 5e9bcc6..202e5de 100644 --- a/main.go +++ b/main.go @@ -25,6 +25,10 @@ var ( cftimeout time.Duration gql *GraphQL log = logrus.New() + + // kvTrackedNamespaces is the set of KV namespace IDs that get their own + // namespace_id label. All other namespaces are aggregated under "other". + kvTrackedNamespaces map[string]struct{} ) // var ( @@ -110,6 +114,7 @@ func fetchMetrics(deniedMetricsSet MetricsSet) { for _, a := range accounts { go fetchWorkerAnalytics(a, &wg) + go fetchKVAnalytics(a, &wg, deniedMetricsSet) go fetchLogpushAnalyticsForAccount(a, &wg) go fetchR2StorageForAccount(a, &wg) go fetchLoadblancerPoolsHealth(a, &wg) @@ -173,6 +178,18 @@ func runExporter() { log.Debugf("Metrics set: %v", metricsSet) mustRegisterMetrics(metricsSet) + // Build tracked KV namespace set from config. 
+ kvTrackedNamespaces = make(map[string]struct{}) + if ids := viper.GetString("kv_namespace_ids"); ids != "" { + for _, id := range strings.Split(ids, ",") { + id = strings.TrimSpace(id) + if id != "" { + kvTrackedNamespaces[id] = struct{}{} + } + } + } + log.Infof("Tracking %d KV namespace IDs", len(kvTrackedNamespaces)) + scrapeInterval := time.Duration(viper.GetInt("scrape_interval")) * time.Second log.Info("Scrape interval set to ", scrapeInterval) @@ -256,6 +273,10 @@ func main() { viper.BindEnv("cf_timeout") viper.SetDefault("cf_timeout", 10*time.Second) + flags.String("kv_namespace_ids", "", "KV namespace IDs to track individually, comma delimited. Unlisted namespaces are aggregated as 'other'") + viper.BindEnv("kv_namespace_ids") + viper.SetDefault("kv_namespace_ids", "") + flags.String("metrics_denylist", "", "metrics to not expose, comma delimited list") viper.BindEnv("metrics_denylist") viper.SetDefault("metrics_denylist", "") diff --git a/prometheus.go b/prometheus.go index 1f8395a..48dff94 100644 --- a/prometheus.go +++ b/prometheus.go @@ -64,6 +64,8 @@ const ( tunnelConnectorInfoMetricName MetricName = "cloudflare_tunnel_connector_info" tunnelConnectorActiveConnectionsMetricName MetricName = "cloudflare_tunnel_connector_active_connections" dnsFirewallQueryCountMetricName MetricName = "cloudflare_dns_firewall_query_count" + kvRequestsMetricName MetricName = "cloudflare_kv_requests_count" + kvLatencyMetricName MetricName = "cloudflare_kv_latency" ) type MetricsSet map[MetricName]struct{} @@ -336,6 +338,16 @@ var ( Help: "Reports number of active connections for a Cloudflare Tunnel connector", }, []string{"account", "tunnel_id", "client_id"}) + kvRequests = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: kvRequestsMetricName.String(), + Help: "Number of KV operations by namespace and action type", + }, []string{"namespace_id", "action_type", "account"}) + + kvLatency = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: 
kvLatencyMetricName.String(), + Help: "KV operation latency quantiles (milliseconds)", + }, []string{"namespace_id", "action_type", "account", "quantile"}) + dnsFirewallQueryCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: dnsFirewallQueryCountMetricName.String(), Help: "DNS Firewall query count by query type and response code", @@ -388,6 +400,8 @@ func buildAllMetricsSet() MetricsSet { allMetricsSet.Add(tunnelConnectorInfoMetricName) allMetricsSet.Add(tunnelConnectorActiveConnectionsMetricName) allMetricsSet.Add(dnsFirewallQueryCountMetricName) + allMetricsSet.Add(kvRequestsMetricName) + allMetricsSet.Add(kvLatencyMetricName) return allMetricsSet } @@ -537,6 +551,12 @@ func mustRegisterMetrics(deniedMetrics MetricsSet) { if !deniedMetrics.Has(dnsFirewallQueryCountMetricName) { prometheus.MustRegister(dnsFirewallQueryCount) } + if !deniedMetrics.Has(kvRequestsMetricName) { + prometheus.MustRegister(kvRequests) + } + if !deniedMetrics.Has(kvLatencyMetricName) { + prometheus.MustRegister(kvLatency) + } } func fetchLoadblancerPoolsHealth(account cfaccounts.Account, wg *sync.WaitGroup) { @@ -607,6 +627,38 @@ func fetchWorkerAnalytics(account cfaccounts.Account, wg *sync.WaitGroup) { } } +func fetchKVAnalytics(account cfaccounts.Account, wg *sync.WaitGroup, deniedMetricsSet MetricsSet) { + wg.Add(1) + defer wg.Done() + + r, err := fetchKVOperations(account.ID) + if err != nil { + log.Error("failed to fetch KV operations for account ", account.ID, ": ", err) + return + } + + accountName := strings.ToLower(strings.ReplaceAll(account.Name, " ", "-")) + + for _, a := range r.Viewer.Accounts { + for _, kv := range a.KvOperationsAdaptiveGroups { + nsID := kv.Dimensions.NamespaceID + if _, tracked := kvTrackedNamespaces[nsID]; !tracked { + nsID = "other" + } + + if !deniedMetricsSet.Has(kvRequestsMetricName) { + kvRequests.With(prometheus.Labels{"namespace_id": nsID, "action_type": kv.Dimensions.ActionType, "account": accountName}).Add(float64(kv.Sum.Requests)) + } + 
if !deniedMetricsSet.Has(kvLatencyMetricName) { + kvLatency.With(prometheus.Labels{"namespace_id": nsID, "action_type": kv.Dimensions.ActionType, "account": accountName, "quantile": "0.5"}).Set(float64(kv.Quantiles.LatencyMsP50)) + kvLatency.With(prometheus.Labels{"namespace_id": nsID, "action_type": kv.Dimensions.ActionType, "account": accountName, "quantile": "0.75"}).Set(float64(kv.Quantiles.LatencyMsP75)) + kvLatency.With(prometheus.Labels{"namespace_id": nsID, "action_type": kv.Dimensions.ActionType, "account": accountName, "quantile": "0.99"}).Set(float64(kv.Quantiles.LatencyMsP99)) + kvLatency.With(prometheus.Labels{"namespace_id": nsID, "action_type": kv.Dimensions.ActionType, "account": accountName, "quantile": "0.999"}).Set(float64(kv.Quantiles.LatencyMsP999)) + } + } + } +} + func fetchLogpushAnalyticsForAccount(account cfaccounts.Account, wg *sync.WaitGroup) { wg.Add(1) defer wg.Done()