diff --git a/cmd/addon-operator/main.go b/cmd/addon-operator/main.go index c01bbc02c..4290cbe49 100644 --- a/cmd/addon-operator/main.go +++ b/cmd/addon-operator/main.go @@ -17,10 +17,12 @@ import ( addon_operator "github.com/flant/addon-operator/pkg/addon-operator" "github.com/flant/addon-operator/pkg/app" "github.com/flant/addon-operator/pkg/kube_config_manager/backend/configmap" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/utils/stdliblogtolog" "github.com/flant/kube-client/klogtolog" shapp "github.com/flant/shell-operator/pkg/app" "github.com/flant/shell-operator/pkg/debug" + shmetrics "github.com/flant/shell-operator/pkg/metrics" utils_signal "github.com/flant/shell-operator/pkg/utils/signal" ) @@ -71,7 +73,12 @@ func start(logger *log.Logger) func(_ *kingpin.ParseContext) error { ctx := context.Background() - operator := addon_operator.NewAddonOperator(ctx, addon_operator.WithLogger(logger.Named("addon-operator"))) + // Initialize metric names with the configured prefix + shmetrics.InitMetrics(shapp.PrometheusMetricsPrefix) + // Initialize addon-operator specific metrics + metrics.InitMetrics(shapp.PrometheusMetricsPrefix) + + operator := addon_operator.NewAddonOperator(ctx, nil, nil, addon_operator.WithLogger(logger.Named("addon-operator"))) operator.StartAPIServer() diff --git a/go.mod b/go.mod index c938bbe73..dbc021083 100644 --- a/go.mod +++ b/go.mod @@ -5,12 +5,12 @@ go 1.24.0 require ( github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc github.com/deckhouse/deckhouse/pkg/log v0.1.0 - github.com/deckhouse/deckhouse/pkg/metrics-storage v0.2.1 + github.com/deckhouse/deckhouse/pkg/metrics-storage v0.3.0 github.com/deckhouse/module-sdk v0.5.0 github.com/dominikbraun/graph v0.23.0 github.com/ettle/strcase v0.2.0 github.com/flant/kube-client v1.5.0 - github.com/flant/shell-operator v1.12.0 + github.com/flant/shell-operator v1.12.1 github.com/go-chi/chi/v5 v5.2.2 github.com/go-openapi/loads v0.23.1 
github.com/go-openapi/spec v0.22.0 diff --git a/go.sum b/go.sum index 4f496c2a1..0bd174f90 100644 --- a/go.sum +++ b/go.sum @@ -100,8 +100,8 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/deckhouse/deckhouse/pkg/log v0.1.0 h1:2aPfyiHHSIJlX4x7ysyPOaIb7CLmyY+hUf9uDb8TYd8= github.com/deckhouse/deckhouse/pkg/log v0.1.0/go.mod h1:pbAxTSDcPmwyl3wwKDcEB3qdxHnRxqTV+J0K+sha8bw= -github.com/deckhouse/deckhouse/pkg/metrics-storage v0.2.1 h1:DhMHRzmaVNXASH+koIy9gK25GU/EaK+zC6sVpacqWRw= -github.com/deckhouse/deckhouse/pkg/metrics-storage v0.2.1/go.mod h1:Rz++SzCLkFW03WGgftnn91TimGU2shiKb5S/YuxcBuE= +github.com/deckhouse/deckhouse/pkg/metrics-storage v0.3.0 h1:xZvbKuexrSQGEw6CB4n3UC7XbOb9QNLbm8UhcGZ2R1I= +github.com/deckhouse/deckhouse/pkg/metrics-storage v0.3.0/go.mod h1:Rz++SzCLkFW03WGgftnn91TimGU2shiKb5S/YuxcBuE= github.com/deckhouse/module-sdk v0.5.0 h1:b2GJUzMKQLr7oJVJy5lXHvyymNyvNiFXpBie7MwEWwE= github.com/deckhouse/module-sdk v0.5.0/go.mod h1:+EbBnP8z+poIihgL4l1oxHng5ePqDUK44c39u7sEBss= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= @@ -155,8 +155,8 @@ github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/flant/kube-client v1.5.0 h1:6QZOZy3uk58Bh9YUn4CnhEz13og/cEGXB2uBZ1gWwtM= github.com/flant/kube-client v1.5.0/go.mod h1:hpJZ0FnDKHW3r5q5SYQgBrTw9k94q4+dcnJ4uOGYBHc= -github.com/flant/shell-operator v1.12.0 h1:VAi6EEqG5aCKgPlE+XaXmpdG5dvGPUkrHSstAREvSiU= -github.com/flant/shell-operator v1.12.0/go.mod h1:MlEVeHh88sL5LCiMmLmtYJZSV7XL/WHIJBKJeNviExE= +github.com/flant/shell-operator v1.12.1 h1:ARrZaPiiQVjPjXombUt//doaHFCBlzGATgsUpQHT85o= +github.com/flant/shell-operator v1.12.1/go.mod 
h1:RFXNvfK/v18J5u4RMqfFSDFhwJvMNPv5OR3BpioyxI0= github.com/flopp/go-findfont v0.1.0 h1:lPn0BymDUtJo+ZkV01VS3661HL6F4qFlkhcJN55u6mU= github.com/flopp/go-findfont v0.1.0/go.mod h1:wKKxRDjD024Rh7VMwoU90i6ikQRCr+JTHB5n4Ejkqvw= github.com/fluxcd/flagger v1.36.1 h1:X2PumtNwZz9YSGaOtZLFm2zAKLgHhFkbNv8beg7ifyc= diff --git a/pkg/addon-operator/bootstrap.go b/pkg/addon-operator/bootstrap.go index 0188480ce..a35ed1718 100644 --- a/pkg/addon-operator/bootstrap.go +++ b/pkg/addon-operator/bootstrap.go @@ -9,6 +9,7 @@ import ( "github.com/flant/addon-operator/pkg/app" "github.com/flant/addon-operator/pkg/kube_config_manager" "github.com/flant/addon-operator/pkg/kube_config_manager/backend" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager" taskservice "github.com/flant/addon-operator/pkg/task/service" shapp "github.com/flant/shell-operator/pkg/app" @@ -75,8 +76,8 @@ func (op *AddonOperator) Assemble(debugServer *debug.Server) error { } // Start background updaters for metrics - StartLiveTicksUpdater(op.engine.MetricStorage) - StartTasksQueueLengthUpdater(op.engine.MetricStorage, op.engine.TaskQueues) + metrics.StartLiveTicksUpdater(op.engine.MetricStorage) + metrics.StartTasksQueueLengthUpdater(op.engine.MetricStorage, op.engine.TaskQueues) // Register debug HTTP endpoints to inspect internal state op.engine.RegisterDebugQueueRoutes(debugServer) diff --git a/pkg/addon-operator/handler_module_manager.go b/pkg/addon-operator/handler_module_manager.go index 64755c367..1e101a180 100644 --- a/pkg/addon-operator/handler_module_manager.go +++ b/pkg/addon-operator/handler_module_manager.go @@ -10,6 +10,7 @@ import ( "github.com/flant/addon-operator/pkg/addon-operator/converge" "github.com/flant/addon-operator/pkg/kube_config_manager/config" + "github.com/flant/addon-operator/pkg/metrics" dynamic_extender "github.com/flant/addon-operator/pkg/module_manager/scheduler/extenders/dynamically_enabled" "github.com/flant/addon-operator/pkg/task" 
"github.com/flant/addon-operator/pkg/utils" @@ -196,13 +197,13 @@ func (op *AddonOperator) StartModuleManagerEventHandler() { additionalDescription := fmt.Sprintf("%d absent module resources", len(HelmReleaseStatusEvent.Absent)) // helm reslease in unexpected state event if HelmReleaseStatusEvent.UnexpectedStatus { - op.engine.MetricStorage.CounterAdd("{PREFIX}modules_helm_release_redeployed_total", 1.0, map[string]string{"module": HelmReleaseStatusEvent.ModuleName}) + op.engine.MetricStorage.CounterAdd(metrics.ModulesHelmReleaseRedeployedTotal, 1.0, map[string]string{"module": HelmReleaseStatusEvent.ModuleName}) eventDescription = "HelmReleaseUnexpectedStatus" additionalDescription = "unexpected helm release status" } else { // some resources are missing and metrics are provided for _, manifest := range HelmReleaseStatusEvent.Absent { - op.engine.MetricStorage.CounterAdd("{PREFIX}modules_absent_resources_total", 1.0, map[string]string{"module": HelmReleaseStatusEvent.ModuleName, "resource": fmt.Sprintf("%s/%s/%s", manifest.Namespace(""), manifest.Kind(), manifest.Name())}) + op.engine.MetricStorage.CounterAdd(metrics.ModulesAbsentResourcesTotal, 1.0, map[string]string{"module": HelmReleaseStatusEvent.ModuleName, "resource": fmt.Sprintf("%s/%s/%s", manifest.Namespace(""), manifest.Kind(), manifest.Name())}) } } diff --git a/pkg/addon-operator/metrics.go b/pkg/addon-operator/metrics.go deleted file mode 100644 index c03fca60c..000000000 --- a/pkg/addon-operator/metrics.go +++ /dev/null @@ -1,160 +0,0 @@ -package addon_operator - -import ( - "context" - "time" - - metricsstorage "github.com/deckhouse/deckhouse/pkg/metrics-storage" - - "github.com/flant/addon-operator/pkg" - "github.com/flant/shell-operator/pkg/task/queue" -) - -var buckets_1msTo10s = []float64{ - 0.0, - 0.001, 0.002, 0.005, // 1,2,5 milliseconds - 0.01, 0.02, 0.05, // 10,20,50 milliseconds - 0.1, 0.2, 0.5, // 100,200,500 milliseconds - 1, 2, 5, // 1,2,5 seconds - 10, // 10 seconds -} - -// 
registerHookMetrics register metrics specified for addon-operator -func registerHookMetrics(metricStorage metricsstorage.Storage) { - // configuration metrics - _, _ = metricStorage.RegisterGauge( - "{PREFIX}binding_count", - []string{ - "module", - pkg.MetricKeyHook, - }) - // ConfigMap validation errors - _, _ = metricStorage.RegisterCounter("{PREFIX}config_values_errors_total", []string{}) - - // modules - _, _ = metricStorage.RegisterCounter("{PREFIX}modules_discover_errors_total", []string{}) - _, _ = metricStorage.RegisterCounter("{PREFIX}module_delete_errors_total", []string{"module"}) - - // module - _, _ = metricStorage.RegisterHistogram( - "{PREFIX}module_run_seconds", - []string{ - "module", - pkg.MetricKeyActivation, - }, - buckets_1msTo10s, - ) - _, _ = metricStorage.RegisterCounter("{PREFIX}module_run_errors_total", []string{"module"}) - - moduleHookLabels := []string{ - "module", - pkg.MetricKeyHook, - pkg.MetricKeyBinding, - "queue", - pkg.MetricKeyActivation, - } - _, _ = metricStorage.RegisterHistogram( - "{PREFIX}module_hook_run_seconds", - moduleHookLabels, - buckets_1msTo10s) - _, _ = metricStorage.RegisterHistogram( - "{PREFIX}module_hook_run_user_cpu_seconds", - moduleHookLabels, - buckets_1msTo10s) - _, _ = metricStorage.RegisterHistogram( - "{PREFIX}module_hook_run_sys_cpu_seconds", - moduleHookLabels, - buckets_1msTo10s) - _, _ = metricStorage.RegisterGauge("{PREFIX}module_hook_run_max_rss_bytes", moduleHookLabels) - _, _ = metricStorage.RegisterCounter("{PREFIX}module_hook_allowed_errors_total", moduleHookLabels) - _, _ = metricStorage.RegisterCounter("{PREFIX}module_hook_errors_total", moduleHookLabels) - _, _ = metricStorage.RegisterCounter("{PREFIX}module_hook_success_total", moduleHookLabels) - - // global hook running - globalHookLabels := []string{ - pkg.MetricKeyHook, - pkg.MetricKeyBinding, - "queue", - pkg.MetricKeyActivation, - } - _, _ = metricStorage.RegisterHistogram( - "{PREFIX}global_hook_run_seconds", - globalHookLabels, - 
buckets_1msTo10s) - _, _ = metricStorage.RegisterHistogram( - "{PREFIX}global_hook_run_user_cpu_seconds", - globalHookLabels, - buckets_1msTo10s) - _, _ = metricStorage.RegisterHistogram( - "{PREFIX}global_hook_run_sys_cpu_seconds", - globalHookLabels, - buckets_1msTo10s) - _, _ = metricStorage.RegisterGauge("{PREFIX}global_hook_run_max_rss_bytes", globalHookLabels) - _, _ = metricStorage.RegisterCounter("{PREFIX}global_hook_allowed_errors_total", globalHookLabels) - _, _ = metricStorage.RegisterCounter("{PREFIX}global_hook_errors_total", globalHookLabels) - _, _ = metricStorage.RegisterCounter("{PREFIX}global_hook_success_total", globalHookLabels) - - // converge duration - _, _ = metricStorage.RegisterCounter("{PREFIX}convergence_seconds", []string{pkg.MetricKeyActivation}) - _, _ = metricStorage.RegisterCounter("{PREFIX}convergence_total", []string{pkg.MetricKeyActivation}) - - // helm operations - _, _ = metricStorage.RegisterHistogram( - "{PREFIX}module_helm_seconds", - []string{ - "module", - pkg.MetricKeyActivation, - }, - buckets_1msTo10s) - _, _ = metricStorage.RegisterHistogram( - "{PREFIX}helm_operation_seconds", - []string{ - "module", - pkg.MetricKeyActivation, - "operation", - }, - buckets_1msTo10s) - - // task age - // hook_run task waiting time - _, _ = metricStorage.RegisterCounter( - "{PREFIX}task_wait_in_queue_seconds_total", - []string{ - "module", - pkg.MetricKeyHook, - pkg.MetricKeyBinding, - "queue", - }) -} - -// StartLiveTicksUpdater starts a goroutine that periodically updates -// the live_ticks metric every 10 seconds. -// This metric can be used to verify that addon-operator is alive and functioning. -func StartLiveTicksUpdater(metricStorage metricsstorage.Storage) { - // Addon-operator live ticks. 
- go func() { - for { - metricStorage.CounterAdd("{PREFIX}live_ticks", 1.0, map[string]string{}) - - time.Sleep(10 * time.Second) - } - }() -} - -// StartTasksQueueLengthUpdater starts a goroutine that periodically updates -// the tasks_queue_length metric every 5 seconds. -// This metric shows the number of pending tasks in each queue, which can be useful -// for monitoring system load and potential backlog issues. -func StartTasksQueueLengthUpdater(metricStorage metricsstorage.Storage, tqs *queue.TaskQueueSet) { - go func() { - for { - // Gather task queues lengths. - tqs.IterateSnapshot(context.TODO(), func(_ context.Context, queue *queue.TaskQueue) { - queueLen := float64(queue.Length()) - metricStorage.GaugeSet("{PREFIX}tasks_queue_length", queueLen, map[string]string{"queue": queue.Name}) - }) - - time.Sleep(5 * time.Second) - } - }() -} diff --git a/pkg/addon-operator/operator.go b/pkg/addon-operator/operator.go index 3c7880a19..e3f06d277 100644 --- a/pkg/addon-operator/operator.go +++ b/pkg/addon-operator/operator.go @@ -21,6 +21,7 @@ import ( "github.com/flant/addon-operator/pkg/helm_resources_manager" "github.com/flant/addon-operator/pkg/kube_config_manager" "github.com/flant/addon-operator/pkg/kube_config_manager/config" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager" gohook "github.com/flant/addon-operator/pkg/module_manager/go_hook" "github.com/flant/addon-operator/pkg/module_manager/models/hooks/kind" @@ -117,7 +118,7 @@ func WithOnConvergeFinish(callback func()) Option { } } -func NewAddonOperator(ctx context.Context, opts ...Option) *AddonOperator { +func NewAddonOperator(ctx context.Context, metricsStorage, hookMetricStorage metricsstorage.Storage, opts ...Option) *AddonOperator { cctx, cancel := context.WithCancel(ctx) ao := &AddonOperator{ @@ -136,7 +137,24 @@ func NewAddonOperator(ctx context.Context, opts ...Option) *AddonOperator { ao.Logger = log.NewLogger().Named("addon-operator") } - so 
:= shell_operator.NewShellOperator(cctx, shell_operator.WithLogger(ao.Logger.Named("shell-operator"))) + if metricsStorage == nil { + ao.Logger.Warn("MetricStorage is not provided, creating a new one") + + metricsStorage = metricsstorage.NewMetricStorage( + metricsstorage.WithLogger(ao.Logger.Named("metric-storage")), + ) + } + + if hookMetricStorage == nil { + ao.Logger.Warn("HookMetricStorage is not provided, creating a new one") + + hookMetricStorage = metricsstorage.NewMetricStorage( + metricsstorage.WithNewRegistry(), + metricsstorage.WithLogger(ao.Logger.Named("hook-metric-storage")), + ) + } + + so := shell_operator.NewShellOperator(cctx, metricsStorage, hookMetricStorage, shell_operator.WithLogger(ao.Logger.Named("shell-operator"))) // initialize logging before Assemble rc := runtimeConfig.NewConfig(ao.Logger) @@ -155,7 +173,10 @@ func NewAddonOperator(ctx context.Context, opts ...Option) *AddonOperator { panic(err) } - registerHookMetrics(so.HookMetricStorage) + // Register addon-operator specific metrics + if err := metrics.RegisterHookMetrics(so.HookMetricStorage); err != nil { + panic(fmt.Errorf("register hook metrics: %w", err)) + } labelSelector, err := metav1.ParseToLabelSelector(app.ExtraLabels) if err != nil { diff --git a/pkg/addon-operator/operator_test.go b/pkg/addon-operator/operator_test.go index a6b6f61ef..a1ef8ebd7 100644 --- a/pkg/addon-operator/operator_test.go +++ b/pkg/addon-operator/operator_test.go @@ -110,7 +110,7 @@ func assembleTestAddonOperator(t *testing.T, configPath string) (*AddonOperator, prometheus.DefaultRegisterer = registry // Assemble AddonOperator. 
- op := NewAddonOperator(context.Background(), WithLogger(log.NewNop())) + op := NewAddonOperator(context.Background(), nil, nil, WithLogger(log.NewNop())) op.engine.KubeClient = kubeClient // Mock helm client for ModuleManager result.helmClient = &mockhelm.Client{} @@ -139,12 +139,10 @@ func assembleTestAddonOperator(t *testing.T, configPath string) (*AddonOperator, Helm: op.Helm, HelmResourcesManager: op.HelmResourcesManager, MetricStorage: metricstorage.NewMetricStorage( - metricstorage.WithPrefix("addon_operator_"), metricstorage.WithLogger(log.NewNop()), metricstorage.WithNewRegistry(), ), HookMetricStorage: metricstorage.NewMetricStorage( - metricstorage.WithPrefix("addon_operator_"), metricstorage.WithLogger(log.NewNop()), metricstorage.WithNewRegistry(), ), diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 000000000..e27b4b4da --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,599 @@ +// Package metrics provides centralized metric names and registration functions for addon-operator. +// All metric names use variables to ensure consistency and prevent typos. +// The {PREFIX} placeholder is replaced by the metrics storage with the appropriate prefix. +package metrics + +import ( + "context" + "fmt" + "strings" + "time" + + metricsstorage "github.com/deckhouse/deckhouse/pkg/metrics-storage" + "github.com/deckhouse/deckhouse/pkg/metrics-storage/options" + + "github.com/flant/addon-operator/pkg" + "github.com/flant/shell-operator/pkg/task/queue" +) + +// Metric name variables organized by functional area. +// Each variable represents a unique metric name used throughout addon-operator. +// These variables are initialized with prefix replacement at startup. 
+var ( + // ============================================================================ + // Configuration Metrics + // ============================================================================ + // BindingCount tracks the number of bindings per module and hook + BindingCount = "{PREFIX}binding_count" + // ConfigValuesErrorsTotal counts ConfigMap validation errors + ConfigValuesErrorsTotal = "{PREFIX}config_values_errors_total" + + // ============================================================================ + // Module Metrics + // ============================================================================ + // ModulesDiscoverErrorsTotal counts errors during module discovery + ModulesDiscoverErrorsTotal = "{PREFIX}modules_discover_errors_total" + // ModuleDeleteErrorsTotal counts errors during module deletion + ModuleDeleteErrorsTotal = "{PREFIX}module_delete_errors_total" + // ModuleRunSeconds measures module execution time + ModuleRunSeconds = "{PREFIX}module_run_seconds" + // ModuleRunErrorsTotal counts module execution errors + ModuleRunErrorsTotal = "{PREFIX}module_run_errors_total" + // ModulesHelmReleaseRedeployedTotal counts Helm release redeployments + ModulesHelmReleaseRedeployedTotal = "{PREFIX}modules_helm_release_redeployed_total" + // ModulesAbsentResourcesTotal counts absent resources per module + ModulesAbsentResourcesTotal = "{PREFIX}modules_absent_resources_total" + // ModuleInfoMetricName tracks module information + ModuleInfoMetricName = "{PREFIX}mm_module_info" + // ModuleMaintenanceMetricName tracks module maintenance state + ModuleMaintenanceMetricName = "{PREFIX}mm_module_maintenance" + + // ============================================================================ + // Module Hook Metrics + // ============================================================================ + // ModuleHookRunSeconds measures module hook execution time + ModuleHookRunSeconds = "{PREFIX}module_hook_run_seconds" + // ModuleHookRunUserCPUSeconds measures 
module hook user CPU usage + ModuleHookRunUserCPUSeconds = "{PREFIX}module_hook_run_user_cpu_seconds" + // ModuleHookRunSysCPUSeconds measures module hook system CPU usage + ModuleHookRunSysCPUSeconds = "{PREFIX}module_hook_run_sys_cpu_seconds" + // ModuleHookRunMaxRSSBytes tracks maximum resident set size for module hooks + ModuleHookRunMaxRSSBytes = "{PREFIX}module_hook_run_max_rss_bytes" + // ModuleHookAllowedErrorsTotal counts allowed module hook errors + ModuleHookAllowedErrorsTotal = "{PREFIX}module_hook_allowed_errors_total" + // ModuleHookErrorsTotal counts module hook execution errors + ModuleHookErrorsTotal = "{PREFIX}module_hook_errors_total" + // ModuleHookSuccessTotal counts successful module hook executions + ModuleHookSuccessTotal = "{PREFIX}module_hook_success_total" + + // ============================================================================ + // Global Hook Metrics + // ============================================================================ + // GlobalHookRunSeconds measures global hook execution time + GlobalHookRunSeconds = "{PREFIX}global_hook_run_seconds" + // GlobalHookRunUserCPUSeconds measures global hook user CPU usage + GlobalHookRunUserCPUSeconds = "{PREFIX}global_hook_run_user_cpu_seconds" + // GlobalHookRunSysCPUSeconds measures global hook system CPU usage + GlobalHookRunSysCPUSeconds = "{PREFIX}global_hook_run_sys_cpu_seconds" + // GlobalHookRunMaxRSSBytes tracks maximum resident set size for global hooks + GlobalHookRunMaxRSSBytes = "{PREFIX}global_hook_run_max_rss_bytes" + // GlobalHookAllowedErrorsTotal counts allowed global hook errors + GlobalHookAllowedErrorsTotal = "{PREFIX}global_hook_allowed_errors_total" + // GlobalHookErrorsTotal counts global hook execution errors + GlobalHookErrorsTotal = "{PREFIX}global_hook_errors_total" + // GlobalHookSuccessTotal counts successful global hook executions + GlobalHookSuccessTotal = "{PREFIX}global_hook_success_total" + + // 
============================================================================ + // Convergence Metrics + // ============================================================================ + // ConvergenceSeconds measures convergence duration + ConvergenceSeconds = "{PREFIX}convergence_seconds" + // ConvergenceTotal counts convergence executions + ConvergenceTotal = "{PREFIX}convergence_total" + + // ============================================================================ + // Helm Operations Metrics + // ============================================================================ + // ModuleHelmSeconds measures Helm operation time for modules + ModuleHelmSeconds = "{PREFIX}module_helm_seconds" + // HelmOperationSeconds measures specific Helm operation durations + HelmOperationSeconds = "{PREFIX}helm_operation_seconds" + + // ============================================================================ + // Task Queue Metrics + // ============================================================================ + // TaskWaitInQueueSecondsTotal measures time tasks wait in queue + TaskWaitInQueueSecondsTotal = "{PREFIX}task_wait_in_queue_seconds_total" + // TasksQueueLength shows current length of task queues + TasksQueueLength = "{PREFIX}tasks_queue_length" + + // ============================================================================ + // Live Ticks Metrics + // ============================================================================ + // LiveTicks is a counter that increases every 10 seconds to indicate addon-operator is alive + LiveTicks = "{PREFIX}live_ticks" +) + +// Standard histogram buckets for timing metrics (1ms to 10s) +var buckets_1msTo10s = []float64{ + 0.0, + 0.001, 0.002, 0.005, // 1,2,5 milliseconds + 0.01, 0.02, 0.05, // 10,20,50 milliseconds + 0.1, 0.2, 0.5, // 100,200,500 milliseconds + 1, 2, 5, // 1,2,5 seconds + 10, // 10 seconds +} + +// ReplacePrefix replaces the {PREFIX} placeholder in a metric name with the provided prefix. 
+// This function is useful for testing or when you need to manually construct metric names +// with a specific prefix instead of relying on the metrics storage's automatic replacement. +func ReplacePrefix(metricName, prefix string) string { + return strings.ReplaceAll(metricName, "{PREFIX}", prefix) +} + +// InitMetrics initializes all metric name variables by replacing {PREFIX} placeholders +// with the provided prefix. This function should be called once at startup before +// registering any metrics. +func InitMetrics(prefix string) { + // ============================================================================ + // Configuration Metrics + // ============================================================================ + BindingCount = ReplacePrefix(BindingCount, prefix) + ConfigValuesErrorsTotal = ReplacePrefix(ConfigValuesErrorsTotal, prefix) + + // ============================================================================ + // Module Metrics + // ============================================================================ + ModulesDiscoverErrorsTotal = ReplacePrefix(ModulesDiscoverErrorsTotal, prefix) + ModuleDeleteErrorsTotal = ReplacePrefix(ModuleDeleteErrorsTotal, prefix) + ModuleRunSeconds = ReplacePrefix(ModuleRunSeconds, prefix) + ModuleRunErrorsTotal = ReplacePrefix(ModuleRunErrorsTotal, prefix) + ModulesHelmReleaseRedeployedTotal = ReplacePrefix(ModulesHelmReleaseRedeployedTotal, prefix) + ModulesAbsentResourcesTotal = ReplacePrefix(ModulesAbsentResourcesTotal, prefix) + ModuleInfoMetricName = ReplacePrefix(ModuleInfoMetricName, prefix) + ModuleMaintenanceMetricName = ReplacePrefix(ModuleMaintenanceMetricName, prefix) + + // ============================================================================ + // Module Hook Metrics + // ============================================================================ + ModuleHookRunSeconds = ReplacePrefix(ModuleHookRunSeconds, prefix) + ModuleHookRunUserCPUSeconds = ReplacePrefix(ModuleHookRunUserCPUSeconds, 
prefix) + ModuleHookRunSysCPUSeconds = ReplacePrefix(ModuleHookRunSysCPUSeconds, prefix) + ModuleHookRunMaxRSSBytes = ReplacePrefix(ModuleHookRunMaxRSSBytes, prefix) + ModuleHookAllowedErrorsTotal = ReplacePrefix(ModuleHookAllowedErrorsTotal, prefix) + ModuleHookErrorsTotal = ReplacePrefix(ModuleHookErrorsTotal, prefix) + ModuleHookSuccessTotal = ReplacePrefix(ModuleHookSuccessTotal, prefix) + + // ============================================================================ + // Global Hook Metrics + // ============================================================================ + GlobalHookRunSeconds = ReplacePrefix(GlobalHookRunSeconds, prefix) + GlobalHookRunUserCPUSeconds = ReplacePrefix(GlobalHookRunUserCPUSeconds, prefix) + GlobalHookRunSysCPUSeconds = ReplacePrefix(GlobalHookRunSysCPUSeconds, prefix) + GlobalHookRunMaxRSSBytes = ReplacePrefix(GlobalHookRunMaxRSSBytes, prefix) + GlobalHookAllowedErrorsTotal = ReplacePrefix(GlobalHookAllowedErrorsTotal, prefix) + GlobalHookErrorsTotal = ReplacePrefix(GlobalHookErrorsTotal, prefix) + GlobalHookSuccessTotal = ReplacePrefix(GlobalHookSuccessTotal, prefix) + + // ============================================================================ + // Convergence Metrics + // ============================================================================ + ConvergenceSeconds = ReplacePrefix(ConvergenceSeconds, prefix) + ConvergenceTotal = ReplacePrefix(ConvergenceTotal, prefix) + + // ============================================================================ + // Helm Operations Metrics + // ============================================================================ + ModuleHelmSeconds = ReplacePrefix(ModuleHelmSeconds, prefix) + HelmOperationSeconds = ReplacePrefix(HelmOperationSeconds, prefix) + + // ============================================================================ + // Task Queue Metrics + // ============================================================================ + TaskWaitInQueueSecondsTotal = 
ReplacePrefix(TaskWaitInQueueSecondsTotal, prefix) + TasksQueueLength = ReplacePrefix(TasksQueueLength, prefix) + + // ============================================================================ + // Live Ticks Metrics + // ============================================================================ + LiveTicks = ReplacePrefix(LiveTicks, prefix) +} + +// ============================================================================ +// Registration Functions +// ============================================================================ + +// RegisterHookMetrics registers all addon-operator specific metrics with the provided storage. +// This includes configuration, module, hook, convergence, Helm, and task queue metrics. +// Returns an error if any metric registration fails. +func RegisterHookMetrics(metricStorage metricsstorage.Storage) error { + // Register configuration metrics + if err := registerConfigurationMetrics(metricStorage); err != nil { + return fmt.Errorf("register configuration metrics: %w", err) + } + + // Register module metrics + if err := registerModuleMetrics(metricStorage); err != nil { + return fmt.Errorf("register module metrics: %w", err) + } + + // Register module hook metrics + if err := registerModuleHookMetrics(metricStorage); err != nil { + return fmt.Errorf("register module hook metrics: %w", err) + } + + // Register global hook metrics + if err := registerGlobalHookMetrics(metricStorage); err != nil { + return fmt.Errorf("register global hook metrics: %w", err) + } + + // Register convergence metrics + if err := registerConvergenceMetrics(metricStorage); err != nil { + return fmt.Errorf("register convergence metrics: %w", err) + } + + // Register Helm metrics + if err := registerHelmMetrics(metricStorage); err != nil { + return fmt.Errorf("register helm metrics: %w", err) + } + + // Register task queue metrics + if err := registerTaskQueueMetrics(metricStorage); err != nil { + return fmt.Errorf("register task queue metrics: %w", err) 
+ } + + return nil +} + +// registerConfigurationMetrics registers metrics related to configuration and bindings +func registerConfigurationMetrics(metricStorage metricsstorage.Storage) error { + _, err := metricStorage.RegisterGauge( + BindingCount, + []string{"module", pkg.MetricKeyHook}, + options.WithHelp("Number of bindings per module and hook"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", BindingCount, err) + } + + _, err = metricStorage.RegisterCounter( + ConfigValuesErrorsTotal, + []string{}, + options.WithHelp("Counter of ConfigMap validation errors"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ConfigValuesErrorsTotal, err) + } + + return nil +} + +// registerModuleMetrics registers metrics related to module operations +func registerModuleMetrics(metricStorage metricsstorage.Storage) error { + _, err := metricStorage.RegisterCounter( + ModulesDiscoverErrorsTotal, + []string{}, + options.WithHelp("Counter of errors during module discovery"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModulesDiscoverErrorsTotal, err) + } + + _, err = metricStorage.RegisterCounter( + ModuleDeleteErrorsTotal, + []string{"module"}, + options.WithHelp("Counter of errors during module deletion"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleDeleteErrorsTotal, err) + } + + _, err = metricStorage.RegisterHistogram( + ModuleRunSeconds, + []string{"module", pkg.MetricKeyActivation}, + buckets_1msTo10s, + options.WithHelp("Histogram of module execution times in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleRunSeconds, err) + } + + _, err = metricStorage.RegisterCounter( + ModuleRunErrorsTotal, + []string{"module"}, + options.WithHelp("Counter of module execution errors"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleRunErrorsTotal, err) + } + + _, err = metricStorage.RegisterCounter( + 
ModulesHelmReleaseRedeployedTotal, + []string{"module"}, + options.WithHelp("Counter of Helm release redeployments per module"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModulesHelmReleaseRedeployedTotal, err) + } + + _, err = metricStorage.RegisterCounter( + ModulesAbsentResourcesTotal, + []string{"module", "resource"}, + options.WithHelp("Counter of absent resources per module and resource"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModulesAbsentResourcesTotal, err) + } + + return nil +} + +// registerModuleHookMetrics registers metrics related to module hook execution +func registerModuleHookMetrics(metricStorage metricsstorage.Storage) error { + moduleHookLabels := []string{ + "module", + pkg.MetricKeyHook, + pkg.MetricKeyBinding, + "queue", + pkg.MetricKeyActivation, + } + + _, err := metricStorage.RegisterHistogram( + ModuleHookRunSeconds, + moduleHookLabels, + buckets_1msTo10s, + options.WithHelp("Histogram of module hook execution times in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleHookRunSeconds, err) + } + + _, err = metricStorage.RegisterHistogram( + ModuleHookRunUserCPUSeconds, + moduleHookLabels, + buckets_1msTo10s, + options.WithHelp("Histogram of module hook user CPU usage in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleHookRunUserCPUSeconds, err) + } + + _, err = metricStorage.RegisterHistogram( + ModuleHookRunSysCPUSeconds, + moduleHookLabels, + buckets_1msTo10s, + options.WithHelp("Histogram of module hook system CPU usage in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleHookRunSysCPUSeconds, err) + } + + _, err = metricStorage.RegisterGauge( + ModuleHookRunMaxRSSBytes, + moduleHookLabels, + options.WithHelp("Gauge of maximum resident set size used by module hook in bytes"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleHookRunMaxRSSBytes, 
err) + } + + _, err = metricStorage.RegisterCounter( + ModuleHookAllowedErrorsTotal, + moduleHookLabels, + options.WithHelp("Counter of module hook execution errors that are allowed to fail (allowFailure: true)"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleHookAllowedErrorsTotal, err) + } + + _, err = metricStorage.RegisterCounter( + ModuleHookErrorsTotal, + moduleHookLabels, + options.WithHelp("Counter of module hook execution errors (allowFailure: false)"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleHookErrorsTotal, err) + } + + _, err = metricStorage.RegisterCounter( + ModuleHookSuccessTotal, + moduleHookLabels, + options.WithHelp("Counter of successful module hook executions"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleHookSuccessTotal, err) + } + + return nil +} + +// registerGlobalHookMetrics registers metrics related to global hook execution +func registerGlobalHookMetrics(metricStorage metricsstorage.Storage) error { + globalHookLabels := []string{ + pkg.MetricKeyHook, + pkg.MetricKeyBinding, + "queue", + pkg.MetricKeyActivation, + } + + _, err := metricStorage.RegisterHistogram( + GlobalHookRunSeconds, + globalHookLabels, + buckets_1msTo10s, + options.WithHelp("Histogram of global hook execution times in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", GlobalHookRunSeconds, err) + } + + _, err = metricStorage.RegisterHistogram( + GlobalHookRunUserCPUSeconds, + globalHookLabels, + buckets_1msTo10s, + options.WithHelp("Histogram of global hook user CPU usage in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", GlobalHookRunUserCPUSeconds, err) + } + + _, err = metricStorage.RegisterHistogram( + GlobalHookRunSysCPUSeconds, + globalHookLabels, + buckets_1msTo10s, + options.WithHelp("Histogram of global hook system CPU usage in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", 
GlobalHookRunSysCPUSeconds, err) + } + + _, err = metricStorage.RegisterGauge( + GlobalHookRunMaxRSSBytes, + globalHookLabels, + options.WithHelp("Gauge of maximum resident set size used by global hook in bytes"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", GlobalHookRunMaxRSSBytes, err) + } + + _, err = metricStorage.RegisterCounter( + GlobalHookAllowedErrorsTotal, + globalHookLabels, + options.WithHelp("Counter of global hook execution errors that are allowed to fail (allowFailure: true)"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", GlobalHookAllowedErrorsTotal, err) + } + + _, err = metricStorage.RegisterCounter( + GlobalHookErrorsTotal, + globalHookLabels, + options.WithHelp("Counter of global hook execution errors (allowFailure: false)"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", GlobalHookErrorsTotal, err) + } + + _, err = metricStorage.RegisterCounter( + GlobalHookSuccessTotal, + globalHookLabels, + options.WithHelp("Counter of successful global hook executions"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", GlobalHookSuccessTotal, err) + } + + return nil +} + +// registerConvergenceMetrics registers metrics related to convergence operations +func registerConvergenceMetrics(metricStorage metricsstorage.Storage) error { + _, err := metricStorage.RegisterCounter( + ConvergenceSeconds, + []string{pkg.MetricKeyActivation}, + options.WithHelp("Counter of convergence duration in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ConvergenceSeconds, err) + } + + _, err = metricStorage.RegisterCounter( + ConvergenceTotal, + []string{pkg.MetricKeyActivation}, + options.WithHelp("Counter of convergence executions"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ConvergenceTotal, err) + } + + return nil +} + +// registerHelmMetrics registers metrics related to Helm operations +func registerHelmMetrics(metricStorage 
metricsstorage.Storage) error { + _, err := metricStorage.RegisterHistogram( + ModuleHelmSeconds, + []string{"module", pkg.MetricKeyActivation}, + buckets_1msTo10s, + options.WithHelp("Histogram of Helm operation times for modules in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", ModuleHelmSeconds, err) + } + + _, err = metricStorage.RegisterHistogram( + HelmOperationSeconds, + []string{"module", pkg.MetricKeyActivation, "operation"}, + buckets_1msTo10s, + options.WithHelp("Histogram of specific Helm operation durations in seconds"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", HelmOperationSeconds, err) + } + + return nil +} + +// registerTaskQueueMetrics registers metrics related to task queue operations +func registerTaskQueueMetrics(metricStorage metricsstorage.Storage) error { + _, err := metricStorage.RegisterCounter( + TaskWaitInQueueSecondsTotal, + []string{"module", pkg.MetricKeyHook, pkg.MetricKeyBinding, "queue"}, + options.WithHelp("Counter of seconds that tasks waited in queue before execution"), + ) + if err != nil { + return fmt.Errorf("can not register %s: %w", TaskWaitInQueueSecondsTotal, err) + } + + return nil +} + +// ============================================================================ +// Live Metric Updaters +// ============================================================================ + +// StartLiveTicksUpdater starts a goroutine that periodically updates +// the live_ticks metric every 10 seconds. +// This metric can be used to verify that addon-operator is alive and functioning. 
+func StartLiveTicksUpdater(metricStorage metricsstorage.Storage) { + // Register the live ticks counter + _, _ = metricStorage.RegisterCounter( + LiveTicks, + []string{}, + options.WithHelp("Counter that increases every 10 seconds to indicate addon-operator is alive"), + ) + + // Start the updater goroutine + go func() { + for { + metricStorage.CounterAdd(LiveTicks, 1.0, map[string]string{}) + time.Sleep(10 * time.Second) + } + }() +} + +// StartTasksQueueLengthUpdater starts a goroutine that periodically updates +// the tasks_queue_length metric every 5 seconds. +// This metric shows the number of pending tasks in each queue, which can be useful +// for monitoring system load and potential backlog issues. +func StartTasksQueueLengthUpdater(metricStorage metricsstorage.Storage, tqs *queue.TaskQueueSet) { + // Register the tasks queue length gauge + _, _ = metricStorage.RegisterGauge( + TasksQueueLength, + []string{"queue"}, + options.WithHelp("Gauge showing the length of the task queue"), + ) + + // Start the updater goroutine + go func() { + for { + // Gather task queues lengths. 
+ tqs.IterateSnapshot(context.TODO(), func(_ context.Context, queue *queue.TaskQueue) { + queueLen := float64(queue.Length()) + metricStorage.GaugeSet(TasksQueueLength, queueLen, map[string]string{"queue": queue.Name}) + }) + + time.Sleep(5 * time.Second) + } + }() +} diff --git a/pkg/module_manager/models/modules/basic.go b/pkg/module_manager/models/modules/basic.go index dc829fe62..66660fa64 100644 --- a/pkg/module_manager/models/modules/basic.go +++ b/pkg/module_manager/models/modules/basic.go @@ -24,6 +24,7 @@ import ( "github.com/flant/addon-operator/pkg" "github.com/flant/addon-operator/pkg/app" "github.com/flant/addon-operator/pkg/hook/types" + "github.com/flant/addon-operator/pkg/metrics" environmentmanager "github.com/flant/addon-operator/pkg/module_manager/environment_manager" "github.com/flant/addon-operator/pkg/module_manager/models/hooks" "github.com/flant/addon-operator/pkg/module_manager/models/hooks/kind" @@ -658,7 +659,7 @@ func (bm *BasicModule) RunHooksByBinding(ctx context.Context, binding sh_op_type func() { defer measure.Duration(func(d time.Duration) { - bm.dc.MetricStorage.HistogramObserve("{PREFIX}module_hook_run_seconds", d.Seconds(), metricLabels, nil) + bm.dc.MetricStorage.HistogramObserve(metrics.ModuleHookRunSeconds, d.Seconds(), metricLabels, nil) })() err = bm.executeHook(ctx, moduleHook, binding, []bindingcontext.BindingContext{bc}, logLabels, metricLabels) }() @@ -823,9 +824,9 @@ func (bm *BasicModule) RunEnabledScript(ctx context.Context, tmpDir string, prec "queue": logLabels["queue"], pkg.MetricKeyActivation: logLabels[pkg.LogKeyEventType], } - bm.dc.MetricStorage.HistogramObserve("{PREFIX}module_hook_run_sys_cpu_seconds", usage.Sys.Seconds(), metricLabels, nil) - bm.dc.MetricStorage.HistogramObserve("{PREFIX}module_hook_run_user_cpu_seconds", usage.User.Seconds(), metricLabels, nil) - bm.dc.MetricStorage.GaugeSet("{PREFIX}module_hook_run_max_rss_bytes", float64(usage.MaxRss)*1024, metricLabels) + 
bm.dc.MetricStorage.HistogramObserve(metrics.ModuleHookRunSysCPUSeconds, usage.Sys.Seconds(), metricLabels, nil) + bm.dc.MetricStorage.HistogramObserve(metrics.ModuleHookRunUserCPUSeconds, usage.User.Seconds(), metricLabels, nil) + bm.dc.MetricStorage.GaugeSet(metrics.ModuleHookRunMaxRSSBytes, float64(usage.MaxRss)*1024, metricLabels) } if err != nil { logEntry.Error("Fail to run enabled script", @@ -1034,9 +1035,9 @@ func (bm *BasicModule) executeHook(ctx context.Context, h *hooks.ModuleHook, bin hookResult, err := h.Execute(ctx, h.GetConfigVersion(), bctx, bm.safeName(), hookConfigValues, hookValues, logLabels) if hookResult != nil && hookResult.Usage != nil { - bm.dc.MetricStorage.HistogramObserve("{PREFIX}module_hook_run_sys_cpu_seconds", hookResult.Usage.Sys.Seconds(), metricLabels, nil) - bm.dc.MetricStorage.HistogramObserve("{PREFIX}module_hook_run_user_cpu_seconds", hookResult.Usage.User.Seconds(), metricLabels, nil) - bm.dc.MetricStorage.GaugeSet("{PREFIX}module_hook_run_max_rss_bytes", float64(hookResult.Usage.MaxRss)*1024, metricLabels) + bm.dc.MetricStorage.HistogramObserve(metrics.ModuleHookRunSysCPUSeconds, hookResult.Usage.Sys.Seconds(), metricLabels, nil) + bm.dc.MetricStorage.HistogramObserve(metrics.ModuleHookRunUserCPUSeconds, hookResult.Usage.User.Seconds(), metricLabels, nil) + bm.dc.MetricStorage.GaugeSet(metrics.ModuleHookRunMaxRSSBytes, float64(hookResult.Usage.MaxRss)*1024, metricLabels) } if hookResult != nil && len(hookResult.Metrics) > 0 { diff --git a/pkg/module_manager/models/modules/basic_test.go b/pkg/module_manager/models/modules/basic_test.go index 9a282f704..fdb56ebd9 100644 --- a/pkg/module_manager/models/modules/basic_test.go +++ b/pkg/module_manager/models/modules/basic_test.go @@ -143,7 +143,6 @@ exit 0 logger := log.NewLogger() storage := metricsstorage.NewMetricStorage( - metricsstorage.WithPrefix("addon_operator_"), metricsstorage.WithLogger(logger), ) @@ -199,7 +198,6 @@ exit 0 logger := log.NewLogger() storage := 
metricsstorage.NewMetricStorage( - metricsstorage.WithPrefix("addon_operator_"), metricsstorage.WithLogger(logger), ) @@ -243,7 +241,6 @@ fi func stubDeps(logger *log.Logger) *hooks.HookExecutionDependencyContainer { st := metricsstorage.NewMetricStorage( - metricsstorage.WithPrefix("addon_operator_"), metricsstorage.WithLogger(logger), ) return &hooks.HookExecutionDependencyContainer{ diff --git a/pkg/module_manager/models/modules/global.go b/pkg/module_manager/models/modules/global.go index 7487cc8eb..b72a48567 100644 --- a/pkg/module_manager/models/modules/global.go +++ b/pkg/module_manager/models/modules/global.go @@ -16,6 +16,7 @@ import ( "github.com/flant/addon-operator/pkg" "github.com/flant/addon-operator/pkg/hook/types" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager/models/hooks" "github.com/flant/addon-operator/pkg/module_manager/models/hooks/kind" "github.com/flant/addon-operator/pkg/utils" @@ -202,9 +203,9 @@ func (gm *GlobalModule) executeHook(ctx context.Context, h *hooks.GlobalHook, bi pkg.MetricKeyActivation: logLabels[pkg.LogKeyEventType], } // usage metrics - gm.dc.MetricStorage.HistogramObserve("{PREFIX}global_hook_run_sys_cpu_seconds", hookResult.Usage.Sys.Seconds(), metricLabels, nil) - gm.dc.MetricStorage.HistogramObserve("{PREFIX}global_hook_run_user_cpu_seconds", hookResult.Usage.User.Seconds(), metricLabels, nil) - gm.dc.MetricStorage.GaugeSet("{PREFIX}global_hook_run_max_rss_bytes", float64(hookResult.Usage.MaxRss)*1024, metricLabels) + gm.dc.MetricStorage.HistogramObserve(metrics.GlobalHookRunSysCPUSeconds, hookResult.Usage.Sys.Seconds(), metricLabels, nil) + gm.dc.MetricStorage.HistogramObserve(metrics.GlobalHookRunUserCPUSeconds, hookResult.Usage.User.Seconds(), metricLabels, nil) + gm.dc.MetricStorage.GaugeSet(metrics.GlobalHookRunMaxRSSBytes, float64(hookResult.Usage.MaxRss)*1024, metricLabels) } if hookResult != nil && len(hookResult.Metrics) > 0 { diff --git 
a/pkg/module_manager/models/modules/global_test.go b/pkg/module_manager/models/modules/global_test.go index 81061c2e3..74729128e 100644 --- a/pkg/module_manager/models/modules/global_test.go +++ b/pkg/module_manager/models/modules/global_test.go @@ -31,7 +31,6 @@ exit 0 defer os.RemoveAll("/tmp/global") logger := log.NewLogger() storage := metricsstorage.NewMetricStorage( - metricsstorage.WithPrefix("addon_operator_"), metricsstorage.WithLogger(logger), ) gm, err := NewGlobalModule( diff --git a/pkg/module_manager/models/modules/helm.go b/pkg/module_manager/models/modules/helm.go index b02ac10ae..ea7e65a93 100644 --- a/pkg/module_manager/models/modules/helm.go +++ b/pkg/module_manager/models/modules/helm.go @@ -22,6 +22,7 @@ import ( "github.com/flant/addon-operator/pkg/helm" "github.com/flant/addon-operator/pkg/helm/client" "github.com/flant/addon-operator/pkg/helm/helm3lib" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/utils" "github.com/flant/kube-client/manifest" "github.com/flant/shell-operator/pkg/utils/measure" @@ -176,7 +177,7 @@ func (hm *HelmModule) RunHelmInstall(ctx context.Context, logLabels map[string]s } defer measure.Duration(func(d time.Duration) { - hm.dependencies.MetricsStorage.HistogramObserve("{PREFIX}module_helm_seconds", d.Seconds(), metricLabels, nil) + hm.dependencies.MetricsStorage.HistogramObserve(metrics.ModuleHelmSeconds, d.Seconds(), metricLabels, nil) })() logEntry := utils.EnrichLoggerWithLabels(hm.logger, logLabels) @@ -242,7 +243,7 @@ func (hm *HelmModule) RunHelmInstall(ctx context.Context, logLabels map[string]s } defer measure.Duration(func(d time.Duration) { - hm.dependencies.MetricsStorage.HistogramObserve("{PREFIX}helm_operation_seconds", d.Seconds(), metricLabels, nil) + hm.dependencies.MetricsStorage.HistogramObserve(metrics.HelmOperationSeconds, d.Seconds(), metricLabels, nil) })() renderedManifests, err = helmClient.Render( @@ -281,7 +282,7 @@ func (hm *HelmModule) 
RunHelmInstall(ctx context.Context, logLabels map[string]s "operation": "check-upgrade", } defer measure.Duration(func(d time.Duration) { - hm.dependencies.MetricsStorage.HistogramObserve("{PREFIX}helm_operation_seconds", d.Seconds(), metricLabels, nil) + hm.dependencies.MetricsStorage.HistogramObserve(metrics.HelmOperationSeconds, d.Seconds(), metricLabels, nil) })() runUpgradeRelease, err = hm.shouldRunHelmUpgrade(helmClient, helmReleaseName, checksum, manifests, logLabels) @@ -309,7 +310,7 @@ func (hm *HelmModule) RunHelmInstall(ctx context.Context, logLabels map[string]s } defer measure.Duration(func(d time.Duration) { - hm.dependencies.MetricsStorage.HistogramObserve("{PREFIX}helm_operation_seconds", d.Seconds(), metricLabels, nil) + hm.dependencies.MetricsStorage.HistogramObserve(metrics.HelmOperationSeconds, d.Seconds(), metricLabels, nil) })() err = helmClient.UpgradeRelease( diff --git a/pkg/module_manager/module_manager.go b/pkg/module_manager/module_manager.go index a365c4268..58f14e044 100644 --- a/pkg/module_manager/module_manager.go +++ b/pkg/module_manager/module_manager.go @@ -21,6 +21,7 @@ import ( "github.com/flant/addon-operator/pkg/helm_resources_manager" . 
"github.com/flant/addon-operator/pkg/hook/types" "github.com/flant/addon-operator/pkg/kube_config_manager/config" + "github.com/flant/addon-operator/pkg/metrics" environmentmanager "github.com/flant/addon-operator/pkg/module_manager/environment_manager" gohook "github.com/flant/addon-operator/pkg/module_manager/go_hook" "github.com/flant/addon-operator/pkg/module_manager/loader" @@ -48,12 +49,12 @@ import ( "github.com/flant/shell-operator/pkg/task/queue" ) -const ( +var ( moduleInfoMetricGroup = "mm_module_info" - moduleInfoMetricName = "{PREFIX}mm_module_info" + moduleInfoMetricName = metrics.ModuleInfoMetricName moduleMaintenanceMetricGroup = "mm_module_maintenance" - moduleMaintenanceMetricName = "{PREFIX}mm_module_maintenance" + moduleMaintenanceMetricName = metrics.ModuleMaintenanceMetricName moduleManagerServiceName = "module-manager" ) @@ -412,7 +413,7 @@ func (mm *ModuleManager) checkConfig() { } mm.kubeConfigLock.RLock() if !mm.kubeConfigValid || !mm.kubeConfigValuesValid { - mm.dependencies.MetricStorage.CounterAdd("{PREFIX}config_values_errors_total", 1.0, map[string]string{}) + mm.dependencies.MetricStorage.CounterAdd(metrics.ConfigValuesErrorsTotal, 1.0, map[string]string{}) } mm.kubeConfigLock.RUnlock() time.Sleep(5 * time.Second) diff --git a/pkg/module_manager/module_manager_hooks.go b/pkg/module_manager/module_manager_hooks.go index 7e675b76c..1d97de062 100644 --- a/pkg/module_manager/module_manager_hooks.go +++ b/pkg/module_manager/module_manager_hooks.go @@ -12,6 +12,7 @@ import ( "github.com/flant/addon-operator/pkg" "github.com/flant/addon-operator/pkg/app" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager/models/hooks" "github.com/flant/addon-operator/pkg/module_manager/models/modules" dynamic_extender "github.com/flant/addon-operator/pkg/module_manager/scheduler/extenders/dynamically_enabled" @@ -126,7 +127,7 @@ func (mm *ModuleManager) registerGlobalHooks(gm *modules.GlobalModule) error { 
hk.WithTmpDir(mm.TempDir) mm.dependencies.MetricStorage.GaugeSet( - "{PREFIX}binding_count", + metrics.BindingCount, float64(hk.GetHookConfig().BindingsCount()), map[string]string{ pkg.MetricKeyHook: hk.GetName(), @@ -154,7 +155,7 @@ func (mm *ModuleManager) RegisterModuleHooks(ml *modules.BasicModule, logLabels hk.WithTmpDir(mm.TempDir) mm.dependencies.MetricStorage.GaugeSet( - "{PREFIX}binding_count", + metrics.BindingCount, float64(hk.GetHookConfig().BindingsCount()), map[string]string{ "module": ml.GetName(), diff --git a/pkg/task/service/converge.go b/pkg/task/service/converge.go index 2cb649b48..3142b5a89 100644 --- a/pkg/task/service/converge.go +++ b/pkg/task/service/converge.go @@ -8,6 +8,7 @@ import ( "github.com/flant/addon-operator/pkg" "github.com/flant/addon-operator/pkg/addon-operator/converge" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/task" sh_task "github.com/flant/shell-operator/pkg/task" "github.com/flant/shell-operator/pkg/task/queue" @@ -61,14 +62,14 @@ func (s *TaskHandlerService) recordConvergenceMetrics(durationSeconds float64) { // Record the time taken for convergence s.metricStorage.CounterAdd( - "{PREFIX}convergence_seconds", + metrics.ConvergenceSeconds, durationSeconds, metricLabels, ) // Increment the total convergence operations counter s.metricStorage.CounterAdd( - "{PREFIX}convergence_total", + metrics.ConvergenceTotal, 1.0, metricLabels, ) diff --git a/pkg/task/service/metric.go b/pkg/task/service/metric.go index a20ea10ba..a7fa6c598 100644 --- a/pkg/task/service/metric.go +++ b/pkg/task/service/metric.go @@ -4,6 +4,7 @@ import ( "time" "github.com/flant/addon-operator/pkg" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/task" sh_task "github.com/flant/shell-operator/pkg/task" ) @@ -49,5 +50,5 @@ func (s *TaskHandlerService) UpdateWaitInQueueMetric(t sh_task.Task) { } taskWaitTime := time.Since(t.GetQueuedAt()).Seconds() - 
s.metricStorage.CounterAdd("{PREFIX}task_wait_in_queue_seconds_total", taskWaitTime, metricLabels) + s.metricStorage.CounterAdd(metrics.TaskWaitInQueueSecondsTotal, taskWaitTime, metricLabels) } diff --git a/pkg/task/tasks/apply-kube-config-values/task.go b/pkg/task/tasks/apply-kube-config-values/task.go index 0218565f2..8711fbcf8 100644 --- a/pkg/task/tasks/apply-kube-config-values/task.go +++ b/pkg/task/tasks/apply-kube-config-values/task.go @@ -11,6 +11,7 @@ import ( "github.com/flant/addon-operator/pkg/kube_config_manager" "github.com/flant/addon-operator/pkg/kube_config_manager/config" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager" "github.com/flant/addon-operator/pkg/task" sh_task "github.com/flant/shell-operator/pkg/task" @@ -88,7 +89,7 @@ func (s *Task) Handle(ctx context.Context) queue.TaskResult { slog.Int("count", s.shellTask.GetFailureCount()+1), log.Err(handleErr)) - s.metricStorage.CounterAdd("{PREFIX}modules_discover_errors_total", 1.0, map[string]string{}) + s.metricStorage.CounterAdd(metrics.ModulesDiscoverErrorsTotal, 1.0, map[string]string{}) s.shellTask.UpdateFailureMessage(handleErr.Error()) s.shellTask.WithQueuedAt(time.Now()) diff --git a/pkg/task/tasks/converge-modules/task.go b/pkg/task/tasks/converge-modules/task.go index e1d58da18..7f6480fc9 100644 --- a/pkg/task/tasks/converge-modules/task.go +++ b/pkg/task/tasks/converge-modules/task.go @@ -14,6 +14,7 @@ import ( "github.com/flant/addon-operator/pkg" "github.com/flant/addon-operator/pkg/addon-operator/converge" hookTypes "github.com/flant/addon-operator/pkg/hook/types" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager" "github.com/flant/addon-operator/pkg/module_manager/models/modules/events" "github.com/flant/addon-operator/pkg/task" @@ -237,7 +238,7 @@ func (s *Task) Handle(ctx context.Context) queue.TaskResult { slog.String("phase", string(s.convergeState.GetPhase())), 
slog.Int("count", s.shellTask.GetFailureCount()+1), log.Err(handleErr)) - s.metricStorage.CounterAdd("{PREFIX}modules_discover_errors_total", 1.0, map[string]string{}) + s.metricStorage.CounterAdd(metrics.ModulesDiscoverErrorsTotal, 1.0, map[string]string{}) s.shellTask.UpdateFailureMessage(handleErr.Error()) s.shellTask.WithQueuedAt(time.Now()) return res diff --git a/pkg/task/tasks/global-hook-enable-kubernetes-bindings/task.go b/pkg/task/tasks/global-hook-enable-kubernetes-bindings/task.go index c755ba708..1ed895485 100644 --- a/pkg/task/tasks/global-hook-enable-kubernetes-bindings/task.go +++ b/pkg/task/tasks/global-hook-enable-kubernetes-bindings/task.go @@ -13,6 +13,7 @@ import ( "github.com/flant/addon-operator/pkg" "github.com/flant/addon-operator/pkg/addon-operator/converge" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager" "github.com/flant/addon-operator/pkg/module_manager/models/hooks" "github.com/flant/addon-operator/pkg/task" @@ -140,7 +141,7 @@ func (s *Task) Handle(ctx context.Context) queue.TaskResult { if err != nil { hookLabel := path.Base(globalHook.GetPath()) // TODO use separate metric, as in shell-operator? 
- s.metricStorage.CounterAdd("{PREFIX}global_hook_errors_total", 1.0, map[string]string{ + s.metricStorage.CounterAdd(metrics.GlobalHookErrorsTotal, 1.0, map[string]string{ pkg.MetricKeyHook: hookLabel, pkg.MetricKeyBinding: "GlobalEnableKubernetesBindings", pkg.MetricKeyQueue: s.shellTask.GetQueueName(), diff --git a/pkg/task/tasks/global-hook-run/task.go b/pkg/task/tasks/global-hook-run/task.go index 0e5ae694d..1894e60e3 100644 --- a/pkg/task/tasks/global-hook-run/task.go +++ b/pkg/task/tasks/global-hook-run/task.go @@ -15,6 +15,7 @@ import ( "github.com/flant/addon-operator/pkg/helm/helm3lib" "github.com/flant/addon-operator/pkg/helm_resources_manager" hookTypes "github.com/flant/addon-operator/pkg/hook/types" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager" "github.com/flant/addon-operator/pkg/task" "github.com/flant/addon-operator/pkg/task/helpers" @@ -102,7 +103,7 @@ func (s *Task) Handle(ctx context.Context) queue.TaskResult { } defer measure.Duration(func(d time.Duration) { - s.metricStorage.HistogramObserve("{PREFIX}global_hook_run_seconds", d.Seconds(), metricLabels, nil) + s.metricStorage.HistogramObserve(metrics.GlobalHookRunSeconds, d.Seconds(), metricLabels, nil) })() isSynchronization := hm.IsSynchronization() @@ -299,9 +300,9 @@ func (s *Task) Handle(ctx context.Context) queue.TaskResult { //} } - s.metricStorage.CounterAdd("{PREFIX}global_hook_allowed_errors_total", allowed, metricLabels) - s.metricStorage.CounterAdd("{PREFIX}global_hook_errors_total", errors, metricLabels) - s.metricStorage.CounterAdd("{PREFIX}global_hook_success_total", success, metricLabels) + s.metricStorage.CounterAdd(metrics.GlobalHookAllowedErrorsTotal, allowed, metricLabels) + s.metricStorage.CounterAdd(metrics.GlobalHookErrorsTotal, errors, metricLabels) + s.metricStorage.CounterAdd(metrics.GlobalHookSuccessTotal, success, metricLabels) } if isSynchronization && res.Status == queue.Success { diff --git 
a/pkg/task/tasks/module-delete/task.go b/pkg/task/tasks/module-delete/task.go index 066d7f8d9..087dc62d7 100644 --- a/pkg/task/tasks/module-delete/task.go +++ b/pkg/task/tasks/module-delete/task.go @@ -9,6 +9,7 @@ import ( metricsstorage "github.com/deckhouse/deckhouse/pkg/metrics-storage" "go.opentelemetry.io/otel" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager" "github.com/flant/addon-operator/pkg/task" taskqueue "github.com/flant/addon-operator/pkg/task/queue" @@ -96,7 +97,7 @@ func (s *Task) Handle(ctx context.Context) queue.TaskResult { s.moduleManager.UpdateModuleLastErrorAndNotify(baseModule, err) if err != nil { - s.metricStorage.CounterAdd("{PREFIX}module_delete_errors_total", 1.0, map[string]string{"module": hm.ModuleName}) + s.metricStorage.CounterAdd(metrics.ModuleDeleteErrorsTotal, 1.0, map[string]string{"module": hm.ModuleName}) s.logger.Error("Module delete failed, requeue task to retry after delay.", slog.Int("count", s.shellTask.GetFailureCount()+1), diff --git a/pkg/task/tasks/module-hook-run/task.go b/pkg/task/tasks/module-hook-run/task.go index 93a874af4..1ae8e9773 100644 --- a/pkg/task/tasks/module-hook-run/task.go +++ b/pkg/task/tasks/module-hook-run/task.go @@ -12,6 +12,7 @@ import ( "github.com/flant/addon-operator/pkg" "github.com/flant/addon-operator/pkg/addon-operator/converge" "github.com/flant/addon-operator/pkg/helm_resources_manager" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager" "github.com/flant/addon-operator/pkg/task" "github.com/flant/addon-operator/pkg/task/helpers" @@ -123,7 +124,7 @@ func (s *Task) Handle(ctx context.Context) queue.TaskResult { } defer measure.Duration(func(d time.Duration) { - s.metricStorage.HistogramObserve("{PREFIX}module_hook_run_seconds", d.Seconds(), metricLabels, nil) + s.metricStorage.HistogramObserve(metrics.ModuleHookRunSeconds, d.Seconds(), metricLabels, nil) })() shouldRunHook := true @@ 
-290,9 +291,9 @@ func (s *Task) Handle(ctx context.Context) queue.TaskResult { } } - s.metricStorage.CounterAdd("{PREFIX}module_hook_allowed_errors_total", allowed, metricLabels) - s.metricStorage.CounterAdd("{PREFIX}module_hook_errors_total", errors, metricLabels) - s.metricStorage.CounterAdd("{PREFIX}module_hook_success_total", success, metricLabels) + s.metricStorage.CounterAdd(metrics.ModuleHookAllowedErrorsTotal, allowed, metricLabels) + s.metricStorage.CounterAdd(metrics.ModuleHookErrorsTotal, errors, metricLabels) + s.metricStorage.CounterAdd(metrics.ModuleHookSuccessTotal, success, metricLabels) } if isSynchronization && res.Status == queue.Success { diff --git a/pkg/task/tasks/module-run/task.go b/pkg/task/tasks/module-run/task.go index 21f9adecd..25a220ffd 100644 --- a/pkg/task/tasks/module-run/task.go +++ b/pkg/task/tasks/module-run/task.go @@ -15,6 +15,7 @@ import ( "github.com/flant/addon-operator/pkg" "github.com/flant/addon-operator/pkg/addon-operator/converge" "github.com/flant/addon-operator/pkg/app" + "github.com/flant/addon-operator/pkg/metrics" "github.com/flant/addon-operator/pkg/module_manager" "github.com/flant/addon-operator/pkg/module_manager/models/hooks" "github.com/flant/addon-operator/pkg/module_manager/models/modules" @@ -124,7 +125,7 @@ func (s *Task) Handle(ctx context.Context) (res queue.TaskResult) { //nolint:non } defer measure.Duration(func(d time.Duration) { - s.metricStorage.HistogramObserve("{PREFIX}module_run_seconds", d.Seconds(), metricLabels, nil) + s.metricStorage.HistogramObserve(metrics.ModuleRunSeconds, d.Seconds(), metricLabels, nil) })() var moduleRunErr error @@ -140,7 +141,7 @@ func (s *Task) Handle(ctx context.Context) (res queue.TaskResult) { //nolint:non slog.Int("count", s.shellTask.GetFailureCount()+1), log.Err(moduleRunErr)) - s.metricStorage.CounterAdd("{PREFIX}module_run_errors_total", 1.0, map[string]string{"module": hm.ModuleName}) + s.metricStorage.CounterAdd(metrics.ModuleRunErrorsTotal, 1.0, 
map[string]string{"module": hm.ModuleName}) s.shellTask.UpdateFailureMessage(moduleRunErr.Error()) s.shellTask.WithQueuedAt(time.Now()) }