From 77ca094b4d99ddc7d01211c76f7a5e813c902a6b Mon Sep 17 00:00:00 2001 From: theTibi Date: Wed, 15 Oct 2025 13:46:02 +0200 Subject: [PATCH 1/5] PMM-14404 - Enhance PBM backup metrics collection by including cluster node information - Added retrieval of current node role and cluster status. - Updated metrics to include node-specific details such as role and host for each backup. - Improved error handling for node info and cluster status retrieval. --- exporter/pbm_collector.go | 95 ++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/exporter/pbm_collector.go b/exporter/pbm_collector.go index e4f26c671..9ca24ec14 100644 --- a/exporter/pbm_collector.go +++ b/exporter/pbm_collector.go @@ -175,6 +175,18 @@ func (p *pbmCollector) pbmAgentMetrics(ctx context.Context, pbmClient *sdk.Clien } func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Client, l *slog.Logger) []prometheus.Metric { + currentNode, err := util.MyRole(ctx, p.base.client) + if err != nil { + l.Error("failed to get current node info", "error", err.Error()) + return nil + } + + clusterStatus, err := cli.ClusterStatus(ctx, pbmClient, cli.RSConfGetter(p.mongoURI)) + if err != nil { + l.Error("failed to get cluster status", "error", err.Error()) + return nil + } + backupsList, err := pbmClient.GetAllBackups(ctx) if err != nil { l.Error("failed to get PBM backup list", "error", err.Error()) @@ -184,42 +196,55 @@ func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Cli metrics := make([]prometheus.Metric, 0, len(backupsList)) for _, backup := range backupsList { - metrics = append(metrics, createPBMMetric("backup_size_bytes", - "Size of PBM backup", - float64(backup.Size), map[string]string{ - "opid": backup.OPID, - "status": string(backup.Status), - "name": backup.Name, - }), - ) - - // Add backup_last_transition_ts metric - metrics = append(metrics, createPBMMetric("backup_last_transition_ts", - "Last transition timestamp of PBM backup (seconds since epoch)", - float64(backup.LastTransitionTS), map[string]string{ - "opid": backup.OPID, - "status": string(backup.Status), - "name": backup.Name, - }), - ) - - var endTime int64 - switch pbmAgentStatus(backup.Status) { - case statusDone, statusCancelled, statusError, statusDown: - endTime = backup.LastTransitionTS - default: - endTime = time.Now().Unix() + // For each backup, iterate through all nodes in the cluster + for replsetName, nodes := range clusterStatus { + for _, node := range nodes { + // Determine role + role := string(node.Role) + + // Determine if this is the current node + self := "0" + if node.Host == currentNode.Me { + self = "1" + } + + baseLabels := map[string]string{ + "opid": backup.OPID, + "status": string(backup.Status), + "name": backup.Name, + "type": string(backup.Type), + "host": node.Host, + "replica_set": replsetName, + "role": role, + "self": self, + } + + metrics = append(metrics, createPBMMetric("backup_size_bytes", + "Size of PBM backup", + float64(backup.Size), baseLabels), + ) + + // Add backup_last_transition_ts metric + metrics = append(metrics, createPBMMetric("backup_last_transition_ts", + "Last transition timestamp of PBM backup (seconds since epoch)", + float64(backup.LastTransitionTS), baseLabels), + ) + + var endTime int64 + switch pbmAgentStatus(backup.Status) { + case statusDone, statusCancelled, statusError, statusDown: + endTime = backup.LastTransitionTS + default: + endTime = time.Now().Unix() + } + + duration := time.Unix(endTime-backup.StartTS, 0).Unix() + metrics = append(metrics, createPBMMetric("backup_duration_seconds", + "Duration of PBM backup", + float64(duration), baseLabels), + ) + } } - - duration := time.Unix(endTime-backup.StartTS, 0).Unix() - metrics = append(metrics, createPBMMetric("backup_duration_seconds", - "Duration of PBM backup", - float64(duration), map[string]string{ - "opid": backup.OPID, - "status": string(backup.Status), - "name": backup.Name, - }), - ) } return metrics } From 1b81c71d862b4352f3114cfc3f6473c94a04030f Mon Sep 17 00:00:00 2001 From: theTibi Date: Wed, 15 Oct 2025 16:00:02 +0200 Subject: [PATCH 2/5] Refactor PBM backup metrics collection to streamline node information retrieval - Removed unnecessary cluster status retrieval and integrated replset iteration directly from backup metadata. - Updated metrics to focus on node-specific details, enhancing clarity and performance. - Improved overall structure for better maintainability. --- exporter/pbm_collector.go | 93 +++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 53 deletions(-) diff --git a/exporter/pbm_collector.go b/exporter/pbm_collector.go index 9ca24ec14..b41b29b7d 100644 --- a/exporter/pbm_collector.go +++ b/exporter/pbm_collector.go @@ -181,12 +181,6 @@ func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Cli return nil } - clusterStatus, err := cli.ClusterStatus(ctx, pbmClient, cli.RSConfGetter(p.mongoURI)) - if err != nil { - l.Error("failed to get cluster status", "error", err.Error()) - return nil - } - backupsList, err := pbmClient.GetAllBackups(ctx) if err != nil { l.Error("failed to get PBM backup list", "error", err.Error()) @@ -196,54 +190,47 @@ func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Cli metrics := make([]prometheus.Metric, 0, len(backupsList)) for _, backup := range backupsList { - // For each backup, iterate through all nodes in the cluster - for replsetName, nodes := range clusterStatus { - for _, node := range nodes { - // Determine role - role := string(node.Role) - - // Determine if this is the current node - self := "0" - if node.Host == currentNode.Me { - self = "1" - } - - baseLabels := map[string]string{ - "opid": backup.OPID, - "status": string(backup.Status), - "name": backup.Name, - "type": string(backup.Type), - "host": node.Host, - "replica_set": replsetName, - "role": role, - "self": self, - } - - metrics = append(metrics, createPBMMetric("backup_size_bytes", - "Size of PBM backup", - float64(backup.Size), baseLabels), - ) - - // Add backup_last_transition_ts metric - metrics = append(metrics, createPBMMetric("backup_last_transition_ts", - "Last transition timestamp of PBM backup (seconds since epoch)", - float64(backup.LastTransitionTS), baseLabels), - ) - - var endTime int64 - switch pbmAgentStatus(backup.Status) { - case statusDone, statusCancelled, statusError, statusDown: - endTime = backup.LastTransitionTS - default: - endTime = time.Now().Unix() - } - - duration := time.Unix(endTime-backup.StartTS, 0).Unix() - metrics = append(metrics, createPBMMetric("backup_duration_seconds", - "Duration of PBM backup", - float64(duration), baseLabels), - ) + // Iterate through replsets in the backup metadata + for _, replset := range backup.Replsets { + // Determine if this is the current node + self := "0" + if replset.Node == currentNode.Me { + self = "1" + } + + labels := map[string]string{ + "opid": backup.OPID, + "status": string(backup.Status), + "name": backup.Name, + "host": replset.Node, + "replica_set": replset.Name, + "self": self, } + + metrics = append(metrics, createPBMMetric("backup_size_bytes", + "Size of PBM backup", + float64(backup.Size), labels), + ) + + // Add backup_last_transition_ts metric + metrics = append(metrics, createPBMMetric("backup_last_transition_ts", + "Last transition timestamp of PBM backup (seconds since epoch)", + float64(backup.LastTransitionTS), labels), + ) + + var endTime int64 + switch pbmAgentStatus(backup.Status) { + case statusDone, statusCancelled, statusError, statusDown: + endTime = backup.LastTransitionTS + default: + endTime = time.Now().Unix() + } + + duration := time.Unix(endTime-backup.StartTS, 0).Unix() + metrics = append(metrics, createPBMMetric("backup_duration_seconds", + "Duration of PBM backup", + float64(duration), labels), + ) } } return metrics From 5642f8421d9b261c24d00f40cb14e2c8f1d2d297 Mon Sep 17 00:00:00 2001 From: theTibi Date: Wed, 15 Oct 2025 20:44:23 +0200 Subject: [PATCH 3/5] Enhance PBM backup metrics collection with detailed logging and improved metric creation - Added logging for the creation of metrics and labels to provide better visibility during the backup metrics collection process. - Updated the `createPBMMetric` function to include logging of the metric name being created. - Enhanced the iteration over backups and replsets with detailed logs for each processing step, including current node checks and metric additions. - Improved overall structure for clarity and maintainability. --- exporter/pbm_collector.go | 73 ++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/exporter/pbm_collector.go b/exporter/pbm_collector.go index b41b29b7d..7deb6b15c 100644 --- a/exporter/pbm_collector.go +++ b/exporter/pbm_collector.go @@ -51,8 +51,17 @@ const ( func createPBMMetric(name, help string, value float64, labels map[string]string) prometheus.Metric { //nolint:ireturn const prefix = "mongodb_pbm_" - d := prometheus.NewDesc(prefix+name, help, nil, labels) - return prometheus.MustNewConstMetric(d, prometheus.GaugeValue, value) + + // Log what we're creating + fullName := prefix + name + + // Create the descriptor with labels as constant labels (4th parameter) + d := prometheus.NewDesc(fullName, help, nil, labels) + + // Create the metric + metric := prometheus.MustNewConstMetric(d, prometheus.GaugeValue, value) + + return metric } func newPbmCollector(ctx context.Context, client *mongo.Client, mongoURI string, logger *slog.Logger) *pbmCollector { @@ -175,27 +184,49 @@ func (p *pbmCollector) pbmAgentMetrics(ctx context.Context, pbmClient *sdk.Clien } func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Client, l *slog.Logger) []prometheus.Metric { + l.Info("===== Starting PBM backups metrics collection =====") + currentNode, err := util.MyRole(ctx, p.base.client) if err != nil { l.Error("failed to get current node info", "error", err.Error()) return nil } + l.Info("Current node info", "me", currentNode.Me, "is_master", currentNode.IsMaster, "is_secondary", currentNode.Secondary, "primary", currentNode.Primary) backupsList, err := pbmClient.GetAllBackups(ctx) if err != nil { l.Error("failed to get PBM backup list", "error", err.Error()) return nil } + l.Info("Retrieved backups list", "total_backups", len(backupsList)) metrics := make([]prometheus.Metric, 0, len(backupsList)) - for _, backup := range backupsList { + for idx, backup := range backupsList { + l.Info("===== Processing backup =====", + "backup_index", idx, + "name", backup.Name, + "opid", backup.OPID, + "status", backup.Status, + "size", backup.Size, + "replsets_count", len(backup.Replsets)) + // Iterate through replsets in the backup metadata - for _, replset := range backup.Replsets { + for i, replset := range backup.Replsets { + l.Info("--- Processing replset ---", + "replset_index", i, + "replset_name", replset.Name, + "replset_node", replset.Node) + // Determine if this is the current node self := "0" if replset.Node == currentNode.Me { + l.Info("This replset node matches current node", "node", replset.Node) self = "1" + } else { + l.Info("This replset node does NOT match current node", + "replset_node", replset.Node, + "current_node", currentNode.Me) } labels := map[string]string{ @@ -207,16 +238,28 @@ func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Cli "self": self, } - metrics = append(metrics, createPBMMetric("backup_size_bytes", + l.Info("Created labels map for backup metrics") + l.Info(" Label: opid", "value", labels["opid"]) + l.Info(" Label: status", "value", labels["status"]) + l.Info(" Label: name", "value", labels["name"]) + l.Info(" Label: host", "value", labels["host"]) + l.Info(" Label: replica_set", "value", labels["replica_set"]) + l.Info(" Label: self", "value", labels["self"]) + + l.Info("Creating backup_size_bytes metric", "size", backup.Size) + metric1 := createPBMMetric("backup_size_bytes", "Size of PBM backup", - float64(backup.Size), labels), - ) + float64(backup.Size), labels) + metrics = append(metrics, metric1) + l.Info("Added backup_size_bytes metric", "total_metrics", len(metrics)) // Add backup_last_transition_ts metric - metrics = append(metrics, createPBMMetric("backup_last_transition_ts", + l.Info("Creating backup_last_transition_ts metric", "timestamp", backup.LastTransitionTS) + metric2 := createPBMMetric("backup_last_transition_ts", "Last transition timestamp of PBM backup (seconds since epoch)", - float64(backup.LastTransitionTS), labels), - ) + float64(backup.LastTransitionTS), labels) + metrics = append(metrics, metric2) + l.Info("Added backup_last_transition_ts metric", "total_metrics", len(metrics)) var endTime int64 switch pbmAgentStatus(backup.Status) { @@ -227,12 +270,16 @@ func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Cli } duration := time.Unix(endTime-backup.StartTS, 0).Unix() - metrics = append(metrics, createPBMMetric("backup_duration_seconds", + l.Info("Creating backup_duration_seconds metric", "duration", duration, "start_ts", backup.StartTS, "end_time", endTime) + metric3 := createPBMMetric("backup_duration_seconds", "Duration of PBM backup", - float64(duration), labels), - ) + float64(duration), labels) + metrics = append(metrics, metric3) + l.Info("Added backup_duration_seconds metric", "total_metrics", len(metrics)) } } + + l.Info("===== Finished PBM backups metrics collection =====", "total_metrics_created", len(metrics)) return metrics } From 708bfa8abeb1898544b8fda425d79792514c680c Mon Sep 17 00:00:00 2001 From: theTibi Date: Thu, 16 Oct 2025 11:44:34 +0200 Subject: [PATCH 4/5] Revert "Enhance PBM backup metrics collection with detailed logging and improved metric creation" This reverts commit 5642f8421d9b261c24d00f40cb14e2c8f1d2d297. --- exporter/pbm_collector.go | 73 +++++++-------------------------------- 1 file changed, 13 insertions(+), 60 deletions(-) diff --git a/exporter/pbm_collector.go b/exporter/pbm_collector.go index 7deb6b15c..b41b29b7d 100644 --- a/exporter/pbm_collector.go +++ b/exporter/pbm_collector.go @@ -51,17 +51,8 @@ const ( func createPBMMetric(name, help string, value float64, labels map[string]string) prometheus.Metric { //nolint:ireturn const prefix = "mongodb_pbm_" - - // Log what we're creating - fullName := prefix + name - - // Create the descriptor with labels as constant labels (4th parameter) - d := prometheus.NewDesc(fullName, help, nil, labels) - - // Create the metric - metric := prometheus.MustNewConstMetric(d, prometheus.GaugeValue, value) - - return metric + d := prometheus.NewDesc(prefix+name, help, nil, labels) + return prometheus.MustNewConstMetric(d, prometheus.GaugeValue, value) } func newPbmCollector(ctx context.Context, client *mongo.Client, mongoURI string, logger *slog.Logger) *pbmCollector { @@ -184,49 +175,27 @@ func (p *pbmCollector) pbmAgentMetrics(ctx context.Context, pbmClient *sdk.Clien } func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Client, l *slog.Logger) []prometheus.Metric { - l.Info("===== Starting PBM backups metrics collection =====") - currentNode, err := util.MyRole(ctx, p.base.client) if err != nil { l.Error("failed to get current node info", "error", err.Error()) return nil } - l.Info("Current node info", "me", currentNode.Me, "is_master", currentNode.IsMaster, "is_secondary", currentNode.Secondary, "primary", currentNode.Primary) backupsList, err := pbmClient.GetAllBackups(ctx) if err != nil { l.Error("failed to get PBM backup list", "error", err.Error()) return nil } - l.Info("Retrieved backups list", "total_backups", len(backupsList)) metrics := make([]prometheus.Metric, 0, len(backupsList)) - for idx, backup := range backupsList { - l.Info("===== Processing backup =====", - "backup_index", idx, - "name", backup.Name, - "opid", backup.OPID, - "status", backup.Status, - "size", backup.Size, - "replsets_count", len(backup.Replsets)) - + for _, backup := range backupsList { // Iterate through replsets in the backup metadata - for i, replset := range backup.Replsets { - l.Info("--- Processing replset ---", - "replset_index", i, - "replset_name", replset.Name, - "replset_node", replset.Node) - + for _, replset := range backup.Replsets { // Determine if this is the current node self := "0" if replset.Node == currentNode.Me { - l.Info("This replset node matches current node", "node", replset.Node) self = "1" - } else { - l.Info("This replset node does NOT match current node", - "replset_node", replset.Node, - "current_node", currentNode.Me) } labels := map[string]string{ @@ -238,28 +207,16 @@ func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Cli "self": self, } - l.Info("Created labels map for backup metrics") - l.Info(" Label: opid", "value", labels["opid"]) - l.Info(" Label: status", "value", labels["status"]) - l.Info(" Label: name", "value", labels["name"]) - l.Info(" Label: host", "value", labels["host"]) - l.Info(" Label: replica_set", "value", labels["replica_set"]) - l.Info(" Label: self", "value", labels["self"]) - - l.Info("Creating backup_size_bytes metric", "size", backup.Size) - metric1 := createPBMMetric("backup_size_bytes", + metrics = append(metrics, createPBMMetric("backup_size_bytes", "Size of PBM backup", - float64(backup.Size), labels) - metrics = append(metrics, metric1) - l.Info("Added backup_size_bytes metric", "total_metrics", len(metrics)) + float64(backup.Size), labels), + ) // Add backup_last_transition_ts metric - l.Info("Creating backup_last_transition_ts metric", "timestamp", backup.LastTransitionTS) - metric2 := createPBMMetric("backup_last_transition_ts", + metrics = append(metrics, createPBMMetric("backup_last_transition_ts", "Last transition timestamp of PBM backup (seconds since epoch)", - float64(backup.LastTransitionTS), labels) - metrics = append(metrics, metric2) - l.Info("Added backup_last_transition_ts metric", "total_metrics", len(metrics)) + float64(backup.LastTransitionTS), labels), + ) var endTime int64 switch pbmAgentStatus(backup.Status) { @@ -270,16 +227,12 @@ func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Cli } duration := time.Unix(endTime-backup.StartTS, 0).Unix() - l.Info("Creating backup_duration_seconds metric", "duration", duration, "start_ts", backup.StartTS, "end_time", endTime) - metric3 := createPBMMetric("backup_duration_seconds", + metrics = append(metrics, createPBMMetric("backup_duration_seconds", "Duration of PBM backup", - float64(duration), labels) - metrics = append(metrics, metric3) - l.Info("Added backup_duration_seconds metric", "total_metrics", len(metrics)) + float64(duration), labels), + ) } } - - l.Info("===== Finished PBM backups metrics collection =====", "total_metrics_created", len(metrics)) return metrics } From 5574603bf953f625e725b0a94fe9652224a394ae Mon Sep 17 00:00:00 2001 From: theTibi Date: Thu, 16 Oct 2025 11:56:44 +0200 Subject: [PATCH 5/5] Refactor PBM metrics collection to utilize current node information - Updated `pbmBackupsMetrics` and `pbmAgentMetrics` functions to accept `currentNode` as a parameter, reducing redundant calls to retrieve node information. - Enhanced error handling for node info retrieval, ensuring metrics are only collected if the current node information is successfully obtained. - Improved overall structure for clarity and maintainability. --- exporter/pbm_collector.go | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/exporter/pbm_collector.go b/exporter/pbm_collector.go index b41b29b7d..d6a195d88 100644 --- a/exporter/pbm_collector.go +++ b/exporter/pbm_collector.go @@ -26,6 +26,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "go.mongodb.org/mongo-driver/mongo" + "github.com/percona/mongodb_exporter/internal/proto" "github.com/percona/mongodb_exporter/internal/util" ) @@ -113,8 +114,14 @@ func (p *pbmCollector) collect(ch chan<- prometheus.Metric) { "PBM PITR backups are enabled for the cluster", float64(pitrEnabledMetric), nil)) - metrics = append(metrics, p.pbmBackupsMetrics(p.ctx, pbmClient, logger)...) - metrics = append(metrics, p.pbmAgentMetrics(p.ctx, pbmClient, logger)...) + // Get current node info once for both agent and backup metrics + currentNode, err := util.MyRole(p.ctx, p.base.client) + if err != nil { + logger.Error("failed to get current node info", "error", err.Error()) + } else { + metrics = append(metrics, p.pbmBackupsMetrics(p.ctx, pbmClient, logger, currentNode)...) + metrics = append(metrics, p.pbmAgentMetrics(p.ctx, pbmClient, logger, currentNode)...) + } } metrics = append(metrics, createPBMMetric("cluster_backup_configured", @@ -126,13 +133,7 @@ func (p *pbmCollector) collect(ch chan<- prometheus.Metric) { } } -func (p *pbmCollector) pbmAgentMetrics(ctx context.Context, pbmClient *sdk.Client, l *slog.Logger) []prometheus.Metric { - currentNode, err := util.MyRole(ctx, p.base.client) - if err != nil { - l.Error("failed to get current node info", "error", err.Error()) - return nil - } - +func (p *pbmCollector) pbmAgentMetrics(ctx context.Context, pbmClient *sdk.Client, l *slog.Logger, currentNode *proto.HelloResponse) []prometheus.Metric { clusterStatus, err := cli.ClusterStatus(ctx, pbmClient, cli.RSConfGetter(p.mongoURI)) if err != nil { l.Error("failed to get cluster status", "error", err.Error()) @@ -174,13 +175,7 @@ func (p *pbmCollector) pbmAgentMetrics(ctx context.Context, pbmClient *sdk.Clien return metrics } -func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Client, l *slog.Logger) []prometheus.Metric { - currentNode, err := util.MyRole(ctx, p.base.client) - if err != nil { - l.Error("failed to get current node info", "error", err.Error()) - return nil - } - +func (p *pbmCollector) pbmBackupsMetrics(ctx context.Context, pbmClient *sdk.Client, l *slog.Logger, currentNode *proto.HelloResponse) []prometheus.Metric { backupsList, err := pbmClient.GetAllBackups(ctx) if err != nil { l.Error("failed to get PBM backup list", "error", err.Error())