diff --git a/sdk/testing/server/metrics.go b/sdk/testing/server/metrics.go index 41f06a66996..af0d2823ea1 100644 --- a/sdk/testing/server/metrics.go +++ b/sdk/testing/server/metrics.go @@ -28,7 +28,6 @@ import ( "syscall" "time" - "github.com/stretchr/testify/require" gopkgyaml "gopkg.in/yaml.v3" "k8s.io/apimachinery/pkg/util/wait" @@ -66,6 +65,13 @@ func scrapeMetricsForServer(t TestingT, srv RunningServer) { t.Logf("PROMETHEUS_URL environment variable unset, skipping Prometheus scrape config generation") return } + + caFile := filepath.Join(srv.CADirectory(), "apiserver.crt") + if _, err := os.Stat(caFile); os.IsNotExist(err) { + t.Logf("CA file %s does not exist, skipping Prometheus scrape config for server %s", caFile, srv.Name()) + return + } + jobName := fmt.Sprintf("kcp-%s-%s", srv.Name(), t.Name()) labels := map[string]string{ "server": srv.Name(), @@ -75,8 +81,23 @@ func scrapeMetricsForServer(t TestingT, srv RunningServer) { ctx, cancel := context.WithTimeout(context.Background(), wait.ForeverTestTimeout) defer cancel() repoDir, err := kcptestinghelpers.RepositoryDir() - require.NoError(t, err) - require.NoError(t, ScrapeMetrics(ctx, srv.RootShardSystemMasterBaseConfig(t), promUrl, repoDir, jobName, filepath.Join(srv.CADirectory(), "apiserver.crt"), labels)) + if err != nil { + t.Logf("error getting repository directory for server %s: %v", srv.Name(), err) + return + } + + if err := ScrapeMetrics(ctx, srv.RootShardSystemMasterBaseConfig(t), promUrl, repoDir, jobName, caFile, labels); err != nil { + t.Logf("error configuring Prometheus scraping for server %s: %v", srv.Name(), err) + } + + // Clean up Prometheus configuration when test finishes + t.Cleanup(func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := CleanupScrapeMetrics(cleanupCtx, promUrl, repoDir, jobName); err != nil { + t.Logf("error cleaning up Prometheus scrape config for server %s: %v", srv.Name(), err) + } + }) } func ScrapeMetrics(ctx context.Context, cfg *rest.Config, promUrl, promCfgDir, jobName, caFile string, labels map[string]string) error { @@ -162,3 +183,92 @@ func ScrapeMetrics(ctx context.Context, cfg *rest.Config, promUrl, promCfgDir, j resp.Body.Close() return nil } + +func CleanupScrapeMetrics(ctx context.Context, promUrl, promCfgDir, jobNamePrefix string) error { + type staticConfigs struct { + Targets []string `yaml:"targets,omitempty"` + Labels map[string]string `yaml:"labels,omitempty"` + } + type tlsConfig struct { + InsecureSkipVerify bool `yaml:"insecure_skip_verify,omitempty"` + CaFile string `yaml:"ca_file,omitempty"` + } + type scrapeConfig struct { + JobName string `yaml:"job_name,omitempty"` + ScrapeInterval string `yaml:"scrape_interval,omitempty"` + BearerToken string `yaml:"bearer_token,omitempty"` + TlsConfig tlsConfig `yaml:"tls_config,omitempty"` + Scheme string `yaml:"scheme,omitempty"` + StaticConfigs []staticConfigs `yaml:"static_configs,omitempty"` + } + type config struct { + ScrapeConfigs []scrapeConfig `yaml:"scrape_configs,omitempty"` + } + + err := func() error { + scrapeConfigFile := filepath.Join(promCfgDir, ".prometheus-config.yaml") + f, err := os.OpenFile(scrapeConfigFile, os.O_RDWR, 0o644) + if os.IsNotExist(err) { + return nil // Nothing to clean up + } + if err != nil { + return err + } + defer f.Close() + + // lock config file exclusively + err = syscall.Flock(int(f.Fd()), syscall.LOCK_EX) + if err != nil { + return err + } + defer func() { + _ = syscall.Flock(int(f.Fd()), syscall.LOCK_UN) + }() + + promCfg := config{} + err = gopkgyaml.NewDecoder(f).Decode(&promCfg) + if err != nil && !errors.Is(err, io.EOF) { + return err + } + + // Remove scrape configs that match the job name prefix + var filteredConfigs []scrapeConfig + for _, cfg := range promCfg.ScrapeConfigs { + // Check if CA file still exists - if not, remove the config + if cfg.TlsConfig.CaFile != "" { + if _, err := os.Stat(cfg.TlsConfig.CaFile); os.IsNotExist(err) { + continue // Skip this config - CA file is gone + } + } + filteredConfigs = append(filteredConfigs, cfg) + } + + promCfg.ScrapeConfigs = filteredConfigs + + err = f.Truncate(0) + if err != nil { + return err + } + _, err = f.Seek(0, 0) + if err != nil { + return err + } + return gopkgyaml.NewEncoder(f).Encode(&promCfg) + }() + if err != nil { + return err + } + + // Reload Prometheus configuration + req, err := http.NewRequestWithContext(ctx, http.MethodPost, promUrl+"/-/reload", http.NoBody) + if err != nil { + return err + } + c := &http.Client{} + resp, err := c.Do(req) + if err != nil { + return err + } + resp.Body.Close() + return nil +}