|
| 1 | +package leak |
| 2 | + |
| 3 | +/* |
| 4 | +Resource leak detector |
| 5 | +This module provides a Prometheus-based leak detector for long-running soak tests. It detects leaks by comparing the median resource usage at the start and end of a test and flags any increases that breach configured thresholds. |
| 6 | +
|
| 7 | +Usage Note: Set the WarmUpDuration to at least 20% of your test length for reliable metrics. |
| 8 | +It is also recommended to use it with 3h+ soak tests to get fewer false positives. |
| 9 | +*/ |
| 10 | + |
| 11 | +import ( |
| 12 | + "fmt" |
| 13 | + "strconv" |
| 14 | + "time" |
| 15 | + |
| 16 | + f "github.com/smartcontractkit/chainlink-testing-framework/framework" |
| 17 | +) |
| 18 | + |
// ResourceLeakCheckerConfig is the resource leak checker configuration.
// It currently carries only the Prometheus base URL.
type ResourceLeakCheckerConfig struct {
	// PrometheusBaseURL is the base URL of the Prometheus instance,
	// for example http://localhost:9099.
	PrometheusBaseURL string
}
| 23 | + |
// ResourceLeakChecker is a resource leak checker instance. It runs its
// Prometheus queries through the PromQuerier stored in c.
type ResourceLeakChecker struct {
	// PrometheusURL is the Prometheus base URL set by WithPrometheusBaseURL.
	PrometheusURL string
	// c is the query client; it is set by WithQueryClient or defaulted in
	// NewResourceLeakChecker.
	c PromQuerier
}
| 29 | + |
| 30 | +// WithPrometheusBaseURL sets Prometheus base URL, example http://localhost:9099 |
| 31 | +func WithPrometheusBaseURL(url string) func(*ResourceLeakChecker) { |
| 32 | + return func(rlc *ResourceLeakChecker) { |
| 33 | + rlc.PrometheusURL = url |
| 34 | + } |
| 35 | +} |
| 36 | + |
| 37 | +// WithQueryClient sets Prometheus query client |
| 38 | +func WithQueryClient(c PromQuerier) func(*ResourceLeakChecker) { |
| 39 | + return func(rlc *ResourceLeakChecker) { |
| 40 | + rlc.c = c |
| 41 | + } |
| 42 | +} |
| 43 | + |
// PromQuerier is an interface for querying Prometheus containing only the
// methods we need for detecting resource leaks.
type PromQuerier interface {
	// Query takes a query string and a timestamp and returns the Prometheus
	// query response or an error.
	Query(query string, timestamp time.Time) (*f.PrometheusQueryResponse, error)
}
| 48 | + |
| 49 | +// NewResourceLeakChecker creates a new resource leak checker |
| 50 | +func NewResourceLeakChecker(opts ...func(*ResourceLeakChecker)) *ResourceLeakChecker { |
| 51 | + lc := &ResourceLeakChecker{} |
| 52 | + for _, o := range opts { |
| 53 | + o(lc) |
| 54 | + } |
| 55 | + if lc.c == nil { |
| 56 | + lc.c = f.NewPrometheusQueryClient(f.LocalPrometheusBaseURL) |
| 57 | + } |
| 58 | + return lc |
| 59 | +} |
| 60 | + |
// CheckConfig describes leak check configuration.
type CheckConfig struct {
	// Query is the Prometheus query that is run for both samples.
	Query string
	// Start is the beginning of the test interval.
	Start time.Time
	// End is the end of the test interval; the second sample is queried at End.
	End time.Time
	// WarmUpDuration is added to Start to get the timestamp of the first
	// sample. MeasureLeak rejects values larger than half of End-Start.
	WarmUpDuration time.Duration
}
| 68 | + |
| 69 | +// MeasureLeak measures resource leak between start and end timestamps |
| 70 | +// WarmUpDuration is used to ignore warm up interval results for more stable comparison |
| 71 | +func (rc *ResourceLeakChecker) MeasureLeak( |
| 72 | + c *CheckConfig, |
| 73 | +) (float64, error) { |
| 74 | + if c.Start.After(c.End) { |
| 75 | + return 0, fmt.Errorf("start time is greated than end time: %s -> %s", c.Start, c.End) |
| 76 | + } |
| 77 | + if c.WarmUpDuration > c.End.Sub(c.Start)/2 { |
| 78 | + return 0, fmt.Errorf("warm up duration can't be more than 50 percent of test interval between start and end timestamps: %s", c.WarmUpDuration) |
| 79 | + } |
| 80 | + startWithWarmUp := c.Start.Add(c.WarmUpDuration) |
| 81 | + memStart, err := rc.c.Query(c.Query, startWithWarmUp) |
| 82 | + if err != nil { |
| 83 | + return 0, fmt.Errorf("failed to get memory for the test start: %w", err) |
| 84 | + } |
| 85 | + |
| 86 | + memEnd, err := rc.c.Query(c.Query, c.End) |
| 87 | + if err != nil { |
| 88 | + return 0, fmt.Errorf("failed to get memory for the test end: %w", err) |
| 89 | + } |
| 90 | + |
| 91 | + resStart := memStart.Data.Result |
| 92 | + resEnd := memEnd.Data.Result |
| 93 | + if len(resStart) == 0 { |
| 94 | + return 0, fmt.Errorf("no results for start timestamp: %s", c.Start) |
| 95 | + } |
| 96 | + if len(resEnd) == 0 { |
| 97 | + return 0, fmt.Errorf("no results for end timestamp: %s", c.End) |
| 98 | + } |
| 99 | + |
| 100 | + if len(resStart[0].Value) < 2 { |
| 101 | + return 0, fmt.Errorf("invalid Prometheus response for start timestamp, should have timestamp and value: %s", c.Start) |
| 102 | + } |
| 103 | + if len(resEnd[0].Value) < 2 { |
| 104 | + return 0, fmt.Errorf("invalid Prometheus response for end timestamp, should have timestamp and value: %s", c.End) |
| 105 | + } |
| 106 | + |
| 107 | + memStartVal, startOk := memStart.Data.Result[0].Value[1].(string) |
| 108 | + if !startOk { |
| 109 | + return 0, fmt.Errorf("invalid Prometheus response value for timestamp: %s, value: %v", c.Start, memStart.Data.Result[0].Value[1]) |
| 110 | + } |
| 111 | + memEndVal, endOk := memEnd.Data.Result[0].Value[1].(string) |
| 112 | + if !endOk { |
| 113 | + return 0, fmt.Errorf("invalid Prometheus response value for timestamp: %s, value: %v", c.End, memEnd.Data.Result[0].Value[1]) |
| 114 | + } |
| 115 | + |
| 116 | + memStartValFloat, err := strconv.ParseFloat(memStartVal, 64) |
| 117 | + if err != nil { |
| 118 | + return 0, fmt.Errorf("start quantile can't be parsed from string: %w", err) |
| 119 | + } |
| 120 | + memEndValFloat, err := strconv.ParseFloat(memEndVal, 64) |
| 121 | + if err != nil { |
| 122 | + return 0, fmt.Errorf("start quantile can't be parsed from string: %w", err) |
| 123 | + } |
| 124 | + |
| 125 | + totalIncreasePercentage := (memEndValFloat / memStartValFloat * 100) - 100 |
| 126 | + |
| 127 | + f.L.Debug(). |
| 128 | + Float64("Start", memStartValFloat). |
| 129 | + Float64("End", memEndValFloat). |
| 130 | + Float64("Increase", totalIncreasePercentage). |
| 131 | + Msg("Memory increase total (percentage)") |
| 132 | + return totalIncreasePercentage, nil |
| 133 | +} |
0 commit comments