Skip to content

Commit 55183f0

Browse files
authored
fix: prevent the panics when we try to retry with a different number of shards (#1547)
* prevent the panics when we try to retry with a different number of shards Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com> * Simplify Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com> --------- Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
1 parent a261ffa commit 55183f0

File tree

3 files changed

+78
-21
lines changed

3 files changed

+78
-21
lines changed

pkg/cache/stats_counter.go

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package cache
1515

1616
import (
17+
"errors"
1718
"fmt"
1819
"time"
1920

@@ -37,13 +38,13 @@ const (
3738
// StatsCounter is an interface for recording cache stats for goburrow.Cache.
3839
// The StatsCounter can be found at
3940
// - https://github.com/goburrow/cache/blob/f6da914dd6e3546dffa8802919dbca80cd33abe3/stats.go#L67
40-
var _ burrow.StatsCounter = (*burrowStatsCounter)(nil)
41+
var _ burrow.StatsCounter = (*BurrowStatsCounter)(nil)
4142

42-
// burrowStatsCounter is a StatsCounter implementation for burrow cache.
43+
// BurrowStatsCounter is a StatsCounter implementation for burrow cache.
4344
// It is a wrapper around prometheus metrics.
4445
// It has been intended to passed through the cache using cache.WithStatsCounter option
4546
// - https://github.com/goburrow/cache/blob/f6da914dd6e3546dffa8802919dbca80cd33abe3/local.go#L552
46-
type burrowStatsCounter struct {
47+
type BurrowStatsCounter struct {
4748
logger log.Logger
4849
reg prometheus.Registerer
4950

@@ -56,12 +57,12 @@ type burrowStatsCounter struct {
5657
}
5758

5859
// Option add options for default Cache.
59-
type Option func(c *burrowStatsCounter)
60+
type Option func(c *BurrowStatsCounter)
6061

6162
// WithTrackLoadingCacheStats enables tracking of loading cache stats.
6263
// It is disabled by default.
6364
func WithTrackLoadingCacheStats() Option {
64-
return func(c *burrowStatsCounter) {
65+
return func(c *BurrowStatsCounter) {
6566
c.trackLoadingCacheStats = true
6667
c.load = promauto.With(c.reg).NewCounterVec(prometheus.CounterOpts{
6768
Name: "cache_load_total",
@@ -79,9 +80,9 @@ func WithTrackLoadingCacheStats() Option {
7980
//
8081
// RecordLoadSuccess and RecordLoadError methods are called by Get methods on a successful and failed load respectively.
8182
// Get method only called by LoadingCache implementation.
82-
func NewBurrowStatsCounter(logger log.Logger, reg prometheus.Registerer, name string, options ...Option) *burrowStatsCounter {
83+
func NewBurrowStatsCounter(logger log.Logger, reg prometheus.Registerer, name string, options ...Option) *BurrowStatsCounter {
8384
reg = prometheus.WrapRegistererWith(prometheus.Labels{"cache": name}, reg)
84-
s := &burrowStatsCounter{
85+
s := &BurrowStatsCounter{
8586
logger: logger,
8687
reg: reg,
8788

@@ -100,27 +101,50 @@ func NewBurrowStatsCounter(logger log.Logger, reg prometheus.Registerer, name st
100101
return s
101102
}
102103

104+
// Unregister removes all metrics from the registry.
105+
func (c *BurrowStatsCounter) Unregister() error {
106+
var err error
107+
if ok := c.reg.Unregister(c.requests); !ok {
108+
err = errors.Join(err, fmt.Errorf("unregistering requests counter: %w", err))
109+
}
110+
if ok := c.reg.Unregister(c.eviction); !ok {
111+
err = errors.Join(err, fmt.Errorf("unregistering eviction counter: %w", err))
112+
}
113+
if c.trackLoadingCacheStats {
114+
if ok := c.reg.Unregister(c.load); !ok {
115+
err = errors.Join(err, fmt.Errorf("unregistering load counter: %w", err))
116+
}
117+
if ok := c.reg.Unregister(c.loadTotalTime); !ok {
118+
err = errors.Join(err, fmt.Errorf("unregistering load total time histogram: %w", err))
119+
}
120+
}
121+
if err != nil {
122+
return fmt.Errorf("cleaning cache stats counter: %w", err)
123+
}
124+
return nil
125+
}
126+
103127
// RecordHits records the number of hits.
104128
// It is part of the burrow.StatsCounter interface.
105129
//
106130
// This method is called by Get and GetIfPresent methods on a cache hit.
107-
func (c *burrowStatsCounter) RecordHits(hits uint64) {
131+
func (c *BurrowStatsCounter) RecordHits(hits uint64) {
108132
c.requests.WithLabelValues(lvHit).Add(float64(hits))
109133
}
110134

111135
// RecordMisses records the number of misses.
112136
// It is part of the burrow.StatsCounter interface.
113137
//
114138
// This method is called by Get and GetIfPresent methods method on a cache miss.
115-
func (c *burrowStatsCounter) RecordMisses(misses uint64) {
139+
func (c *BurrowStatsCounter) RecordMisses(misses uint64) {
116140
c.requests.WithLabelValues(lvMiss).Add(float64(misses))
117141
}
118142

119143
// RecordLoadSuccess records the number of successful loads.
120144
// It is part of the burrow.StatsCounter interface.
121145
//
122146
// This method is called by Get methods on a successful load.
123-
func (c *burrowStatsCounter) RecordLoadSuccess(loadTime time.Duration) {
147+
func (c *BurrowStatsCounter) RecordLoadSuccess(loadTime time.Duration) {
124148
if !c.trackLoadingCacheStats {
125149
return
126150
}
@@ -132,7 +156,7 @@ func (c *burrowStatsCounter) RecordLoadSuccess(loadTime time.Duration) {
132156
// It is part of the burrow.StatsCounter interface.
133157
//
134158
// This method is called by Get methods on a failed load.
135-
func (c *burrowStatsCounter) RecordLoadError(loadTime time.Duration) {
159+
func (c *BurrowStatsCounter) RecordLoadError(loadTime time.Duration) {
136160
if !c.trackLoadingCacheStats {
137161
return
138162
}
@@ -142,7 +166,7 @@ func (c *burrowStatsCounter) RecordLoadError(loadTime time.Duration) {
142166

143167
// RecordEviction records the number of evictions.
144168
// It is part of the burrow.StatsCounter interface.
145-
func (c *burrowStatsCounter) RecordEviction() {
169+
func (c *BurrowStatsCounter) RecordEviction() {
146170
c.eviction.Inc()
147171
}
148172

@@ -152,7 +176,7 @@ func (c *burrowStatsCounter) RecordEviction() {
152176
// This method is called only by Stats method. And it is just for debugging purpose.
153177
// Snapshot function is called manually and we don't plan to use it.
154178
// For completeness, we implemented it.
155-
func (c *burrowStatsCounter) Snapshot(s *burrow.Stats) {
179+
func (c *BurrowStatsCounter) Snapshot(s *burrow.Stats) {
156180
var err error
157181
s.HitCount, err = currentCounterVecValue(c.requests, lvHit)
158182
if err != nil {

pkg/profiler/cpu/cpu.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,9 @@ func loadBpfProgram(logger log.Logger, reg prometheus.Registerer, debugEnabled,
277277
// There's not enough free memory for these many unwind shards, let's retry with half
278278
// as many.
279279
if errors.Is(err, syscall.ENOMEM) {
280+
if err := bpfMaps.close(); err != nil { // Only required when we want to retry.
281+
return nil, nil, fmt.Errorf("failed to cleanup previously created bpfmaps: %w", err)
282+
}
280283
unwindShards /= 2
281284
} else {
282285
break

pkg/profiler/cpu/maps.go

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ type bpfMaps struct {
192192
programs *bpf.BPFMap
193193

194194
// Unwind stuff 🔬
195-
processCache burrow.Cache
195+
processCache *processCache
196196
mappingInfoMemory profiler.EfficientBuffer
197197

198198
buildIDMapping map[string]uint64
@@ -225,6 +225,34 @@ func min[T constraints.Ordered](a, b T) T {
225225
return b
226226
}
227227

228+
type processCache struct {
229+
burrow.Cache
230+
statsCounter *cache.BurrowStatsCounter
231+
}
232+
233+
func newProcessCache(logger log.Logger, reg prometheus.Registerer) *processCache {
234+
statsCounter := cache.NewBurrowStatsCounter(logger, reg, "cpu_map")
235+
return &processCache{
236+
Cache: burrow.New(
237+
burrow.WithMaximumSize(maxCachedProcesses),
238+
burrow.WithStatsCounter(statsCounter),
239+
),
240+
statsCounter: statsCounter,
241+
}
242+
}
243+
244+
// close closes the cache and makes sure the stats counter is unregistered.
245+
func (c *processCache) close() error {
246+
// Unregister the stats counter before closing the cache,
247+
// in case the cache could be initialized again.
248+
err := c.statsCounter.Unregister()
249+
// Close the cache.
250+
if err := c.Cache.Close(); err != nil {
251+
return errors.Join(err, fmt.Errorf("failed to close process cache: %w", err))
252+
}
253+
return err
254+
}
255+
228256
func initializeMaps(logger log.Logger, reg prometheus.Registerer, m *bpf.Module, byteOrder binary.ByteOrder) (*bpfMaps, error) {
229257
if m == nil {
230258
return nil, fmt.Errorf("nil module")
@@ -234,13 +262,10 @@ func initializeMaps(logger log.Logger, reg prometheus.Registerer, m *bpf.Module,
234262
unwindInfoMemory := make([]byte, maxUnwindTableSize*compactUnwindRowSizeBytes)
235263

236264
maps := &bpfMaps{
237-
logger: log.With(logger, "component", "maps"),
238-
module: m,
239-
byteOrder: byteOrder,
240-
processCache: burrow.New(
241-
burrow.WithMaximumSize(maxCachedProcesses),
242-
burrow.WithStatsCounter(cache.NewBurrowStatsCounter(logger, reg, "cpu_map")),
243-
),
265+
logger: log.With(logger, "component", "maps"),
266+
module: m,
267+
byteOrder: byteOrder,
268+
processCache: newProcessCache(logger, reg),
244269
mappingInfoMemory: mappingInfoMemory,
245270
unwindInfoMemory: unwindInfoMemory,
246271
buildIDMapping: make(map[string]uint64),
@@ -253,6 +278,11 @@ func initializeMaps(logger log.Logger, reg prometheus.Registerer, m *bpf.Module,
253278
return maps, nil
254279
}
255280

281+
// close closes all the resources associated with the maps.
282+
func (m *bpfMaps) close() error {
283+
return m.processCache.close()
284+
}
285+
256286
// adjustMapSizes updates the amount of unwind shards.
257287
//
258288
// Note: It must be called before `BPFLoadObject()`.

0 commit comments

Comments
 (0)