
Commit e513754

fix race where map elements could be overwritten
1 parent 0a20e5c commit e513754

File tree: 7 files changed, +201 -115 lines changed


sdk/metric/internal/aggregate/atomic.go

Lines changed: 11 additions & 11 deletions

@@ -8,35 +8,35 @@ import (
 	"sync/atomic"
 )
 
-// counter is an efficient way of adding to a number which is either an
+// atomicSum is an efficient way of adding to a number which is either an
 // int64 or float64.
-type counter[N int64 | float64] struct {
+type atomicSum[N int64 | float64] struct {
 	// nFloatBits contains only the non-integer portion of the counter.
-	nFloatBits uint64
+	nFloatBits atomic.Uint64
 	// nInt contains only the integer portion of the counter.
-	nInt uint64
+	nInt atomic.Uint64
 }
 
 // value returns the float or integer value.
-func (n *counter[N]) value() N {
-	fval := math.Float64frombits(atomic.LoadUint64(&n.nFloatBits))
-	ival := atomic.LoadUint64(&n.nInt)
+func (n *atomicSum[N]) value() N {
+	fval := math.Float64frombits(n.nFloatBits.Load())
+	ival := n.nInt.Load()
 	return N(fval + float64(ival))
 }
 
-func (n *counter[N]) add(value N) {
+func (n *atomicSum[N]) add(value N) {
 	ival := uint64(value)
 	// This case is where the value is an int, or if it is a whole-numbered float.
 	if float64(ival) == float64(value) {
-		atomic.AddUint64(&n.nInt, ival)
+		n.nInt.Add(ival)
 		return
 	}
 
 	// Value must be a float below.
 	for {
-		oldBits := atomic.LoadUint64(&n.nFloatBits)
+		oldBits := n.nFloatBits.Load()
 		newBits := math.Float64bits(math.Float64frombits(oldBits) + float64(value))
-		if atomic.CompareAndSwapUint64(&n.nFloatBits, oldBits, newBits) {
+		if n.nFloatBits.CompareAndSwap(oldBits, newBits) {
 			return
 		}
 	}
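
The float path above has no atomic add available, so atomicSum keeps the IEEE-754 bit pattern in an atomic.Uint64 and folds new values in with a compare-and-swap retry loop. A minimal standalone sketch of that technique follows; atomicFloat64 and the demo program are illustrative only, not part of the SDK:

package main

import (
	"fmt"
	"math"
	"sync"
	"sync/atomic"
)

// atomicFloat64 illustrates the CAS technique used by atomicSum: the float is
// kept as its bit pattern in an atomic.Uint64 and updated in a retry loop.
type atomicFloat64 struct {
	bits atomic.Uint64
}

func (a *atomicFloat64) add(v float64) {
	for {
		oldBits := a.bits.Load()
		newBits := math.Float64bits(math.Float64frombits(oldBits) + v)
		// If another goroutine updated the value between Load and
		// CompareAndSwap, retry with the fresh bits.
		if a.bits.CompareAndSwap(oldBits, newBits) {
			return
		}
	}
}

func (a *atomicFloat64) load() float64 {
	return math.Float64frombits(a.bits.Load())
}

func main() {
	var (
		sum atomicFloat64
		wg  sync.WaitGroup
	)
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				sum.add(0.5)
			}
		}()
	}
	wg.Wait()
	fmt.Println(sum.load()) // 4000
}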

sdk/metric/internal/aggregate/exponential_histogram.go

Lines changed: 34 additions & 18 deletions

@@ -72,6 +72,11 @@ func newExpoHistogramDataPoint[N int64 | float64](
 	}
 }
 
+func (p *expoHistogramDataPoint[N]) measure(ctx context.Context, v N, droppedAttr []attribute.KeyValue) {
+	p.record(v)
+	p.res.Offer(ctx, v, droppedAttr)
+}
+
 // record adds a new measurement to the histogram. It will rescale the buckets if needed.
 func (p *expoHistogramDataPoint[N]) record(v N) {
 	p.count++
@@ -316,10 +321,10 @@ type expoHistogram[N int64 | float64] struct {
 	maxSize  int
 	maxScale int32
 
-	newRes   func(attribute.Set) FilteredExemplarReservoir[N]
-	limit    limiter[expoHistogramDataPoint[N]]
-	values   map[attribute.Distinct]*expoHistogramDataPoint[N]
-	valuesMu sync.Mutex
+	newRes func(attribute.Set) FilteredExemplarReservoir[N]
+	limit  limiter[expoHistogramDataPoint[N]]
+	values map[attribute.Distinct]*expoHistogramDataPoint[N]
+	sync.RWMutex
 
 	start time.Time
 }
@@ -335,19 +340,30 @@ func (e *expoHistogram[N]) measure(
 		return
 	}
 
-	e.valuesMu.Lock()
-	defer e.valuesMu.Unlock()
-
+	// Hold the RLock even after we are done reading from the values map to
+	// ensure we don't race with collection.
+	e.RLock()
 	attr := e.limit.Attributes(fltrAttr, e.values)
 	v, ok := e.values[attr.Equivalent()]
-	if !ok {
-		v = newExpoHistogramDataPoint[N](attr, e.maxSize, e.maxScale, e.noMinMax, e.noSum)
-		v.res = e.newRes(attr)
-
-		e.values[attr.Equivalent()] = v
+	if ok {
+		v.measure(ctx, value, droppedAttr)
+		e.RUnlock()
+		return
+	}
+	e.RUnlock()
+	// Switch to a full lock to add a new element to the map.
+	e.Lock()
+	defer e.Unlock()
+	// Check that the element wasn't added since we last checked.
+	v, ok = e.values[attr.Equivalent()]
+	if ok {
+		v.measure(ctx, value, droppedAttr)
+		return
 	}
-	v.record(value)
-	v.res.Offer(ctx, value, droppedAttr)
+	v = newExpoHistogramDataPoint[N](attr, e.maxSize, e.maxScale, e.noMinMax, e.noSum)
+	v.res = e.newRes(attr)
+	v.measure(ctx, value, droppedAttr)
+	e.values[attr.Equivalent()] = v
 }
 
 func (e *expoHistogram[N]) delta(
@@ -360,8 +376,8 @@ func (e *expoHistogram[N]) delta(
 	h, _ := (*dest).(metricdata.ExponentialHistogram[N])
 	h.Temporality = metricdata.DeltaTemporality
 
-	e.valuesMu.Lock()
-	defer e.valuesMu.Unlock()
+	e.Lock()
+	defer e.Unlock()
 
 	n := len(e.values)
 	hDPts := reset(h.DataPoints, n, n)
@@ -423,8 +439,8 @@ func (e *expoHistogram[N]) cumulative(
 	h, _ := (*dest).(metricdata.ExponentialHistogram[N])
 	h.Temporality = metricdata.CumulativeTemporality
 
-	e.valuesMu.Lock()
-	defer e.valuesMu.Unlock()
+	e.Lock()
+	defer e.Unlock()
 
 	n := len(e.values)
 	hDPts := reset(h.DataPoints, n, n)
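
The rewritten measure above is a double-checked locking pattern: look the data point up under the read lock, and only when it is missing upgrade to the write lock and re-check before inserting, so two goroutines measuring the same attribute set can no longer each create a data point and have one overwrite the other. The commit additionally keeps the read lock held while recording into an existing data point so collection, which takes the write lock, cannot observe a half-recorded measurement. A minimal sketch of just the map-insertion part, using a hypothetical store type rather than the SDK's aggregators:

package main

import (
	"fmt"
	"sync"
)

// store illustrates the read-mostly map pattern adopted by the commit:
// readers share the RLock, and only a goroutine that must insert a new key
// takes the full Lock and re-checks the map before writing.
type store struct {
	sync.RWMutex
	values map[string]*int
}

func (s *store) get(key string) *int {
	// Fast path: most lookups hit an existing element.
	s.RLock()
	v, ok := s.values[key]
	s.RUnlock()
	if ok {
		return v
	}

	// Slow path: take the write lock and re-check, because another goroutine
	// may have inserted the element between RUnlock and Lock.
	s.Lock()
	defer s.Unlock()
	if v, ok = s.values[key]; ok {
		return v
	}
	v = new(int)
	s.values[key] = v
	return v
}

func main() {
	s := &store{values: make(map[string]*int)}
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			_ = s.get("alice") // all goroutines race to create the same key
		}()
	}
	wg.Wait()
	fmt.Println(len(s.values)) // 1: one shared element, none overwritten
}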

sdk/metric/internal/aggregate/filtered_reservoir.go

Lines changed: 4 additions & 2 deletions

@@ -47,6 +47,8 @@ func NewFilteredExemplarReservoir[N int64 | float64](
 
 func (f *filteredExemplarReservoir[N]) Offer(ctx context.Context, val N, attr []attribute.KeyValue) {
 	if f.filter(ctx) {
+		// We need to lock here because the individual aggregation only holds a
+		// read lock.
 		f.mu.Lock()
 		defer f.mu.Unlock()
 		// only record the current time if we are sampling this measurement.
@@ -55,7 +57,7 @@ func (f *filteredExemplarReservoir[N]) Offer(ctx context.Context, val N, attr []
 }
 
 func (f *filteredExemplarReservoir[N]) Collect(dest *[]exemplar.Exemplar) {
-	f.mu.Lock()
-	defer f.mu.Unlock()
+	// No need to lock here because the individual aggregation already holds
+	// the RW lock.
 	f.reservoir.Collect(dest)
 }
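
This change flips who owns the reservoir's synchronization: Offer now locks internally because measurements run under the aggregation's shared read lock, while Collect relies on the aggregation holding its exclusive write lock for the whole collection. A rough sketch of that contract with simplified, hypothetical types (reservoir here is not the SDK's exemplar reservoir):

package main

import (
	"fmt"
	"sync"
)

// reservoir is a stand-in illustrating the locking contract: Offer may be
// called concurrently and locks internally; Collect must only be called while
// the owning aggregation holds its write lock, so it takes no lock of its own.
type reservoir struct {
	mu   sync.Mutex
	vals []float64
}

// Offer is safe for concurrent use; callers only hold a shared read lock.
func (r *reservoir) Offer(v float64) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.vals = append(r.vals, v)
}

// Collect assumes the caller has excluded all writers, so no lock is taken.
func (r *reservoir) Collect(dest *[]float64) {
	*dest = append((*dest)[:0], r.vals...)
}

func main() {
	var (
		agg sync.RWMutex // stands in for the aggregation's embedded RWMutex
		r   reservoir
		wg  sync.WaitGroup
	)
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func(v float64) {
			defer wg.Done()
			agg.RLock() // measurement path: shared lock only
			r.Offer(v)
			agg.RUnlock()
		}(float64(i))
	}
	wg.Wait()

	agg.Lock() // collection path: exclusive lock, no reservoir lock needed
	var out []float64
	r.Collect(&out)
	agg.Unlock()
	fmt.Println(len(out)) // 4
}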

sdk/metric/internal/aggregate/histogram.go

Lines changed: 66 additions & 52 deletions

@@ -18,37 +18,42 @@ import (
 type buckets[N int64 | float64] struct {
 	count    uint64
 	counts   []uint64
+	noSum    bool
+	noMinMax bool
 	min, max atomic.Value
-	total    *counter[N]
+	total    atomicSum[N]
 
 	attrs attribute.Set
 	res   FilteredExemplarReservoir[N]
 }
 
-// newBuckets returns buckets with n bins.
-func newBuckets[N int64 | float64](attrs attribute.Set, n int) *buckets[N] {
-	return &buckets[N]{attrs: attrs, counts: make([]uint64, n), total: &counter[N]{}}
-}
-
-func (b *buckets[N]) bin(idx int) {
+func (b *buckets[N]) measure(
+	ctx context.Context,
+	value N,
+	idx int,
+	droppedAttr []attribute.KeyValue,
+) {
 	atomic.AddUint64(&b.counts[idx], 1)
 	atomic.AddUint64(&b.count, 1)
-}
-
-func (b *buckets[N]) minMax(value N) {
-	for {
-		minLoaded := b.min.Load()
-		if value < minLoaded.(N) && !b.min.CompareAndSwap(minLoaded, value) {
-			// We got a new min value, but lost the race. Try again.
-			continue
-		}
-		maxLoaded := b.max.Load()
-		if value > maxLoaded.(N) && !b.max.CompareAndSwap(maxLoaded, value) {
-			// We got a new max value, but lost the race. Try again.
-			continue
+	if !b.noMinMax {
+		for {
+			minLoaded := b.min.Load()
+			if (minLoaded == nil || value < minLoaded.(N)) && !b.min.CompareAndSwap(minLoaded, value) {
+				// We got a new min value, but lost the race. Try again.
+				continue
+			}
+			maxLoaded := b.max.Load()
+			if (maxLoaded == nil || value > maxLoaded.(N)) && !b.max.CompareAndSwap(maxLoaded, value) {
+				// We got a new max value, but lost the race. Try again.
+				continue
+			}
+			break
 		}
-		return
 	}
+	if !b.noSum {
+		b.total.add(value)
+	}
+	b.res.Offer(ctx, value, droppedAttr)
 }
 
 // histValues summarizes a set of measurements as an histValues with
@@ -58,10 +63,10 @@ type histValues[N int64 | float64] struct {
 	noMinMax bool
 	bounds   []float64
 
-	newRes   func(attribute.Set) FilteredExemplarReservoir[N]
-	limit    limiter[buckets[N]]
-	values   map[attribute.Distinct]*buckets[N]
-	valuesMu sync.RWMutex
+	newRes func(attribute.Set) FilteredExemplarReservoir[N]
+	limit  limiter[buckets[N]]
+	values map[attribute.Distinct]*buckets[N]
+	sync.RWMutex
 }
 
 func newHistValues[N int64 | float64](
@@ -102,39 +107,42 @@ func (s *histValues[N]) measure(
 	// (s.bounds[len(s.bounds)-1], +∞).
 	idx := sort.SearchFloat64s(s.bounds, float64(value))
 
-	s.valuesMu.RLock()
-
+	// Hold the RLock even after we are done reading from the values map to
+	// ensure we don't race with collection.
+	s.RLock()
 	attr := s.limit.Attributes(fltrAttr, s.values)
 	b, ok := s.values[attr.Equivalent()]
-	s.valuesMu.RUnlock()
-	if !ok {
+	if ok {
+		b.measure(ctx, value, idx, droppedAttr)
+		s.RUnlock()
+		return
+	}
+	s.RUnlock()
+	// Switch to a full lock to add a new element to the map.
+	s.Lock()
+	defer s.Unlock()
+	// Check that the element wasn't added since we last checked.
+	b, ok = s.values[attr.Equivalent()]
+	if ok {
+		b.measure(ctx, value, idx, droppedAttr)
+		return
+	}
+	b = &buckets[N]{
+		attrs: attr,
 		// N+1 buckets. For example:
 		//
 		// bounds = [0, 5, 10]
 		//
 		// Then,
 		//
 		// buckets = (-∞, 0], (0, 5.0], (5.0, 10.0], (10.0, +∞)
-		b = newBuckets[N](attr, len(s.bounds)+1)
-		b.res = s.newRes(attr)
-
-		// Ensure min and max are recorded values (not zero), for new buckets.
-		if !s.noMinMax {
-			b.min.Store(value)
-			b.max.Store(value)
-		}
-		s.valuesMu.Lock()
-		s.values[attr.Equivalent()] = b
-		s.valuesMu.Unlock()
-	}
-	b.bin(idx)
-	if !s.noMinMax {
-		b.minMax(value)
+		counts:   make([]uint64, len(s.bounds)+1),
+		res:      s.newRes(attr),
+		noSum:    s.noSum,
+		noMinMax: s.noMinMax,
 	}
-	if !s.noSum {
-		b.total.add(value)
-	}
-	b.res.Offer(ctx, value, droppedAttr)
+	b.measure(ctx, value, idx, droppedAttr)
+	s.values[attr.Equivalent()] = b
 }
 
 // newHistogram returns an Aggregator that summarizes a set of measurements as
@@ -169,8 +177,11 @@ func (s *histogram[N]) delta(
 	h, _ := (*dest).(metricdata.Histogram[N])
 	h.Temporality = metricdata.DeltaTemporality
 
-	s.valuesMu.Lock()
-	defer s.valuesMu.Unlock()
+	// Acquire a full lock to ensure there are no concurrent measure() calls.
+	// If we only used a RLock, we could observe "partial" measurements, such
+	// as a histogram count increment without a histogram total increment.
+	s.Lock()
+	defer s.Unlock()
 
 	// Do not allow modification of our copy of bounds.
 	bounds := slices.Clone(s.bounds)
@@ -221,8 +232,11 @@ func (s *histogram[N]) cumulative(
 	h, _ := (*dest).(metricdata.Histogram[N])
 	h.Temporality = metricdata.CumulativeTemporality
 
-	s.valuesMu.Lock()
-	defer s.valuesMu.Unlock()
+	// Acquire a full lock to ensure there are no concurrent measure() calls.
+	// If we only used a RLock, we could observe "partial" measurements, such
+	// as a histogram count increment without a histogram total increment.
+	s.Lock()
+	defer s.Unlock()
 
 	// Do not allow modification of our copy of bounds.
 	bounds := slices.Clone(s.bounds)
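
Min and max tracking now happens inside buckets.measure and treats a nil atomic.Value load as "nothing recorded yet", replacing the old approach of pre-seeding min and max when a bucket was created. A small standalone sketch of that nil-aware compare-and-swap loop; updateMin is an illustrative helper, not SDK code:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// updateMin records v as the minimum seen so far. A nil load means no value
// has been recorded yet, so the first caller simply swaps its value in.
func updateMin(m *atomic.Value, v float64) {
	for {
		cur := m.Load()
		if cur != nil && v >= cur.(float64) {
			return // the existing minimum is already smaller or equal
		}
		// First value, or a new minimum: try to swap it in. If another
		// goroutine won the race, reload and re-evaluate.
		if m.CompareAndSwap(cur, v) {
			return
		}
	}
}

func main() {
	var (
		minVal atomic.Value
		wg     sync.WaitGroup
	)
	for _, v := range []float64{3, 1, 4, 1.5, 9, 2.6} {
		wg.Add(1)
		go func(v float64) {
			defer wg.Done()
			updateMin(&minVal, v)
		}(v)
	}
	wg.Wait()
	fmt.Println(minVal.Load()) // 1
}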

sdk/metric/internal/aggregate/histogram_test.go

Lines changed: 12 additions & 10 deletions

@@ -277,9 +277,11 @@ func TestBucketsBin(t *testing.T) {
 
 func testBucketsBin[N int64 | float64]() func(t *testing.T) {
 	return func(t *testing.T) {
-		b := newBuckets[N](alice, 3)
-		b.min.Store(N(0))
-		b.max.Store(N(0))
+		b := &buckets[N]{
+			attrs:  alice,
+			counts: make([]uint64, 3),
+			res:    dropExemplars[N](alice),
+		}
 		assertB := func(counts []uint64, count uint64, mi, ma N) {
 			t.Helper()
 			assert.Equal(t, counts, b.counts)
@@ -288,12 +290,9 @@ func testBucketsBin[N int64 | float64]() func(t *testing.T) {
 			assert.Equal(t, ma, b.max.Load().(N))
 		}
 
-		assertB([]uint64{0, 0, 0}, 0, 0, 0)
-		b.bin(1)
-		b.minMax(2)
-		assertB([]uint64{0, 1, 0}, 1, 0, 2)
-		b.bin(0)
-		b.minMax(-1)
+		b.measure(context.Background(), 2, 1, nil)
+		assertB([]uint64{0, 1, 0}, 1, 2, 2)
+		b.measure(context.Background(), -1, 0, nil)
 		assertB([]uint64{1, 1, 0}, 2, -1, 2)
 	}
 }
@@ -305,7 +304,10 @@ func TestBucketsSum(t *testing.T) {
 
 func testBucketsSum[N int64 | float64]() func(t *testing.T) {
 	return func(t *testing.T) {
-		b := newBuckets[N](alice, 3)
+		b := &buckets[N]{
+			attrs:  alice,
+			counts: make([]uint64, 3),
+		}
 
 		var want N
 		assert.Equal(t, want, b.total.value())
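
These tests exercise the new buckets API directly; the concurrent paths the commit fixes are the kind of thing Go's race detector is built to catch, so a reasonable local check (assuming it is run from within the sdk/metric module directory) would be:

go test -race ./internal/aggregate/...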
