
Commit 86533fd

Improve histogram, summary performance under contention by striping observationCount (#1794)
Was working on improving the performance of opentelemetry-java metrics under high contention, and realized that the same strategy I identified to help over there helps for the Prometheus implementation as well!

The idea here is recognizing that `Buffer.observationCount` is the bottleneck under contention. In contrast to the other histogram / summary `LongAdder` fields, `Buffer.observationCount` is an `AtomicLong`, which performs much worse than `LongAdder` under high contention. It's necessary that the type is `AtomicLong`, because the CAS APIs accommodate the two-way communication that the record / collect paths need to signal that a collection has started and all records have successfully completed (preventing partial writes). However, we can "have our cake and eat it too" by striping `Buffer.observationCount` into many instances, such that the contention on any one instance is reduced. This is actually what `LongAdder` does under the covers. This implementation stripes it into `Runtime.getRuntime().availableProcessors()` instances, and uses `Thread.currentThread().getId() % stripedObservationCounts.length` to select which instance any particular recording thread should use. The performance increase is substantial.
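The striping strategy can be sketched standalone as follows. This is a minimal illustration, not code from this PR: the class name `StripedCounter` and the stripe/thread/iteration counts are chosen for the example.

```java
import java.util.concurrent.atomic.AtomicLong;

// Illustrative sketch of counter striping: many AtomicLong stripes so
// concurrent increments rarely contend on the same instance.
public class StripedCounter {
  private final AtomicLong[] stripes;

  public StripedCounter(int nStripes) {
    stripes = new AtomicLong[nStripes];
    for (int i = 0; i < stripes.length; i++) {
      stripes[i] = new AtomicLong(0);
    }
  }

  // Each recording thread increments one stripe, chosen by its thread id,
  // so contention on any single AtomicLong is reduced.
  public void increment() {
    int index = Math.abs((int) Thread.currentThread().getId()) % stripes.length;
    stripes[index].incrementAndGet();
  }

  // The collect path sums all stripes to recover the total count.
  public long sum() {
    long total = 0;
    for (AtomicLong stripe : stripes) {
      total += stripe.get();
    }
    return total;
  }

  public static void main(String[] args) throws InterruptedException {
    StripedCounter counter = new StripedCounter(4);
    Thread[] threads = new Thread[8];
    for (int i = 0; i < threads.length; i++) {
      threads[i] = new Thread(() -> {
        for (int j = 0; j < 10_000; j++) {
          counter.increment();
        }
      });
      threads[i].start();
    }
    for (Thread t : threads) {
      t.join();
    }
    System.out.println(counter.sum()); // prints 80000: no increments lost
  }
}
```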
Here's the before and after of `HistogramBenchmark` on my machine (Apple M4 Mac Pro w/ 48gb RAM):

Before:

```
Benchmark                                     Mode  Cnt      Score      Error  Units
HistogramBenchmark.openTelemetryClassic      thrpt   25   1138.465 ±  165.921  ops/s
HistogramBenchmark.openTelemetryExponential  thrpt   25    677.483 ±   28.765  ops/s
HistogramBenchmark.prometheusClassic         thrpt   25   5126.048 ±  153.878  ops/s
HistogramBenchmark.prometheusNative          thrpt   25   3854.323 ±  107.789  ops/s
HistogramBenchmark.simpleclient              thrpt   25  13285.351 ± 1784.506  ops/s
```

After:

```
Benchmark                                     Mode  Cnt      Score      Error  Units
HistogramBenchmark.openTelemetryClassic      thrpt   25    925.528 ±   13.744  ops/s
HistogramBenchmark.openTelemetryExponential  thrpt   25    584.404 ±   32.762  ops/s
HistogramBenchmark.prometheusClassic         thrpt   25  14623.971 ± 2117.588  ops/s
HistogramBenchmark.prometheusNative          thrpt   25   7405.672 ±  857.611  ops/s
HistogramBenchmark.simpleclient              thrpt   25  13102.822 ± 3081.096  ops/s
```

---------

Signed-off-by: Jack Berg <34418638+jack-berg@users.noreply.github.com>
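For context on the `LongAdder` comparison above: `LongAdder` applies the same striping internally, yet `sum()` still recovers an exact total. A minimal demonstration (the class name `LongAdderDemo` is just for this example):

```java
import java.util.concurrent.atomic.LongAdder;

// Demonstrates that LongAdder, which stripes its state internally to
// reduce contention, still yields an exact total via sum().
public class LongAdderDemo {
  public static void main(String[] args) throws InterruptedException {
    LongAdder adder = new LongAdder();
    Thread[] threads = new Thread[8];
    for (int i = 0; i < threads.length; i++) {
      threads[i] = new Thread(() -> {
        for (int j = 0; j < 10_000; j++) {
          adder.increment();
        }
      });
      threads[i].start();
    }
    for (Thread t : threads) {
      t.join();
    }
    System.out.println(adder.sum()); // prints 80000
  }
}
```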
1 parent e07e6dd commit 86533fd

File tree

1 file changed: +31, -7 lines

  • prometheus-metrics-core/src/main/java/io/prometheus/metrics/core/metrics

prometheus-metrics-core/src/main/java/io/prometheus/metrics/core/metrics/Buffer.java

Lines changed: 31 additions & 7 deletions
```diff
@@ -18,7 +18,15 @@
 class Buffer {
 
   private static final long bufferActiveBit = 1L << 63;
-  private final AtomicLong observationCount = new AtomicLong(0);
+  // Tracking observation counts requires an AtomicLong for coordination between recording and
+  // collecting. AtomicLong does much worse under contention than the LongAdder instances used
+  // elsewhere to hold aggregated state. To improve, we stripe the AtomicLong into N instances,
+  // where N is the number of available processors. Each record operation chooses the appropriate
+  // instance to use based on the modulo of its thread id and N. This is a more naive / simple
+  // implementation compared to the striping used under the hood in java.util.concurrent classes
+  // like LongAdder - contention and hot spots can still occur if recording thread ids happen to
+  // resolve to the same index. Further improvement is possible.
+  private final AtomicLong[] stripedObservationCounts;
   private double[] observationBuffer = new double[0];
   private int bufferPos = 0;
   private boolean reset = false;
@@ -27,8 +35,17 @@ class Buffer {
   ReentrantLock runLock = new ReentrantLock();
   Condition bufferFilled = appendLock.newCondition();
 
+  Buffer() {
+    stripedObservationCounts = new AtomicLong[Runtime.getRuntime().availableProcessors()];
+    for (int i = 0; i < stripedObservationCounts.length; i++) {
+      stripedObservationCounts[i] = new AtomicLong(0);
+    }
+  }
+
   boolean append(double value) {
-    long count = observationCount.incrementAndGet();
+    int index = Math.abs((int) Thread.currentThread().getId()) % stripedObservationCounts.length;
+    AtomicLong observationCountForThread = stripedObservationCounts[index];
+    long count = observationCountForThread.incrementAndGet();
     if ((count & bufferActiveBit) == 0) {
       return false; // sign bit not set -> buffer not active.
     } else {
@@ -69,7 +86,10 @@ <T extends DataPointSnapshot> T run(
     runLock.lock();
     try {
       // Signal that the buffer is active.
-      Long expectedCount = observationCount.getAndAdd(bufferActiveBit);
+      long expectedCount = 0L;
+      for (AtomicLong observationCount : stripedObservationCounts) {
+        expectedCount += observationCount.getAndAdd(bufferActiveBit);
+      }
 
       while (!complete.apply(expectedCount)) {
         // Wait until all in-flight threads have added their observations to the histogram /
@@ -81,14 +101,18 @@ <T extends DataPointSnapshot> T run(
       result = createResult.get();
 
       // Signal that the buffer is inactive.
-      int expectedBufferSize;
+      long expectedBufferSize = 0;
       if (reset) {
-        expectedBufferSize =
-            (int) ((observationCount.getAndSet(0) & ~bufferActiveBit) - expectedCount);
+        for (AtomicLong observationCount : stripedObservationCounts) {
+          expectedBufferSize += observationCount.getAndSet(0) & ~bufferActiveBit;
+        }
         reset = false;
       } else {
-        expectedBufferSize = (int) (observationCount.addAndGet(bufferActiveBit) - expectedCount);
+        for (AtomicLong observationCount : stripedObservationCounts) {
+          expectedBufferSize += observationCount.addAndGet(bufferActiveBit);
+        }
       }
+      expectedBufferSize -= expectedCount;
 
       appendLock.lock();
       try {
```
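The two-way signalling that forces the counts to stay `AtomicLong` can be seen in isolation. The sketch below (class name `SignBitSketch` is hypothetical) walks through one collect cycle on a single, unstriped counter: bit 63 marks the buffer as active while the low bits keep counting.

```java
import java.util.concurrent.atomic.AtomicLong;

// Sketch of the sign-bit signalling in Buffer: getAndAdd(1L << 63) sets
// the sign bit and returns the pre-collection count in one atomic step;
// adding the bit again wraps it back to zero, ending the collection.
public class SignBitSketch {
  private static final long bufferActiveBit = 1L << 63;

  public static void main(String[] args) {
    AtomicLong count = new AtomicLong(0);
    count.addAndGet(5); // five observations recorded before collection

    // Collection starts: sign bit is now set, and we learn how many
    // observations must complete before the snapshot is consistent.
    long expectedCount = count.getAndAdd(bufferActiveBit);
    // expectedCount == 5; (count.get() & bufferActiveBit) != 0, so
    // append() now routes new observations into the buffer.

    count.incrementAndGet(); // one observation lands during collection

    // Collection ends: adding bufferActiveBit again overflows bit 63
    // back to zero; masking it off yields the total observation count.
    long total = count.addAndGet(bufferActiveBit) & ~bufferActiveBit;
    System.out.println(total - expectedCount); // prints 1: one buffered observation
  }
}
```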
