Skip to content

Commit a22cde2

Browse files
committed
Export client request latencies as histograms
This adds support for histogram-style metrics as an alternative to summaries. It means we can sum at the cluster level and present the latency the user actually experiences, instead of looking at it on a per-node basis. The current version limits the range of histogram buckets to between 0.1ms and 60s, to avoid exporting a huge number of buckets that are likely empty. Future patches could reduce this further, for example by going for a 1.44x increment instead of the 1.2x increment, or by specifying the ranges in the configuration. Even with the limits in place, this exports 76 metrics in 3 metric families per histogram. The original summaries-based code only exports 8 metrics (in 3 families), though in theory those are no longer needed and could be disabled with a flag.
1 parent 2767af9 commit a22cde2

File tree

4 files changed

+83
-0
lines changed

4 files changed

+83
-0
lines changed

common/src/main/java/com/zegelin/cassandra/exporter/CassandraMetricsUtilities.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ public Iterable<Interval> getIntervals() {
8787
new Interval(Interval.Quantile.P_99_9, (float) timer.get999thPercentile() * durationFactor)
8888
);
8989
}
90+
91+
@Override
92+
public long[] getValues() {
93+
return timer.values();
94+
}
9095
};
9196
}
9297

@@ -108,6 +113,11 @@ public Iterable<Interval> getIntervals() {
108113
new Interval(Interval.Quantile.P_99_9, (float) histogram.get999thPercentile())
109114
);
110115
}
116+
117+
@Override
118+
public long[] getValues() {
119+
return histogram.values();
120+
}
111121
};
112122
}
113123

@@ -125,6 +135,11 @@ public Iterable<Interval> getIntervals() {
125135

126136
return Interval.asIntervals(Interval.Quantile.STANDARD_PERCENTILES, q -> (float) snapshot.getValue(q.value));
127137
}
138+
139+
@Override
140+
public long[] getValues() {
141+
return metric.getSnapshot().getValues();
142+
}
128143
};
129144
}
130145

common/src/main/java/com/zegelin/cassandra/exporter/CollectorFunctions.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import org.apache.cassandra.metrics.CassandraMetricsRegistry.JmxMeterMBean;
1111
import org.apache.cassandra.utils.EstimatedHistogram;
1212

13+
import java.util.ArrayList;
1314
import java.util.stream.Stream;
1415

1516
public final class CollectorFunctions {
@@ -176,4 +177,58 @@ protected static CollectorFunction<SamplingCounting> samplingAndCountingAsSummar
176177
public static CollectorFunction<SamplingCounting> samplingAndCountingAsSummary() {
177178
return samplingAndCountingAsSummary(FloatFloatFunction.identity());
178179
}
180+
181+
/**
182+
* Collect a {@link SamplingCounting} as a Prometheus histogram.
183+
*/
184+
protected static CollectorFunction<SamplingCounting> samplingAndCountingAsHistogram(final FloatFloatFunction bucketScaleFunction) {
185+
// Set some limits on the range so we don't export all 170 buckets
186+
float bucketMin = 0.0001f; // 0.1ms
187+
float bucketMax = 60.0f; // 60sec
188+
189+
// Avoid recomputing the buckets frequently. Cassandra uses ~170 buckets
190+
float[] cachedBuckets = newBucketOffsets(200, bucketScaleFunction);
191+
192+
return group -> {
193+
final Stream<HistogramMetricFamily.Histogram> histogramStream = group.labeledObjects().entrySet().stream()
194+
.map(e -> {
195+
long[] values = e.getValue().getValues();
196+
float[] buckets = values.length <= cachedBuckets.length
197+
? cachedBuckets
198+
: newBucketOffsets(values.length, bucketScaleFunction);
199+
200+
float sum = 0;
201+
long count = 0;
202+
ArrayList<Interval> intervals = new ArrayList<>();
203+
assert values[values.length-1] == 0;
204+
205+
for (int i = 0; i < values.length; i++) {
206+
if (values[i] != 0) {
207+
sum += buckets[i] * values[i];
208+
count += values[i];
209+
}
210+
if (buckets[i] >= bucketMin && buckets[i] <= bucketMax) {
211+
intervals.add(new Interval(new Interval.Quantile(buckets[i]), count));
212+
}
213+
}
214+
215+
return new HistogramMetricFamily.Histogram(e.getKey(), sum, count, intervals);
216+
});
217+
218+
return Stream.of(new HistogramMetricFamily(group.name(), group.help(), histogramStream));
219+
};
220+
}
221+
222+
public static CollectorFunction<SamplingCounting> samplingAndCountingAsHistogram() {
223+
return samplingAndCountingAsHistogram(FloatFloatFunction.identity());
224+
}
225+
226+
private static float[] newBucketOffsets(int size, final FloatFloatFunction bucketScaleFunction) {
227+
long[] rawOffsets = EstimatedHistogram.newOffsets(size, false);
228+
float[] adjustedOffsets = new float[size];
229+
for (int i = 0; i < size; i++) {
230+
adjustedOffsets[i] = bucketScaleFunction.apply(rawOffsets[i]);
231+
}
232+
return adjustedOffsets;
233+
}
179234
}

common/src/main/java/com/zegelin/cassandra/exporter/FactoriesSupplier.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,15 @@ private static FactoryBuilder.CollectorConstructor histogramAsSummaryCollectorCo
510510
};
511511
}
512512

513+
private static FactoryBuilder.CollectorConstructor histogramAsHistogramCollectorConstructor() {
514+
return (name, help, labels, mBean) -> {
515+
final NamedObject<SamplingCounting> samplingCountingNamedObject = CassandraMetricsUtilities.jmxHistogramAsSamplingCounting(mBean);
516+
517+
return new FunctionalMetricFamilyCollector<>(name, help, ImmutableMap.of(labels, samplingCountingNamedObject),
518+
samplingAndCountingAsHistogram(MetricValueConversionFunctions::nanosecondsToSeconds));
519+
};
520+
}
521+
513522
private static <T> FactoryBuilder.CollectorConstructor functionalCollectorConstructor(final FunctionalMetricFamilyCollector.CollectorFunction<T> function) {
514523
return (final String name, final String help, final Labels labels, final NamedObject<?> mBean) ->
515524
new FunctionalMetricFamilyCollector<>(name, help, ImmutableMap.of(labels, mBean.<T>cast()), function);
@@ -592,6 +601,8 @@ public List<Factory> get() {
592601

593602
builder.add(clientRequestMetricFactory(LatencyMetricGroupSummaryCollector::collectorForMBean, "Latency", "latency_seconds", "Request latency."));
594603
builder.add(clientRequestMetricFactory(LatencyMetricGroupSummaryCollector::collectorForMBean, "TotalLatency", "latency_seconds", "Total request duration."));
604+
605+
builder.add(clientRequestMetricFactory(histogramAsHistogramCollectorConstructor(), "Latency", "latency_hist_seconds", "Request latency."));
595606
}
596607

597608

common/src/main/java/com/zegelin/cassandra/exporter/SamplingCounting.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,6 @@ public interface SamplingCounting {
1313
long getCount();
1414

1515
Iterable<Interval> getIntervals();
16+
17+
long[] getValues();
1618
}

0 commit comments

Comments
 (0)