Skip to content

Commit 7cd5db2

Browse files
committed
Add the ability to configure a ratio of proxy metrics to be recorded
This ratio defaults to 1.0 (i.e. all metrics will be recorded), but we will set it much lower in sandbox and production, probably to something closer to 0.01. This will reduce the volume of recorded metrics, and thus Stackdriver cost, while still retaining enough data for overall performance monitoring. Sampling is handled stochastically so that it does not require any coordination between Java threads or GKE pods/clusters, as alternative approaches would (e.g. using a counter and recording every Nth metric, or throttling to a max metrics QPS).
1 parent d4bcff0 commit 7cd5db2

File tree

8 files changed

+136
-8
lines changed

8 files changed

+136
-8
lines changed

proxy/src/main/java/google/registry/proxy/ProxyConfig.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ public static class Metrics {
113113
public int stackdriverMaxQps;
114114
public int stackdriverMaxPointsPerRequest;
115115
public int writeIntervalSeconds;
116+
public double frontendMetricsRatio;
117+
public double backendMetricsRatio;
116118
}
117119

118120
/** Configuration options that apply to quota management. */

proxy/src/main/java/google/registry/proxy/ProxyModule.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,20 @@ static Duration provideCertCachingDuration(ProxyConfig config) {
395395
return Duration.ofSeconds(config.serverCertificateCacheSeconds);
396396
}
397397

398+
@Singleton
399+
@Provides
400+
@Named("frontendMetricsRatio")
401+
static double provideFrontendMetricsRatio(ProxyConfig config) {
402+
return config.metrics.frontendMetricsRatio;
403+
}
404+
405+
@Singleton
406+
@Provides
407+
@Named("backendMetricsRatio")
408+
static double provideBackendMetricsRatio(ProxyConfig config) {
409+
return config.metrics.backendMetricsRatio;
410+
}
411+
398412
/** Root level component that exposes the port-to-protocol map. */
399413
@Singleton
400414
@Component(

proxy/src/main/java/google/registry/proxy/config/default-config.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,3 +200,15 @@ metrics:
200200

201201
# How often metrics are written.
202202
writeIntervalSeconds: 60
203+
204+
# What ratio of frontend request metrics should be stochastically recorded
205+
# (0.0 means none, 1.0 means all). This is useful for reducing metrics volume,
206+
# and thus cost, while still recording some information for performance
207+
# monitoring purposes.
208+
frontendMetricsRatio: 1.0
209+
210+
# What ratio of backend request metrics should be stochastically recorded
211+
# (0.0 means none, 1.0 means all). This is useful for reducing metrics volume,
212+
# and thus cost, while still recording some information for performance
213+
# monitoring purposes.
214+
backendMetricsRatio: 1.0

proxy/src/main/java/google/registry/proxy/metric/BackendMetrics.java

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
package google.registry.proxy.metric;
1616

17+
import com.google.common.annotations.VisibleForTesting;
1718
import com.google.common.collect.ImmutableSet;
1819
import com.google.monitoring.metrics.EventMetric;
1920
import com.google.monitoring.metrics.IncrementableMetric;
@@ -22,7 +23,9 @@
2223
import google.registry.util.NonFinalForTesting;
2324
import io.netty.handler.codec.http.FullHttpResponse;
2425
import jakarta.inject.Inject;
26+
import jakarta.inject.Named;
2527
import jakarta.inject.Singleton;
28+
import java.util.Random;
2629
import org.joda.time.Duration;
2730

2831
/** Backend metrics instrumentation. */
@@ -75,8 +78,18 @@ public class BackendMetrics extends BaseMetrics {
7578
LABELS,
7679
DEFAULT_LATENCY_FITTER);
7780

81+
@NonFinalForTesting
82+
@VisibleForTesting
83+
Random random = new Random();
84+
85+
@NonFinalForTesting
86+
@VisibleForTesting
87+
double backendMetricsRatio;
88+
7889
@Inject
79-
BackendMetrics() {}
90+
BackendMetrics(@Named("backendMetricsRatio") double backendMetricsRatio) {
91+
this.backendMetricsRatio = backendMetricsRatio;
92+
}
8093

8194
@Override
8295
void resetMetrics() {
@@ -89,13 +102,21 @@ void resetMetrics() {
89102

90103
@NonFinalForTesting
91104
public void requestSent(String protocol, String certHash, int bytes) {
105+
// Short-circuit metrics recording randomly according to the configured ratio.
106+
if (random.nextDouble() > backendMetricsRatio) {
107+
return;
108+
}
92109
requestsCounter.increment(protocol, certHash);
93110
requestBytes.record(bytes, protocol, certHash);
94111
}
95112

96113
@NonFinalForTesting
97114
public void responseReceived(
98115
String protocol, String certHash, FullHttpResponse response, Duration latency) {
116+
// Short-circuit metrics recording randomly according to the configured ratio.
117+
if (random.nextDouble() > backendMetricsRatio) {
118+
return;
119+
}
99120
latencyMs.record(latency.getMillis(), protocol, certHash);
100121
responseBytes.record(response.content().readableBytes(), protocol, certHash);
101122
responsesCounter.increment(protocol, certHash, response.status().toString());

proxy/src/main/java/google/registry/proxy/metric/FrontendMetrics.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
package google.registry.proxy.metric;
1616

17+
import com.google.common.annotations.VisibleForTesting;
1718
import com.google.common.collect.ImmutableList;
1819
import com.google.common.collect.ImmutableMap;
1920
import com.google.monitoring.metrics.EventMetric;
@@ -26,8 +27,10 @@
2627
import io.netty.channel.group.DefaultChannelGroup;
2728
import io.netty.util.concurrent.GlobalEventExecutor;
2829
import jakarta.inject.Inject;
30+
import jakarta.inject.Named;
2931
import jakarta.inject.Singleton;
3032
import java.util.Map;
33+
import java.util.Random;
3134
import java.util.concurrent.ConcurrentHashMap;
3235
import java.util.concurrent.ConcurrentMap;
3336
import org.joda.time.Duration;
@@ -78,8 +81,16 @@ public class FrontendMetrics extends BaseMetrics {
7881
LABELS,
7982
DEFAULT_LATENCY_FITTER);
8083

84+
@NonFinalForTesting
85+
@VisibleForTesting
86+
Random random = new Random();
87+
88+
double frontendMetricsRatio;
89+
8190
@Inject
82-
public FrontendMetrics() {}
91+
FrontendMetrics(@Named("frontendMetricsRatio") double frontendMetricsRatio) {
92+
this.frontendMetricsRatio = frontendMetricsRatio;
93+
}
8394

8495
@Override
8596
void resetMetrics() {
@@ -109,6 +120,10 @@ public void registerQuotaRejection(String protocol, String certHash) {
109120

110121
@NonFinalForTesting
111122
public void responseSent(String protocol, String certHash, Duration latency) {
123+
// Short-circuit metrics recording randomly according to the configured ratio.
124+
if (random.nextDouble() > frontendMetricsRatio) {
125+
return;
126+
}
112127
latencyMs.record(latency.getMillis(), protocol, certHash);
113128
}
114129
}

proxy/src/test/java/google/registry/proxy/ProtocolModuleTest.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,20 @@ static Duration provideCertCachingDuration() {
312312
return Duration.ofHours(1);
313313
}
314314

315+
@Singleton
316+
@Provides
317+
@Named("frontendMetricsRatio")
318+
static double provideFrontendMetricsRatio() {
319+
return 1.0;
320+
}
321+
322+
@Singleton
323+
@Provides
324+
@Named("backendMetricsRatio")
325+
static double providebackendMetricsRatio() {
326+
return 1.0;
327+
}
328+
315329
// This method is only here to satisfy Dagger binding, but is never used. In test environment,
316330
// it is the self-signed certificate and its key that ends up being used.
317331
@Singleton

proxy/src/test/java/google/registry/proxy/metric/BackendMetricsTest.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,14 @@
1818
import static com.google.monitoring.metrics.contrib.LongMetricSubject.assertThat;
1919
import static google.registry.proxy.TestUtils.makeHttpPostRequest;
2020
import static google.registry.proxy.TestUtils.makeHttpResponse;
21+
import static org.mockito.Mockito.mock;
22+
import static org.mockito.Mockito.when;
2123

2224
import com.google.common.collect.ImmutableSet;
2325
import io.netty.handler.codec.http.FullHttpRequest;
2426
import io.netty.handler.codec.http.FullHttpResponse;
2527
import io.netty.handler.codec.http.HttpResponseStatus;
28+
import java.util.Random;
2629
import org.joda.time.Duration;
2730
import org.junit.jupiter.api.BeforeEach;
2831
import org.junit.jupiter.api.Test;
@@ -34,7 +37,7 @@ class BackendMetricsTest {
3437
private final String certHash = "blah12345";
3538
private final String protocol = "frontend protocol";
3639

37-
private final BackendMetrics metrics = new BackendMetrics();
40+
private final BackendMetrics metrics = new BackendMetrics(1.0);
3841

3942
@BeforeEach
4043
void beforeEach() {
@@ -107,15 +110,21 @@ void testSuccess_oneResponse() {
107110

108111
@Test
109112
void testSuccess_multipleResponses() {
113+
metrics.backendMetricsRatio = 0.2;
114+
metrics.random = mock(Random.class);
115+
// The third response won't be logged.
116+
when(metrics.random.nextDouble()).thenReturn(.1, .04, .5, .15);
110117
String content1 = "some response";
111118
String content2 = "other response";
112119
String content3 = "a very bad response";
113120
FullHttpResponse response1 = makeHttpResponse(content1, HttpResponseStatus.OK);
114121
FullHttpResponse response2 = makeHttpResponse(content2, HttpResponseStatus.OK);
115-
FullHttpResponse response3 = makeHttpResponse(content3, HttpResponseStatus.BAD_REQUEST);
122+
FullHttpResponse response3 = makeHttpResponse(content2, HttpResponseStatus.OK);
123+
FullHttpResponse response4 = makeHttpResponse(content3, HttpResponseStatus.BAD_REQUEST);
116124
metrics.responseReceived(protocol, certHash, response1, Duration.millis(5));
117125
metrics.responseReceived(protocol, certHash, response2, Duration.millis(8));
118-
metrics.responseReceived(protocol, certHash, response3, Duration.millis(2));
126+
metrics.responseReceived(protocol, certHash, response3, Duration.millis(15));
127+
metrics.responseReceived(protocol, certHash, response4, Duration.millis(2));
119128

120129
assertThat(BackendMetrics.requestsCounter).hasNoOtherValues();
121130
assertThat(BackendMetrics.requestBytes).hasNoOtherValues();

proxy/src/test/java/google/registry/proxy/metric/FrontendMetricsTest.java

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,17 @@
1515
package google.registry.proxy.metric;
1616

1717
import static com.google.common.truth.Truth.assertThat;
18+
import static com.google.monitoring.metrics.contrib.DistributionMetricSubject.assertThat;
1819
import static com.google.monitoring.metrics.contrib.LongMetricSubject.assertThat;
20+
import static org.mockito.Mockito.mock;
21+
import static org.mockito.Mockito.when;
1922

23+
import com.google.common.collect.ImmutableSet;
2024
import io.netty.channel.ChannelFuture;
2125
import io.netty.channel.DefaultChannelId;
2226
import io.netty.channel.embedded.EmbeddedChannel;
27+
import java.util.Random;
28+
import org.joda.time.Duration;
2329
import org.junit.jupiter.api.BeforeEach;
2430
import org.junit.jupiter.api.Test;
2531

@@ -28,7 +34,7 @@ class FrontendMetricsTest {
2834

2935
private static final String PROTOCOL = "some protocol";
3036
private static final String CERT_HASH = "abc_blah_1134zdf";
31-
private final FrontendMetrics metrics = new FrontendMetrics();
37+
private final FrontendMetrics metrics = new FrontendMetrics(1.0);
3238

3339
@BeforeEach
3440
void beforeEach() {
@@ -60,8 +66,13 @@ void testSuccess_oneConnection() {
6066

6167
@Test
6268
void testSuccess_twoConnections_sameClient() {
69+
metrics.frontendMetricsRatio = 0.2;
70+
metrics.random = mock(Random.class);
71+
// The third response won't be logged.
72+
when(metrics.random.nextDouble()).thenReturn(.1, .04, .5);
6373
EmbeddedChannel channel1 = new EmbeddedChannel();
6474
EmbeddedChannel channel2 = new EmbeddedChannel(DefaultChannelId.newInstance());
75+
EmbeddedChannel channel3 = new EmbeddedChannel();
6576

6677
metrics.registerActiveConnection(PROTOCOL, CERT_HASH, channel1);
6778
assertThat(channel1.isActive()).isTrue();
@@ -85,6 +96,27 @@ void testSuccess_twoConnections_sameClient() {
8596
.and()
8697
.hasNoOtherValues();
8798

99+
metrics.responseSent(PROTOCOL, CERT_HASH, Duration.millis(10));
100+
metrics.responseSent(PROTOCOL, CERT_HASH, Duration.millis(8));
101+
metrics.responseSent(PROTOCOL, CERT_HASH, Duration.millis(13));
102+
103+
metrics.registerActiveConnection(PROTOCOL, CERT_HASH, channel3);
104+
assertThat(channel3.isActive()).isTrue();
105+
assertThat(FrontendMetrics.activeConnectionsGauge)
106+
.hasValueForLabels(2, PROTOCOL, CERT_HASH)
107+
.and()
108+
.hasNoOtherValues();
109+
// All connection counts are recorded as metrics, but ...
110+
assertThat(FrontendMetrics.totalConnectionsCounter)
111+
.hasValueForLabels(3, PROTOCOL, CERT_HASH)
112+
.and()
113+
.hasNoOtherValues();
114+
// Latency stats are subject to the metrics ratio.
115+
assertThat(FrontendMetrics.latencyMs).hasDataSetForLabels(ImmutableSet.of(10, 8), PROTOCOL,
116+
CERT_HASH)
117+
.and()
118+
.hasNoOtherValues();
119+
88120
@SuppressWarnings("unused")
89121
ChannelFuture unusedFuture1 = channel1.close();
90122
assertThat(channel1.isActive()).isFalse();
@@ -93,7 +125,7 @@ void testSuccess_twoConnections_sameClient() {
93125
.and()
94126
.hasNoOtherValues();
95127
assertThat(FrontendMetrics.totalConnectionsCounter)
96-
.hasValueForLabels(2, PROTOCOL, CERT_HASH)
128+
.hasValueForLabels(3, PROTOCOL, CERT_HASH)
97129
.and()
98130
.hasNoOtherValues();
99131

@@ -102,7 +134,16 @@ void testSuccess_twoConnections_sameClient() {
102134
assertThat(channel2.isActive()).isFalse();
103135
assertThat(FrontendMetrics.activeConnectionsGauge).hasNoOtherValues();
104136
assertThat(FrontendMetrics.totalConnectionsCounter)
105-
.hasValueForLabels(2, PROTOCOL, CERT_HASH)
137+
.hasValueForLabels(3, PROTOCOL, CERT_HASH)
138+
.and()
139+
.hasNoOtherValues();
140+
141+
@SuppressWarnings("unused")
142+
ChannelFuture unusedFuture3 = channel3.close();
143+
assertThat(channel3.isActive()).isFalse();
144+
assertThat(FrontendMetrics.activeConnectionsGauge).hasNoOtherValues();
145+
assertThat(FrontendMetrics.totalConnectionsCounter)
146+
.hasValueForLabels(3, PROTOCOL, CERT_HASH)
106147
.and()
107148
.hasNoOtherValues();
108149
}

0 commit comments

Comments
 (0)