Skip to content

Commit 71c9407

Browse files
authored
Add MosApiMetrics exporter (#2931)
* Add MosApiMetrics exporter with status code mapping Introduces the metrics exporter for the MoSAPI system. - Implements `MosApiMetrics` to export TLD and service states to Cloud Monitoring. - Maps ICANN status codes to numeric gauges: 1 (UP), 0 (DOWN), and 2 (DISABLED/INCONCLUSIVE). - Sets `MAX_TIMESERIES_PER_REQUEST` to 195 to respect Cloud Monitoring API limits * Automate metric descriptor creation on startup in Cloud Monitoring * Refactor MoSAPI metrics for resilience and standards * Refactor and nits - Kept projectName as part constant instead of inside method signature - Added Summary logs for metrics execution - Metric Executor defaults to Single Threaded * junit test refactoring * Fix Metric kind to GAUGE for all metrics * Refactor MosApiMetrics to remove async ExecutorService * Add LockHandler for Metric Descriptor creation * Update LockHandler lease time to one hour and refactoring
1 parent a138806 commit 71c9407

File tree

8 files changed

+602
-13
lines changed

8 files changed

+602
-13
lines changed

core/src/main/java/google/registry/config/RegistryConfig.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1463,9 +1463,9 @@ public static ImmutableSet<String> provideMosapiServices(RegistryConfigSettings
14631463
}
14641464

14651465
@Provides
1466-
@Config("mosapiTldThreadCnt")
1466+
@Config("mosapiTldThreadCount")
14671467
public static int provideMosapiTldThreads(RegistryConfigSettings config) {
1468-
return config.mosapi.tldThreadCnt;
1468+
return config.mosapi.tldThreadCount;
14691469
}
14701470

14711471
private static String formatComments(String text) {

core/src/main/java/google/registry/config/RegistryConfigSettings.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,6 @@ public static class MosApi {
272272
public String entityType;
273273
public List<String> tlds;
274274
public List<String> services;
275-
public int tldThreadCnt;
275+
public int tldThreadCount;
276276
}
277277
}

core/src/main/java/google/registry/config/files/default-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -645,5 +645,5 @@ mosapi:
645645
# Provides a fixed thread pool for parallel TLD processing.
646646
# @see <a href="https://www.icann.org/mosapi-specification.pdf">
647647
# ICANN MoSAPI Specification, Section 12.3</a>
648-
tldThreadCnt: 4
648+
tldThreadCount: 4
649649

core/src/main/java/google/registry/mosapi/MosApiMetrics.java

Lines changed: 329 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,346 @@
1414

1515
package google.registry.mosapi;
1616

17+
import static com.google.common.collect.ImmutableList.toImmutableList;
18+
19+
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
20+
import com.google.api.services.monitoring.v3.Monitoring;
21+
import com.google.api.services.monitoring.v3.model.CreateTimeSeriesRequest;
22+
import com.google.api.services.monitoring.v3.model.LabelDescriptor;
23+
import com.google.api.services.monitoring.v3.model.Metric;
24+
import com.google.api.services.monitoring.v3.model.MetricDescriptor;
25+
import com.google.api.services.monitoring.v3.model.MonitoredResource;
26+
import com.google.api.services.monitoring.v3.model.Point;
27+
import com.google.api.services.monitoring.v3.model.TimeInterval;
28+
import com.google.api.services.monitoring.v3.model.TimeSeries;
29+
import com.google.api.services.monitoring.v3.model.TypedValue;
30+
import com.google.common.annotations.VisibleForTesting;
31+
import com.google.common.base.Ascii;
32+
import com.google.common.collect.ImmutableList;
33+
import com.google.common.collect.ImmutableMap;
34+
import com.google.common.collect.Iterators;
1735
import com.google.common.flogger.FluentLogger;
36+
import google.registry.config.RegistryConfig.Config;
37+
import google.registry.mosapi.MosApiModels.ServiceStatus;
1838
import google.registry.mosapi.MosApiModels.TldServiceState;
39+
import google.registry.request.lock.LockHandler;
40+
import google.registry.util.Clock;
1941
import jakarta.inject.Inject;
42+
import java.io.IOException;
43+
import java.time.Instant;
44+
import java.util.Iterator;
2045
import java.util.List;
46+
import java.util.concurrent.atomic.AtomicBoolean;
47+
import java.util.stream.Stream;
48+
import org.joda.time.Duration;
2149

2250
/** Metrics Exporter for MoSAPI. */
2351
public class MosApiMetrics {
2452

2553
private static final FluentLogger logger = FluentLogger.forEnclosingClass();
2654

55+
// Google Cloud Monitoring Limit: Max 200 TimeSeries per request
56+
private static final int MAX_TIMESERIES_PER_REQUEST = 195;
57+
58+
private static final int METRICS_ALREADY_EXIST = 409;
59+
60+
// Magic String Constants
61+
private static final String METRIC_DOMAIN = "custom.googleapis.com/mosapi/";
62+
private static final String PROJECT_RESOURCE_PREFIX = "projects/";
63+
private static final String RESOURCE_TYPE_GLOBAL = "global";
64+
private static final String LABEL_PROJECT_ID = "project_id";
65+
private static final String LABEL_TLD = "tld";
66+
private static final String LABEL_SERVICE_TYPE = "service_type";
67+
68+
// Lock Constants
69+
private static final String LOCK_NAME = "MosApiMetricCreation";
70+
private static final Duration LOCK_LEASE_TIME = Duration.standardHours(1);
71+
// Metric Names
72+
private static final String METRIC_TLD_STATUS = "tld_status";
73+
private static final String METRIC_SERVICE_STATUS = "service_status";
74+
private static final String METRIC_EMERGENCY_USAGE = "emergency_usage";
75+
private static final String GAUGE_METRIC_KIND = "GAUGE";
76+
77+
// Metric Display Names & Descriptions
78+
private static final String DISPLAY_NAME_TLD_STATUS =
79+
"Health of TLDs. 1 = UP, 0 = DOWN, 2= DISABLED/NOT_MONITORED";
80+
private static final String DESC_TLD_STATUS = "Overall Health of TLDs reported from ICANN";
81+
82+
private static final String DISPLAY_NAME_SERVICE_STATUS =
83+
"Health of Services. 1 = UP, 0 = DOWN, 2= DISABLED/NOT_MONITORED";
84+
private static final String DESC_SERVICE_STATUS =
85+
"Overall Health of Services reported from ICANN";
86+
87+
private static final String DISPLAY_NAME_EMERGENCY_USAGE =
88+
"Percentage of Emergency Threshold Consumed";
89+
private static final String DESC_EMERGENCY_USAGE =
90+
"Downtime threshold that if reached by any of the monitored Services may cause the TLDs"
91+
+ " Services emergency transition to an interim Registry Operator";
92+
93+
// MoSAPI Status Constants
94+
private static final String STATUS_UP_INCONCLUSIVE = "UP-INCONCLUSIVE";
95+
private static final String STATUS_DOWN = "DOWN";
96+
private static final String STATUS_DISABLED = "DISABLED";
97+
98+
private final Monitoring monitoringClient;
99+
private final String projectId;
100+
private final String projectName;
101+
private final Clock clock;
102+
private final MonitoredResource monitoredResource;
103+
private final LockHandler lockHandler;
104+
// Flag to ensure we only create descriptors once, lazily
105+
@VisibleForTesting static final AtomicBoolean isDescriptorInitialized = new AtomicBoolean(false);
106+
27107
@Inject
28-
public MosApiMetrics() {}
108+
public MosApiMetrics(
109+
Monitoring monitoringClient,
110+
@Config("projectId") String projectId,
111+
Clock clock,
112+
LockHandler lockHandler) {
113+
this.monitoringClient = monitoringClient;
114+
this.projectId = projectId;
115+
this.clock = clock;
116+
this.projectName = PROJECT_RESOURCE_PREFIX + projectId;
117+
this.lockHandler = lockHandler;
118+
this.monitoredResource =
119+
new MonitoredResource()
120+
.setType(RESOURCE_TYPE_GLOBAL)
121+
.setLabels(ImmutableMap.of(LABEL_PROJECT_ID, projectId));
122+
}
123+
124+
/** Accepts a list of states and processes them in a single async batch task. */
125+
public void recordStates(ImmutableList<TldServiceState> states) {
126+
// If this is the first time we are recording, ensure descriptors exist.
127+
ensureMetricDescriptorsWithLock();
128+
pushBatchMetrics(states);
129+
}
130+
131+
/**
132+
* Attempts to create metric descriptors using a distributed lock.
133+
*
134+
* <p>If the lock is acquired, this instance creates the descriptors and marks itself initialized.
135+
* If the lock is busy, it implies another instance is handling it, so we skip and proceed.
136+
*/
137+
private void ensureMetricDescriptorsWithLock() {
138+
lockHandler.executeWithLocks(
139+
() -> {
140+
if (!isDescriptorInitialized.get()) {
141+
createCustomMetricDescriptors();
142+
isDescriptorInitialized.set(true);
143+
}
144+
return null;
145+
},
146+
null,
147+
LOCK_LEASE_TIME,
148+
LOCK_NAME);
149+
}
150+
151+
// Defines the custom metrics in Cloud Monitoring
152+
private void createCustomMetricDescriptors() {
153+
// 1. TLD Status Descriptor
154+
createMetricDescriptor(
155+
METRIC_TLD_STATUS,
156+
DISPLAY_NAME_TLD_STATUS,
157+
DESC_TLD_STATUS,
158+
"INT64",
159+
ImmutableList.of(LABEL_TLD));
160+
161+
// 2. Service Status Descriptor
162+
createMetricDescriptor(
163+
METRIC_SERVICE_STATUS,
164+
DISPLAY_NAME_SERVICE_STATUS,
165+
DESC_SERVICE_STATUS,
166+
"INT64",
167+
ImmutableList.of(LABEL_TLD, LABEL_SERVICE_TYPE));
168+
169+
// 3. Emergency Usage Descriptor
170+
createMetricDescriptor(
171+
METRIC_EMERGENCY_USAGE,
172+
DISPLAY_NAME_EMERGENCY_USAGE,
173+
DESC_EMERGENCY_USAGE,
174+
"DOUBLE",
175+
ImmutableList.of(LABEL_TLD, LABEL_SERVICE_TYPE));
176+
177+
logger.atInfo().log("Metric descriptors ensured for project %s", projectId);
178+
}
179+
180+
private void createMetricDescriptor(
181+
String metricTypeSuffix,
182+
String displayName,
183+
String description,
184+
String valueType,
185+
ImmutableList<String> labelKeys) {
186+
187+
ImmutableList<LabelDescriptor> labelDescriptors =
188+
labelKeys.stream()
189+
.map(
190+
key ->
191+
new LabelDescriptor()
192+
.setKey(key)
193+
.setValueType("STRING")
194+
.setDescription(
195+
key.equals(LABEL_TLD)
196+
? "The TLD being monitored"
197+
: "The type of service"))
198+
.collect(toImmutableList());
199+
200+
MetricDescriptor descriptor =
201+
new MetricDescriptor()
202+
.setType(METRIC_DOMAIN + metricTypeSuffix)
203+
.setMetricKind(GAUGE_METRIC_KIND)
204+
.setValueType(valueType)
205+
.setDisplayName(displayName)
206+
.setDescription(description)
207+
.setLabels(labelDescriptors);
208+
try {
209+
monitoringClient
210+
.projects()
211+
.metricDescriptors()
212+
.create(this.projectName, descriptor)
213+
.execute();
214+
} catch (GoogleJsonResponseException e) {
215+
if (e.getStatusCode() == METRICS_ALREADY_EXIST) {
216+
// the metric already exists. This is expected.
217+
logger.atFine().log("Metric descriptor %s already exists.", metricTypeSuffix);
218+
} else {
219+
logger.atWarning().withCause(e).log(
220+
"Failed to create metric descriptor %s. Status: %d",
221+
metricTypeSuffix, e.getStatusCode());
222+
}
223+
} catch (Exception e) {
224+
logger.atWarning().withCause(e).log(
225+
"Unexpected error creating metric descriptor %s.", metricTypeSuffix);
226+
}
227+
}
228+
229+
private void pushBatchMetrics(ImmutableList<TldServiceState> states) {
230+
Instant now = Instant.ofEpochMilli(clock.nowUtc().getMillis());
231+
TimeInterval interval = new TimeInterval().setEndTime(now.toString());
232+
Stream<TimeSeries> allTimeSeriesStream =
233+
states.stream().flatMap(state -> createMetricsForState(state, interval));
234+
235+
Iterator<List<TimeSeries>> batchIterator =
236+
Iterators.partition(allTimeSeriesStream.iterator(), MAX_TIMESERIES_PER_REQUEST);
237+
238+
int successCount = 0;
239+
int failureCount = 0;
240+
241+
// Iterate and count
242+
while (batchIterator.hasNext()) {
243+
List<TimeSeries> batch = batchIterator.next();
244+
try {
245+
CreateTimeSeriesRequest request = new CreateTimeSeriesRequest().setTimeSeries(batch);
246+
monitoringClient.projects().timeSeries().create(this.projectName, request).execute();
247+
248+
successCount++;
249+
250+
} catch (IOException e) {
251+
failureCount++;
252+
// Log individual batch failures, so we have the stack trace for debugging
253+
logger.atWarning().withCause(e).log(
254+
"Failed to push batch of %d time series.", batch.size());
255+
}
256+
}
257+
258+
// 4. Log the final summary
259+
if (failureCount > 0) {
260+
logger.atWarning().log(
261+
"Metric push finished with errors. Batches Succeeded: %d, Failed: %d",
262+
successCount, failureCount);
263+
} else {
264+
logger.atInfo().log("Metric push finished successfully. Batches Succeeded: %d", successCount);
265+
}
266+
}
267+
268+
/** Generates all TimeSeries (TLD + Services) for a single state object. */
269+
private Stream<TimeSeries> createMetricsForState(TldServiceState state, TimeInterval interval) {
270+
// 1. TLD Status
271+
Stream<TimeSeries> tldStream = Stream.of(createTldStatusTimeSeries(state, interval));
272+
273+
// 2. Service Metrics (if any)
274+
Stream<TimeSeries> serviceStream =
275+
state.serviceStatuses().entrySet().stream()
276+
.flatMap(
277+
entry ->
278+
createServiceMetricsStream(
279+
state.tld(), entry.getKey(), entry.getValue(), interval));
280+
281+
return Stream.concat(tldStream, serviceStream);
282+
}
283+
284+
private Stream<TimeSeries> createServiceMetricsStream(
285+
String tld, String serviceType, ServiceStatus statusObj, TimeInterval interval) {
286+
ImmutableMap<String, String> labels =
287+
ImmutableMap.of(LABEL_TLD, tld, LABEL_SERVICE_TYPE, serviceType);
288+
289+
return Stream.of(
290+
createTimeSeries(
291+
METRIC_SERVICE_STATUS, labels, parseServiceStatus(statusObj.status()), interval),
292+
createTimeSeries(METRIC_EMERGENCY_USAGE, labels, statusObj.emergencyThreshold(), interval));
293+
}
294+
295+
private TimeSeries createTldStatusTimeSeries(TldServiceState state, TimeInterval interval) {
296+
return createTimeSeries(
297+
METRIC_TLD_STATUS,
298+
ImmutableMap.of(LABEL_TLD, state.tld()),
299+
parseTldStatus(state.status()),
300+
interval);
301+
}
302+
303+
private TimeSeries createTimeSeries(
304+
String suffix, ImmutableMap<String, String> labels, Number val, TimeInterval interval) {
305+
Metric metric = new Metric().setType(METRIC_DOMAIN + suffix).setLabels(labels);
306+
307+
TypedValue tv = new TypedValue();
308+
if (val instanceof Double) {
309+
tv.setDoubleValue((Double) val);
310+
} else {
311+
tv.setInt64Value(val.longValue());
312+
}
313+
314+
return new TimeSeries()
315+
.setMetric(metric)
316+
.setResource(this.monitoredResource)
317+
.setPoints(ImmutableList.of(new Point().setInterval(interval).setValue(tv)));
318+
}
319+
320+
/**
321+
* Translates MoSAPI status to a numeric metric.
322+
*
323+
* <p>Mappings: 1 (UP) = Healthy; 0 (DOWN) = Critical failure; 2 (UP-INCONCLUSIVE) = Disabled/Not
324+
* Monitored/In Maintenance.
325+
*
326+
* <p>A status of 2 indicates the SLA monitoring system is under maintenance. The TLD is
327+
* considered "UP" by default, but individual service checks are disabled. This distinguishes
328+
* maintenance windows from actual availability or outages.
329+
*
330+
* @see <a href="https://www.icann.org/mosapi-specification.pdf">ICANN MoSAPI Spec Sec 5.1</a>
331+
*/
332+
private long parseTldStatus(String status) {
333+
return switch (Ascii.toUpperCase(status)) {
334+
case STATUS_DOWN -> 0;
335+
case STATUS_UP_INCONCLUSIVE -> 2;
336+
default -> 1; // status is up
337+
};
338+
}
29339

30-
public void recordStates(List<TldServiceState> states) {
31-
// b/467541269: Logic to push status to Cloud Monitoring goes here
32-
logger.atInfo().log("MoSAPI record metrics logic will be implemented from here");
340+
/**
341+
* Translates MoSAPI service status to a numeric metric.
342+
*
343+
* <p>Mappings: 1 (UP) = Healthy; 0 (DOWN) = Critical failure; 2 (DISABLED/UP-INCONCLUSIVE*) =
344+
* Disabled/Not Monitored/In Maintenance.
345+
*
346+
* @see <a href="https://www.icann.org/mosapi-specification.pdf">ICANN MoSAPI Spec Sec 5.1</a>
347+
*/
348+
private long parseServiceStatus(String status) {
349+
String serviceStatus = Ascii.toUpperCase(status);
350+
if (serviceStatus.startsWith(STATUS_UP_INCONCLUSIVE)) {
351+
return 2;
352+
}
353+
return switch (serviceStatus) {
354+
case STATUS_DOWN -> 0;
355+
case STATUS_DISABLED -> 2;
356+
default -> 1; // status is Up
357+
};
33358
}
34359
}

0 commit comments

Comments
 (0)