|
14 | 14 |
|
15 | 15 | package google.registry.mosapi; |
16 | 16 |
|
| 17 | +import static com.google.common.collect.ImmutableList.toImmutableList; |
| 18 | + |
| 19 | +import com.google.api.client.googleapis.json.GoogleJsonResponseException; |
| 20 | +import com.google.api.services.monitoring.v3.Monitoring; |
| 21 | +import com.google.api.services.monitoring.v3.model.CreateTimeSeriesRequest; |
| 22 | +import com.google.api.services.monitoring.v3.model.LabelDescriptor; |
| 23 | +import com.google.api.services.monitoring.v3.model.Metric; |
| 24 | +import com.google.api.services.monitoring.v3.model.MetricDescriptor; |
| 25 | +import com.google.api.services.monitoring.v3.model.MonitoredResource; |
| 26 | +import com.google.api.services.monitoring.v3.model.Point; |
| 27 | +import com.google.api.services.monitoring.v3.model.TimeInterval; |
| 28 | +import com.google.api.services.monitoring.v3.model.TimeSeries; |
| 29 | +import com.google.api.services.monitoring.v3.model.TypedValue; |
| 30 | +import com.google.common.annotations.VisibleForTesting; |
| 31 | +import com.google.common.base.Ascii; |
| 32 | +import com.google.common.collect.ImmutableList; |
| 33 | +import com.google.common.collect.ImmutableMap; |
| 34 | +import com.google.common.collect.Iterators; |
17 | 35 | import com.google.common.flogger.FluentLogger; |
| 36 | +import google.registry.config.RegistryConfig.Config; |
| 37 | +import google.registry.mosapi.MosApiModels.ServiceStatus; |
18 | 38 | import google.registry.mosapi.MosApiModels.TldServiceState; |
| 39 | +import google.registry.request.lock.LockHandler; |
| 40 | +import google.registry.util.Clock; |
19 | 41 | import jakarta.inject.Inject; |
| 42 | +import java.io.IOException; |
| 43 | +import java.time.Instant; |
| 44 | +import java.util.Iterator; |
20 | 45 | import java.util.List; |
| 46 | +import java.util.concurrent.atomic.AtomicBoolean; |
| 47 | +import java.util.stream.Stream; |
| 48 | +import org.joda.time.Duration; |
21 | 49 |
|
22 | 50 | /** Metrics Exporter for MoSAPI. */ |
23 | 51 | public class MosApiMetrics { |
24 | 52 |
|
25 | 53 | private static final FluentLogger logger = FluentLogger.forEnclosingClass(); |
26 | 54 |
|
| 55 | + // Google Cloud Monitoring Limit: Max 200 TimeSeries per request |
| 56 | + private static final int MAX_TIMESERIES_PER_REQUEST = 195; |
| 57 | + |
| 58 | + private static final int METRICS_ALREADY_EXIST = 409; |
| 59 | + |
| 60 | + // Magic String Constants |
| 61 | + private static final String METRIC_DOMAIN = "custom.googleapis.com/mosapi/"; |
| 62 | + private static final String PROJECT_RESOURCE_PREFIX = "projects/"; |
| 63 | + private static final String RESOURCE_TYPE_GLOBAL = "global"; |
| 64 | + private static final String LABEL_PROJECT_ID = "project_id"; |
| 65 | + private static final String LABEL_TLD = "tld"; |
| 66 | + private static final String LABEL_SERVICE_TYPE = "service_type"; |
| 67 | + |
| 68 | + // Lock Constants |
| 69 | + private static final String LOCK_NAME = "MosApiMetricCreation"; |
| 70 | + private static final Duration LOCK_LEASE_TIME = Duration.standardHours(1); |
| 71 | + // Metric Names |
| 72 | + private static final String METRIC_TLD_STATUS = "tld_status"; |
| 73 | + private static final String METRIC_SERVICE_STATUS = "service_status"; |
| 74 | + private static final String METRIC_EMERGENCY_USAGE = "emergency_usage"; |
| 75 | + private static final String GAUGE_METRIC_KIND = "GAUGE"; |
| 76 | + |
| 77 | + // Metric Display Names & Descriptions |
| 78 | + private static final String DISPLAY_NAME_TLD_STATUS = |
| 79 | + "Health of TLDs. 1 = UP, 0 = DOWN, 2= DISABLED/NOT_MONITORED"; |
| 80 | + private static final String DESC_TLD_STATUS = "Overall Health of TLDs reported from ICANN"; |
| 81 | + |
| 82 | + private static final String DISPLAY_NAME_SERVICE_STATUS = |
| 83 | + "Health of Services. 1 = UP, 0 = DOWN, 2= DISABLED/NOT_MONITORED"; |
| 84 | + private static final String DESC_SERVICE_STATUS = |
| 85 | + "Overall Health of Services reported from ICANN"; |
| 86 | + |
| 87 | + private static final String DISPLAY_NAME_EMERGENCY_USAGE = |
| 88 | + "Percentage of Emergency Threshold Consumed"; |
| 89 | + private static final String DESC_EMERGENCY_USAGE = |
| 90 | + "Downtime threshold that if reached by any of the monitored Services may cause the TLDs" |
| 91 | + + " Services emergency transition to an interim Registry Operator"; |
| 92 | + |
| 93 | + // MoSAPI Status Constants |
| 94 | + private static final String STATUS_UP_INCONCLUSIVE = "UP-INCONCLUSIVE"; |
| 95 | + private static final String STATUS_DOWN = "DOWN"; |
| 96 | + private static final String STATUS_DISABLED = "DISABLED"; |
| 97 | + |
| 98 | + private final Monitoring monitoringClient; |
| 99 | + private final String projectId; |
| 100 | + private final String projectName; |
| 101 | + private final Clock clock; |
| 102 | + private final MonitoredResource monitoredResource; |
| 103 | + private final LockHandler lockHandler; |
| 104 | + // Flag to ensure we only create descriptors once, lazily |
| 105 | + @VisibleForTesting static final AtomicBoolean isDescriptorInitialized = new AtomicBoolean(false); |
| 106 | + |
27 | 107 | @Inject |
28 | | - public MosApiMetrics() {} |
| 108 | + public MosApiMetrics( |
| 109 | + Monitoring monitoringClient, |
| 110 | + @Config("projectId") String projectId, |
| 111 | + Clock clock, |
| 112 | + LockHandler lockHandler) { |
| 113 | + this.monitoringClient = monitoringClient; |
| 114 | + this.projectId = projectId; |
| 115 | + this.clock = clock; |
| 116 | + this.projectName = PROJECT_RESOURCE_PREFIX + projectId; |
| 117 | + this.lockHandler = lockHandler; |
| 118 | + this.monitoredResource = |
| 119 | + new MonitoredResource() |
| 120 | + .setType(RESOURCE_TYPE_GLOBAL) |
| 121 | + .setLabels(ImmutableMap.of(LABEL_PROJECT_ID, projectId)); |
| 122 | + } |
| 123 | + |
| 124 | + /** Accepts a list of states and processes them in a single async batch task. */ |
| 125 | + public void recordStates(ImmutableList<TldServiceState> states) { |
| 126 | + // If this is the first time we are recording, ensure descriptors exist. |
| 127 | + ensureMetricDescriptorsWithLock(); |
| 128 | + pushBatchMetrics(states); |
| 129 | + } |
| 130 | + |
| 131 | + /** |
| 132 | + * Attempts to create metric descriptors using a distributed lock. |
| 133 | + * |
| 134 | + * <p>If the lock is acquired, this instance creates the descriptors and marks itself initialized. |
| 135 | + * If the lock is busy, it implies another instance is handling it, so we skip and proceed. |
| 136 | + */ |
| 137 | + private void ensureMetricDescriptorsWithLock() { |
| 138 | + lockHandler.executeWithLocks( |
| 139 | + () -> { |
| 140 | + if (!isDescriptorInitialized.get()) { |
| 141 | + createCustomMetricDescriptors(); |
| 142 | + isDescriptorInitialized.set(true); |
| 143 | + } |
| 144 | + return null; |
| 145 | + }, |
| 146 | + null, |
| 147 | + LOCK_LEASE_TIME, |
| 148 | + LOCK_NAME); |
| 149 | + } |
| 150 | + |
| 151 | + // Defines the custom metrics in Cloud Monitoring |
| 152 | + private void createCustomMetricDescriptors() { |
| 153 | + // 1. TLD Status Descriptor |
| 154 | + createMetricDescriptor( |
| 155 | + METRIC_TLD_STATUS, |
| 156 | + DISPLAY_NAME_TLD_STATUS, |
| 157 | + DESC_TLD_STATUS, |
| 158 | + "INT64", |
| 159 | + ImmutableList.of(LABEL_TLD)); |
| 160 | + |
| 161 | + // 2. Service Status Descriptor |
| 162 | + createMetricDescriptor( |
| 163 | + METRIC_SERVICE_STATUS, |
| 164 | + DISPLAY_NAME_SERVICE_STATUS, |
| 165 | + DESC_SERVICE_STATUS, |
| 166 | + "INT64", |
| 167 | + ImmutableList.of(LABEL_TLD, LABEL_SERVICE_TYPE)); |
| 168 | + |
| 169 | + // 3. Emergency Usage Descriptor |
| 170 | + createMetricDescriptor( |
| 171 | + METRIC_EMERGENCY_USAGE, |
| 172 | + DISPLAY_NAME_EMERGENCY_USAGE, |
| 173 | + DESC_EMERGENCY_USAGE, |
| 174 | + "DOUBLE", |
| 175 | + ImmutableList.of(LABEL_TLD, LABEL_SERVICE_TYPE)); |
| 176 | + |
| 177 | + logger.atInfo().log("Metric descriptors ensured for project %s", projectId); |
| 178 | + } |
| 179 | + |
| 180 | + private void createMetricDescriptor( |
| 181 | + String metricTypeSuffix, |
| 182 | + String displayName, |
| 183 | + String description, |
| 184 | + String valueType, |
| 185 | + ImmutableList<String> labelKeys) { |
| 186 | + |
| 187 | + ImmutableList<LabelDescriptor> labelDescriptors = |
| 188 | + labelKeys.stream() |
| 189 | + .map( |
| 190 | + key -> |
| 191 | + new LabelDescriptor() |
| 192 | + .setKey(key) |
| 193 | + .setValueType("STRING") |
| 194 | + .setDescription( |
| 195 | + key.equals(LABEL_TLD) |
| 196 | + ? "The TLD being monitored" |
| 197 | + : "The type of service")) |
| 198 | + .collect(toImmutableList()); |
| 199 | + |
| 200 | + MetricDescriptor descriptor = |
| 201 | + new MetricDescriptor() |
| 202 | + .setType(METRIC_DOMAIN + metricTypeSuffix) |
| 203 | + .setMetricKind(GAUGE_METRIC_KIND) |
| 204 | + .setValueType(valueType) |
| 205 | + .setDisplayName(displayName) |
| 206 | + .setDescription(description) |
| 207 | + .setLabels(labelDescriptors); |
| 208 | + try { |
| 209 | + monitoringClient |
| 210 | + .projects() |
| 211 | + .metricDescriptors() |
| 212 | + .create(this.projectName, descriptor) |
| 213 | + .execute(); |
| 214 | + } catch (GoogleJsonResponseException e) { |
| 215 | + if (e.getStatusCode() == METRICS_ALREADY_EXIST) { |
| 216 | + // the metric already exists. This is expected. |
| 217 | + logger.atFine().log("Metric descriptor %s already exists.", metricTypeSuffix); |
| 218 | + } else { |
| 219 | + logger.atWarning().withCause(e).log( |
| 220 | + "Failed to create metric descriptor %s. Status: %d", |
| 221 | + metricTypeSuffix, e.getStatusCode()); |
| 222 | + } |
| 223 | + } catch (Exception e) { |
| 224 | + logger.atWarning().withCause(e).log( |
| 225 | + "Unexpected error creating metric descriptor %s.", metricTypeSuffix); |
| 226 | + } |
| 227 | + } |
| 228 | + |
| 229 | + private void pushBatchMetrics(ImmutableList<TldServiceState> states) { |
| 230 | + Instant now = Instant.ofEpochMilli(clock.nowUtc().getMillis()); |
| 231 | + TimeInterval interval = new TimeInterval().setEndTime(now.toString()); |
| 232 | + Stream<TimeSeries> allTimeSeriesStream = |
| 233 | + states.stream().flatMap(state -> createMetricsForState(state, interval)); |
| 234 | + |
| 235 | + Iterator<List<TimeSeries>> batchIterator = |
| 236 | + Iterators.partition(allTimeSeriesStream.iterator(), MAX_TIMESERIES_PER_REQUEST); |
| 237 | + |
| 238 | + int successCount = 0; |
| 239 | + int failureCount = 0; |
| 240 | + |
| 241 | + // Iterate and count |
| 242 | + while (batchIterator.hasNext()) { |
| 243 | + List<TimeSeries> batch = batchIterator.next(); |
| 244 | + try { |
| 245 | + CreateTimeSeriesRequest request = new CreateTimeSeriesRequest().setTimeSeries(batch); |
| 246 | + monitoringClient.projects().timeSeries().create(this.projectName, request).execute(); |
| 247 | + |
| 248 | + successCount++; |
| 249 | + |
| 250 | + } catch (IOException e) { |
| 251 | + failureCount++; |
| 252 | + // Log individual batch failures, so we have the stack trace for debugging |
| 253 | + logger.atWarning().withCause(e).log( |
| 254 | + "Failed to push batch of %d time series.", batch.size()); |
| 255 | + } |
| 256 | + } |
| 257 | + |
| 258 | + // 4. Log the final summary |
| 259 | + if (failureCount > 0) { |
| 260 | + logger.atWarning().log( |
| 261 | + "Metric push finished with errors. Batches Succeeded: %d, Failed: %d", |
| 262 | + successCount, failureCount); |
| 263 | + } else { |
| 264 | + logger.atInfo().log("Metric push finished successfully. Batches Succeeded: %d", successCount); |
| 265 | + } |
| 266 | + } |
| 267 | + |
| 268 | + /** Generates all TimeSeries (TLD + Services) for a single state object. */ |
| 269 | + private Stream<TimeSeries> createMetricsForState(TldServiceState state, TimeInterval interval) { |
| 270 | + // 1. TLD Status |
| 271 | + Stream<TimeSeries> tldStream = Stream.of(createTldStatusTimeSeries(state, interval)); |
| 272 | + |
| 273 | + // 2. Service Metrics (if any) |
| 274 | + Stream<TimeSeries> serviceStream = |
| 275 | + state.serviceStatuses().entrySet().stream() |
| 276 | + .flatMap( |
| 277 | + entry -> |
| 278 | + createServiceMetricsStream( |
| 279 | + state.tld(), entry.getKey(), entry.getValue(), interval)); |
| 280 | + |
| 281 | + return Stream.concat(tldStream, serviceStream); |
| 282 | + } |
| 283 | + |
| 284 | + private Stream<TimeSeries> createServiceMetricsStream( |
| 285 | + String tld, String serviceType, ServiceStatus statusObj, TimeInterval interval) { |
| 286 | + ImmutableMap<String, String> labels = |
| 287 | + ImmutableMap.of(LABEL_TLD, tld, LABEL_SERVICE_TYPE, serviceType); |
| 288 | + |
| 289 | + return Stream.of( |
| 290 | + createTimeSeries( |
| 291 | + METRIC_SERVICE_STATUS, labels, parseServiceStatus(statusObj.status()), interval), |
| 292 | + createTimeSeries(METRIC_EMERGENCY_USAGE, labels, statusObj.emergencyThreshold(), interval)); |
| 293 | + } |
| 294 | + |
| 295 | + private TimeSeries createTldStatusTimeSeries(TldServiceState state, TimeInterval interval) { |
| 296 | + return createTimeSeries( |
| 297 | + METRIC_TLD_STATUS, |
| 298 | + ImmutableMap.of(LABEL_TLD, state.tld()), |
| 299 | + parseTldStatus(state.status()), |
| 300 | + interval); |
| 301 | + } |
| 302 | + |
| 303 | + private TimeSeries createTimeSeries( |
| 304 | + String suffix, ImmutableMap<String, String> labels, Number val, TimeInterval interval) { |
| 305 | + Metric metric = new Metric().setType(METRIC_DOMAIN + suffix).setLabels(labels); |
| 306 | + |
| 307 | + TypedValue tv = new TypedValue(); |
| 308 | + if (val instanceof Double) { |
| 309 | + tv.setDoubleValue((Double) val); |
| 310 | + } else { |
| 311 | + tv.setInt64Value(val.longValue()); |
| 312 | + } |
| 313 | + |
| 314 | + return new TimeSeries() |
| 315 | + .setMetric(metric) |
| 316 | + .setResource(this.monitoredResource) |
| 317 | + .setPoints(ImmutableList.of(new Point().setInterval(interval).setValue(tv))); |
| 318 | + } |
| 319 | + |
| 320 | + /** |
| 321 | + * Translates MoSAPI status to a numeric metric. |
| 322 | + * |
| 323 | + * <p>Mappings: 1 (UP) = Healthy; 0 (DOWN) = Critical failure; 2 (UP-INCONCLUSIVE) = Disabled/Not |
| 324 | + * Monitored/In Maintenance. |
| 325 | + * |
| 326 | + * <p>A status of 2 indicates the SLA monitoring system is under maintenance. The TLD is |
| 327 | + * considered "UP" by default, but individual service checks are disabled. This distinguishes |
| 328 | + * maintenance windows from actual availability or outages. |
| 329 | + * |
| 330 | + * @see <a href="https://www.icann.org/mosapi-specification.pdf">ICANN MoSAPI Spec Sec 5.1</a> |
| 331 | + */ |
| 332 | + private long parseTldStatus(String status) { |
| 333 | + return switch (Ascii.toUpperCase(status)) { |
| 334 | + case STATUS_DOWN -> 0; |
| 335 | + case STATUS_UP_INCONCLUSIVE -> 2; |
| 336 | + default -> 1; // status is up |
| 337 | + }; |
| 338 | + } |
29 | 339 |
|
30 | | - public void recordStates(List<TldServiceState> states) { |
31 | | - // b/467541269: Logic to push status to Cloud Monitoring goes here |
32 | | - logger.atInfo().log("MoSAPI record metrics logic will be implemented from here"); |
| 340 | + /** |
| 341 | + * Translates MoSAPI service status to a numeric metric. |
| 342 | + * |
| 343 | + * <p>Mappings: 1 (UP) = Healthy; 0 (DOWN) = Critical failure; 2 (DISABLED/UP-INCONCLUSIVE*) = |
| 344 | + * Disabled/Not Monitored/In Maintenance. |
| 345 | + * |
| 346 | + * @see <a href="https://www.icann.org/mosapi-specification.pdf">ICANN MoSAPI Spec Sec 5.1</a> |
| 347 | + */ |
| 348 | + private long parseServiceStatus(String status) { |
| 349 | + String serviceStatus = Ascii.toUpperCase(status); |
| 350 | + if (serviceStatus.startsWith(STATUS_UP_INCONCLUSIVE)) { |
| 351 | + return 2; |
| 352 | + } |
| 353 | + return switch (serviceStatus) { |
| 354 | + case STATUS_DOWN -> 0; |
| 355 | + case STATUS_DISABLED -> 2; |
| 356 | + default -> 1; // status is Up |
| 357 | + }; |
33 | 358 | } |
34 | 359 | } |
0 commit comments