Skip to content

Commit b59668d

Browse files
authored
Wzh UI d2 5964 refresh 12hrs shutdown handler (#2104)
If any rotating store fails to refresh (or has not refreshed) for 12 hours, the operator will shut itself down.
1 parent cd6cecd commit b59668d

File tree

11 files changed

+201
-91
lines changed

11 files changed

+201
-91
lines changed

conf/docker-config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
"enclave_platform": null,
4141
"failure_shutdown_wait_hours": 120,
4242
"salts_expired_shutdown_hours": 12,
43-
"keysetkeys_failed_shutdown_hours": 168,
43+
"store_refresh_stale_shutdown_hours": 12,
4444
"operator_type": "public",
4545
"disable_optout_token": true,
4646
"enable_remote_config": true,

conf/integ-config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"cloud_encryption_keys_metadata_path": "http://localhost:8088/cloud_encryption_keys/retrieve",
1717
"runtime_config_metadata_path": "http://localhost:8088/operator/config",
1818
"salts_expired_shutdown_hours": 12,
19-
"keysetkeys_failed_shutdown_hours": 168,
19+
"store_refresh_stale_shutdown_hours": 12,
2020
"operator_type": "public",
2121
"disable_optout_token": true,
2222
"enable_remote_config": false,

conf/local-config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
"key_sharing_endpoint_provide_app_names": true,
3939
"client_side_token_generate_log_invalid_http_origins": true,
4040
"salts_expired_shutdown_hours": 12,
41-
"keysetkeys_failed_shutdown_hours": 168,
41+
"store_refresh_stale_shutdown_hours": 12,
4242
"operator_type": "public",
4343
"encrypted_files": false,
4444
"disable_optout_token": true,

conf/local-e2e-docker-private-config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
"optout_delta_rotate_interval": 60,
3131
"cloud_refresh_interval": 30,
3232
"salts_expired_shutdown_hours": 12,
33-
"keysetkeys_failed_shutdown_hours": 168,
33+
"store_refresh_stale_shutdown_hours": 12,
3434
"operator_type": "private",
3535
"enable_remote_config": true,
3636
"uid_instance_id_prefix": "local-private-operator"

conf/local-e2e-docker-public-config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
"optout_status_api_enabled": true,
3737
"cloud_refresh_interval": 30,
3838
"salts_expired_shutdown_hours": 12,
39-
"keysetkeys_failed_shutdown_hours": 168,
39+
"store_refresh_stale_shutdown_hours": 12,
4040
"operator_type": "public",
4141
"disable_optout_token": true,
4242
"enable_remote_config": true,

conf/local-e2e-private-config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
"client_side_token_generate_domain_name_check_enabled": false,
4242
"client_side_token_generate_log_invalid_http_origins": true,
4343
"salts_expired_shutdown_hours": 12,
44-
"keysetkeys_failed_shutdown_hours": 168,
44+
"store_refresh_stale_shutdown_hours": 12,
4545
"operator_type": "private",
4646
"enable_remote_config": true,
4747
"uid_instance_id_prefix": "local-private-operator"

conf/local-e2e-public-config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
"key_sharing_endpoint_provide_app_names": true,
4343
"client_side_token_generate_log_invalid_http_origins": true,
4444
"salts_expired_shutdown_hours": 12,
45-
"keysetkeys_failed_shutdown_hours": 168,
45+
"store_refresh_stale_shutdown_hours": 12,
4646
"operator_type": "public",
4747
"disable_optout_token": true,
4848
"enable_remote_config": true,

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
<enclave-aws.version>2.1.0</enclave-aws.version>
2323
<enclave-azure.version>2.1.13</enclave-azure.version>
2424
<enclave-gcp.version>2.1.0</enclave-gcp.version>
25-
<uid2-shared.version>11.1.80</uid2-shared.version>
25+
<uid2-shared.version>11.1.91</uid2-shared.version>
2626
<image.version>${project.version}</image.version>
2727
<maven.compiler.source>21</maven.compiler.source>
2828
<maven.compiler.target>21</maven.compiler.target>

src/main/java/com/uid2/operator/Main.java

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ public Main(Vertx vertx, JsonObject config) throws Exception {
117117
this.encryptedCloudFilesEnabled = config.getBoolean(Const.Config.EncryptedFiles, false);
118118
this.shutdownHandler = new OperatorShutdownHandler(Duration.ofHours(12),
119119
Duration.ofHours(config.getInteger(Const.Config.SaltsExpiredShutdownHours, 12)),
120-
Duration.ofHours(config.getInteger(Const.Config.KeysetKeysFailedShutdownHours, 168)),
120+
Duration.ofHours(config.getInteger(Const.Config.StoreRefreshStaleShutdownHours, 12)),
121121
Clock.systemUTC(), new ShutdownService());
122122
this.uidInstanceIdProvider = new UidInstanceIdProvider(config);
123123

@@ -423,26 +423,26 @@ private Future<Void> createStoreVerticles() throws Exception {
423423
fs.add(createAndDeployRotatingStoreVerticle("runtime_config", (RuntimeConfigStore) configStore, Const.Config.ConfigScanPeriodMsProp));
424424
}
425425
fs.add(createAndDeployRotatingStoreVerticle("auth", clientKeyProvider, "auth_refresh_ms"));
426-
fs.add(createAndDeployRotatingStoreVerticle("keyset", keysetProvider, "keyset_refresh_ms"));
427-
fs.add(createAndDeployRotatingStoreVerticle("keysetkey", keysetKeyStore, "keysetkey_refresh_ms",
428-
this.shutdownHandler::handleKeysetKeyRefreshResponse));
429-
fs.add(createAndDeployRotatingStoreVerticle("salt", saltProvider, "salt_refresh_ms"));
426+
fs.add(createAndDeployRotatingStoreVerticle("keyset", keysetProvider, "keyset_refresh_ms"));
427+
fs.add(createAndDeployRotatingStoreVerticle("keysetkey", keysetKeyStore, "keysetkey_refresh_ms"));
428+
fs.add(createAndDeployRotatingStoreVerticle("salt", saltProvider, "salt_refresh_ms"));
430429
fs.add(createAndDeployCloudSyncStoreVerticle("optout", fsOptOut, optOutCloudSync));
431430
CompositeFuture.all(fs).onComplete(ar -> {
432431
if (ar.failed()) promise.fail(new Exception(ar.cause()));
433-
else promise.complete();
432+
else {
433+
promise.complete();
434+
this.shutdownHandler.startPeriodicStaleCheck(this.vertx);
435+
}
434436
});
435437

436438

437439
return promise.future();
438440
}
439441

440442
private Future<String> createAndDeployRotatingStoreVerticle(String name, IMetadataVersionedStore store, String storeRefreshConfigMs) {
441-
return createAndDeployRotatingStoreVerticle(name, store, storeRefreshConfigMs, null);
442-
}
443-
444-
private Future<String> createAndDeployRotatingStoreVerticle(String name, IMetadataVersionedStore store, String storeRefreshConfigMs, Consumer<Boolean> refreshCallback) {
445-
final int intervalMs = config.getInteger(storeRefreshConfigMs, 10000);
443+
final long intervalMs = config.getInteger(storeRefreshConfigMs, 10000);
444+
445+
Runnable refreshCallback = () -> this.shutdownHandler.handleStoreRefresh(name);
446446

447447
RotatingStoreVerticle rotatingStoreVerticle = new RotatingStoreVerticle(name, intervalMs, store, refreshCallback);
448448
return vertx.deployVerticle(rotatingStoreVerticle);

src/main/java/com/uid2/operator/vertx/OperatorShutdownHandler.java

Lines changed: 55 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import com.uid2.operator.service.ShutdownService;
44
import com.uid2.shared.attest.AttestationResponseCode;
5-
import lombok.extern.java.Log;
5+
import io.vertx.core.Vertx;
66
import org.slf4j.Logger;
77
import org.slf4j.LoggerFactory;
88
import software.amazon.awssdk.utils.Pair;
@@ -11,28 +11,30 @@
1111
import java.time.Duration;
1212
import java.time.Instant;
1313
import java.time.temporal.ChronoUnit;
14+
import java.util.Map;
15+
import java.util.concurrent.ConcurrentHashMap;
1416
import java.util.concurrent.atomic.AtomicReference;
1517

1618
public class OperatorShutdownHandler {
1719
private static final Logger LOGGER = LoggerFactory.getLogger(OperatorShutdownHandler.class);
1820
private static final int SALT_FAILURE_LOG_INTERVAL_MINUTES = 10;
19-
private static final int KEYSET_KEY_FAILURE_LOG_INTERVAL_MINUTES = 10;
21+
private static final int STORE_REFRESH_STALENESS_CHECK_INTERVAL_MINUTES = 60;
2022
private final Duration attestShutdownWaitTime;
2123
private final Duration saltShutdownWaitTime;
22-
private final Duration keysetKeyShutdownWaitTime;
24+
private final Duration storeRefreshStaleTimeout;
2325
private final AtomicReference<Instant> attestFailureStartTime = new AtomicReference<>(null);
2426
private final AtomicReference<Instant> saltFailureStartTime = new AtomicReference<>(null);
25-
private final AtomicReference<Instant> keysetKeyFailureStartTime = new AtomicReference<>(null);
2627
private final AtomicReference<Instant> lastSaltFailureLogTime = new AtomicReference<>(null);
27-
private final AtomicReference<Instant> lastKeysetKeyFailureLogTime = new AtomicReference<>(null);
28+
private final Map<String, AtomicReference<Instant>> lastSuccessfulRefreshTimes = new ConcurrentHashMap<>();
2829
private final Clock clock;
2930
private final ShutdownService shutdownService;
31+
private boolean isStalenessCheckScheduled = false;
3032

3133
public OperatorShutdownHandler(Duration attestShutdownWaitTime, Duration saltShutdownWaitTime,
32-
Duration keysetKeyShutdownWaitTime, Clock clock, ShutdownService shutdownService) {
34+
Duration storeRefreshStaleTimeout, Clock clock, ShutdownService shutdownService) {
3335
this.attestShutdownWaitTime = attestShutdownWaitTime;
3436
this.saltShutdownWaitTime = saltShutdownWaitTime;
35-
this.keysetKeyShutdownWaitTime = keysetKeyShutdownWaitTime;
37+
this.storeRefreshStaleTimeout = storeRefreshStaleTimeout;
3638
this.clock = clock;
3739
this.shutdownService = shutdownService;
3840
}
@@ -60,37 +62,6 @@ public void logSaltFailureAtInterval() {
6062
}
6163
}
6264

63-
public void handleKeysetKeyRefreshResponse(Boolean success) {
64-
if (success) {
65-
keysetKeyFailureStartTime.set(null);
66-
lastKeysetKeyFailureLogTime.set(null);
67-
LOGGER.debug("keyset keys sync successful");
68-
} else {
69-
Instant t = keysetKeyFailureStartTime.get();
70-
if (t == null) {
71-
keysetKeyFailureStartTime.set(clock.instant());
72-
lastKeysetKeyFailureLogTime.set(clock.instant());
73-
LOGGER.warn("keyset keys sync started failing. shutdown timer started");
74-
} else {
75-
Duration elapsed = Duration.between(t, clock.instant());
76-
if (elapsed.compareTo(this.keysetKeyShutdownWaitTime) > 0) {
77-
LOGGER.error("keyset keys have been failing to sync for too long. shutting down operator");
78-
this.shutdownService.Shutdown(1);
79-
} else {
80-
logKeysetKeyFailureProgressAtInterval(t, elapsed);
81-
}
82-
}
83-
}
84-
}
85-
86-
private void logKeysetKeyFailureProgressAtInterval(Instant failureStartTime, Duration elapsed) {
87-
Instant lastLogTime = lastKeysetKeyFailureLogTime.get();
88-
if (lastLogTime == null || clock.instant().isAfter(lastLogTime.plus(KEYSET_KEY_FAILURE_LOG_INTERVAL_MINUTES, ChronoUnit.MINUTES))) {
89-
LOGGER.warn("keyset keys sync still failing - elapsed time: {}d {}h {}m", elapsed.toDays(), elapsed.toHoursPart(), elapsed.toMinutesPart());
90-
lastKeysetKeyFailureLogTime.set(clock.instant());
91-
}
92-
}
93-
9465
public void handleAttestResponse(Pair<AttestationResponseCode, String> response) {
9566
if (response.left() == AttestationResponseCode.AttestationFailure) {
9667
LOGGER.error("core attestation failed with AttestationFailure, shutting down operator, core response: {}", response.right());
@@ -108,4 +79,50 @@ public void handleAttestResponse(Pair<AttestationResponseCode, String> response)
10879
}
10980
}
11081
}
82+
83+
public void handleStoreRefresh(String storeName) {
84+
lastSuccessfulRefreshTimes.computeIfAbsent(storeName, k -> new AtomicReference<>())
85+
.set(clock.instant());
86+
}
87+
88+
public void checkStoreRefreshStaleness() {
89+
Instant now = clock.instant();
90+
for (Map.Entry<String, AtomicReference<Instant>> entry : lastSuccessfulRefreshTimes.entrySet()) {
91+
String storeName = entry.getKey();
92+
Instant lastSuccess = entry.getValue().get();
93+
94+
if (lastSuccess == null) {
95+
// Store hasn't had a successful refresh yet
96+
// This should rarely happen since startup success also records timestamp, but keep as defensive guard for edge cases
97+
LOGGER.warn("Store '{}' has no recorded successful refresh - skipping staleness check", storeName);
98+
continue;
99+
}
100+
101+
Duration timeSinceLastRefresh = Duration.between(lastSuccess, now);
102+
LOGGER.debug("Store '{}' last successful refresh {} ago", storeName, timeSinceLastRefresh);
103+
if (timeSinceLastRefresh.compareTo(storeRefreshStaleTimeout) > 0) {
104+
LOGGER.error("Store '{}' has not refreshed successfully for {} hours ({}). Shutting down operator",
105+
storeName, timeSinceLastRefresh.toHours(), timeSinceLastRefresh);
106+
this.shutdownService.Shutdown(1);
107+
return; // Exit after triggering shutdown for first stale store
108+
}
109+
}
110+
}
111+
112+
public void startPeriodicStaleCheck(Vertx vertx) {
113+
if (isStalenessCheckScheduled) {
114+
LOGGER.warn("Periodic store staleness check already started");
115+
return;
116+
}
117+
118+
long intervalMs = STORE_REFRESH_STALENESS_CHECK_INTERVAL_MINUTES * 60 * 1000L;
119+
vertx.setPeriodic(intervalMs, id -> {
120+
LOGGER.debug("Running periodic store staleness check");
121+
checkStoreRefreshStaleness();
122+
});
123+
isStalenessCheckScheduled = true;
124+
LOGGER.info("Started periodic store staleness check (interval: {} minutes, timeout: {} hours)",
125+
STORE_REFRESH_STALENESS_CHECK_INTERVAL_MINUTES,
126+
storeRefreshStaleTimeout.toHours());
127+
}
111128
}

0 commit comments

Comments
 (0)