Skip to content

Commit aa220f1

Browse files
lizk886Release Workflow
andauthored
Implement keyset key fail-fast feature with 7-day timeout (#2088)
* Implement keyset key fail-fast feature with 7-day timeout - Update Main.java to wire keysetkey verticle callback to shutdown handler - Add Consumer import and overloaded createAndDeployRotatingStoreVerticle method - Update KeyManager to accept keyAvailabilityHandler callback - Call handler on successful/failed key retrieval in getMasterKey/getRefreshKey - Add 7-day timeout parameter to OperatorShutdownHandler constructor - Maintains backward compatibility with existing constructors This enables the operator to shut down after 7 days of consecutive keyset key sync failures, allowing Kubernetes to restart and potentially recover. * Add keysetkey fail-fast logic and handleKeysetKeyRefreshResponse * Remove unnecessary KeyManager callback - RotatingStoreVerticle handles monitoring * Update uid2-shared version reference * Enable DEBUG logging for OperatorShutdownHandler * Add timer accumulation logging and keyset key unit tests * [CI Pipeline] Released Snapshot version: 5.58.63-alpha-245-SNAPSHOT * make KeysetKeysFailedShutdownHours configurable, simplify recovery logic * make KeysetKeysFailedShutdownHours configurable, simplify recovery logic * remove fallback constructor * bump version * shut down behavior changes * remove redundant logging * log failure at an interval * log failure at an interval * [CI Pipeline] Released Snapshot version: 5.58.64-alpha-246-SNAPSHOT * simplify delayed timer, since exact logging schedule is not that important * Remove DEBUG logger for OperatorShutdownHandler * beautify * simplify calculation logic for delayed logging * Update uid2-shared.version to 11.1.80 * Update version from 5.58.64-alpha-246-SNAPSHOT to 5.58.62 --------- Co-authored-by: Release Workflow <[email protected]>
1 parent bbc14de commit aa220f1

File tree

12 files changed

+130
-7
lines changed

12 files changed

+130
-7
lines changed

conf/docker-config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
"enclave_platform": null,
4141
"failure_shutdown_wait_hours": 120,
4242
"salts_expired_shutdown_hours": 12,
43+
"keysetkeys_failed_shutdown_hours": 168,
4344
"operator_type": "public",
4445
"disable_optout_token": true,
4546
"enable_remote_config": true,

conf/integ-config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"cloud_encryption_keys_metadata_path": "http://localhost:8088/cloud_encryption_keys/retrieve",
1717
"runtime_config_metadata_path": "http://localhost:8088/operator/config",
1818
"salts_expired_shutdown_hours": 12,
19+
"keysetkeys_failed_shutdown_hours": 168,
1920
"operator_type": "public",
2021
"disable_optout_token": true,
2122
"enable_remote_config": false,

conf/local-config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
"key_sharing_endpoint_provide_app_names": true,
3939
"client_side_token_generate_log_invalid_http_origins": true,
4040
"salts_expired_shutdown_hours": 12,
41+
"keysetkeys_failed_shutdown_hours": 168,
4142
"operator_type": "public",
4243
"encrypted_files": false,
4344
"disable_optout_token": true,

conf/local-e2e-docker-private-config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
"optout_delta_rotate_interval": 60,
3131
"cloud_refresh_interval": 30,
3232
"salts_expired_shutdown_hours": 12,
33+
"keysetkeys_failed_shutdown_hours": 168,
3334
"operator_type": "private",
3435
"enable_remote_config": true,
3536
"uid_instance_id_prefix": "local-private-operator"

conf/local-e2e-docker-public-config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
"optout_status_api_enabled": true,
3737
"cloud_refresh_interval": 30,
3838
"salts_expired_shutdown_hours": 12,
39+
"keysetkeys_failed_shutdown_hours": 168,
3940
"operator_type": "public",
4041
"disable_optout_token": true,
4142
"enable_remote_config": true,

conf/local-e2e-private-config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"client_side_token_generate_domain_name_check_enabled": false,
4242
"client_side_token_generate_log_invalid_http_origins": true,
4343
"salts_expired_shutdown_hours": 12,
44+
"keysetkeys_failed_shutdown_hours": 168,
4445
"operator_type": "private",
4546
"enable_remote_config": true,
4647
"uid_instance_id_prefix": "local-private-operator"

conf/local-e2e-public-config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
"key_sharing_endpoint_provide_app_names": true,
4343
"client_side_token_generate_log_invalid_http_origins": true,
4444
"salts_expired_shutdown_hours": 12,
45+
"keysetkeys_failed_shutdown_hours": 168,
4546
"operator_type": "public",
4647
"disable_optout_token": true,
4748
"enable_remote_config": true,

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
<enclave-aws.version>2.1.0</enclave-aws.version>
2323
<enclave-azure.version>2.1.13</enclave-azure.version>
2424
<enclave-gcp.version>2.1.0</enclave-gcp.version>
25-
<uid2-shared.version>11.1.69</uid2-shared.version>
25+
<uid2-shared.version>11.1.80</uid2-shared.version>
2626
<image.version>${project.version}</image.version>
2727
<maven.compiler.source>21</maven.compiler.source>
2828
<maven.compiler.target>21</maven.compiler.target>

src/main/java/com/uid2/operator/Main.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import java.time.Duration;
6262
import java.time.Instant;
6363
import java.util.*;
64+
import java.util.function.Consumer;
6465
import java.util.function.Supplier;
6566

6667
import static com.uid2.operator.Const.Config.EnableRemoteConfigProp;
@@ -114,7 +115,10 @@ public Main(Vertx vertx, JsonObject config) throws Exception {
114115
this.clientSideTokenGenerate = config.getBoolean(Const.Config.EnableClientSideTokenGenerate, false);
115116
this.validateServiceLinks = config.getBoolean(Const.Config.ValidateServiceLinks, false);
116117
this.encryptedCloudFilesEnabled = config.getBoolean(Const.Config.EncryptedFiles, false);
117-
this.shutdownHandler = new OperatorShutdownHandler(Duration.ofHours(12), Duration.ofHours(config.getInteger(Const.Config.SaltsExpiredShutdownHours, 12)), Clock.systemUTC(), new ShutdownService());
118+
this.shutdownHandler = new OperatorShutdownHandler(Duration.ofHours(12),
119+
Duration.ofHours(config.getInteger(Const.Config.SaltsExpiredShutdownHours, 12)),
120+
Duration.ofHours(config.getInteger(Const.Config.KeysetKeysFailedShutdownHours, 168)),
121+
Clock.systemUTC(), new ShutdownService());
118122
this.uidInstanceIdProvider = new UidInstanceIdProvider(config);
119123

120124
String coreAttestUrl = this.config.getString(Const.Config.CoreAttestUrlProp);
@@ -420,7 +424,8 @@ private Future<Void> createStoreVerticles() throws Exception {
420424
}
421425
fs.add(createAndDeployRotatingStoreVerticle("auth", clientKeyProvider, "auth_refresh_ms"));
422426
fs.add(createAndDeployRotatingStoreVerticle("keyset", keysetProvider, "keyset_refresh_ms"));
423-
fs.add(createAndDeployRotatingStoreVerticle("keysetkey", keysetKeyStore, "keysetkey_refresh_ms"));
427+
fs.add(createAndDeployRotatingStoreVerticle("keysetkey", keysetKeyStore, "keysetkey_refresh_ms",
428+
this.shutdownHandler::handleKeysetKeyRefreshResponse));
424429
fs.add(createAndDeployRotatingStoreVerticle("salt", saltProvider, "salt_refresh_ms"));
425430
fs.add(createAndDeployCloudSyncStoreVerticle("optout", fsOptOut, optOutCloudSync));
426431
CompositeFuture.all(fs).onComplete(ar -> {
@@ -433,9 +438,13 @@ private Future<Void> createStoreVerticles() throws Exception {
433438
}
434439

435440
private Future<String> createAndDeployRotatingStoreVerticle(String name, IMetadataVersionedStore store, String storeRefreshConfigMs) {
441+
return createAndDeployRotatingStoreVerticle(name, store, storeRefreshConfigMs, null);
442+
}
443+
444+
private Future<String> createAndDeployRotatingStoreVerticle(String name, IMetadataVersionedStore store, String storeRefreshConfigMs, Consumer<Boolean> refreshCallback) {
436445
final int intervalMs = config.getInteger(storeRefreshConfigMs, 10000);
437446

438-
RotatingStoreVerticle rotatingStoreVerticle = new RotatingStoreVerticle(name, intervalMs, store);
447+
RotatingStoreVerticle rotatingStoreVerticle = new RotatingStoreVerticle(name, intervalMs, store, refreshCallback);
439448
return vertx.deployVerticle(rotatingStoreVerticle);
440449
}
441450

src/main/java/com/uid2/operator/vertx/OperatorShutdownHandler.java

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,23 @@
1616
public class OperatorShutdownHandler {
1717
private static final Logger LOGGER = LoggerFactory.getLogger(OperatorShutdownHandler.class);
1818
private static final int SALT_FAILURE_LOG_INTERVAL_MINUTES = 10;
19+
private static final int KEYSET_KEY_FAILURE_LOG_INTERVAL_MINUTES = 10;
1920
private final Duration attestShutdownWaitTime;
2021
private final Duration saltShutdownWaitTime;
22+
private final Duration keysetKeyShutdownWaitTime;
2123
private final AtomicReference<Instant> attestFailureStartTime = new AtomicReference<>(null);
2224
private final AtomicReference<Instant> saltFailureStartTime = new AtomicReference<>(null);
25+
private final AtomicReference<Instant> keysetKeyFailureStartTime = new AtomicReference<>(null);
2326
private final AtomicReference<Instant> lastSaltFailureLogTime = new AtomicReference<>(null);
27+
private final AtomicReference<Instant> lastKeysetKeyFailureLogTime = new AtomicReference<>(null);
2428
private final Clock clock;
2529
private final ShutdownService shutdownService;
2630

27-
public OperatorShutdownHandler(Duration attestShutdownWaitTime, Duration saltShutdownWaitTime, Clock clock, ShutdownService shutdownService) {
31+
/**
 * Creates a shutdown handler with the given wait times and collaborators.
 *
 * @param attestShutdownWaitTime    wait time stored for attestation-failure handling
 * @param saltShutdownWaitTime      wait time stored for salt-failure handling
 * @param keysetKeyShutdownWaitTime how long keyset key refreshes may keep failing before
 *                                  {@code handleKeysetKeyRefreshResponse} requests shutdown
 * @param clock                     source of the current time
 * @param shutdownService           service invoked to shut the operator down
 */
public OperatorShutdownHandler(
        Duration attestShutdownWaitTime,
        Duration saltShutdownWaitTime,
        Duration keysetKeyShutdownWaitTime,
        Clock clock,
        ShutdownService shutdownService) {
    // Collaborators
    this.clock = clock;
    this.shutdownService = shutdownService;

    // Per-failure-type wait times
    this.attestShutdownWaitTime = attestShutdownWaitTime;
    this.saltShutdownWaitTime = saltShutdownWaitTime;
    this.keysetKeyShutdownWaitTime = keysetKeyShutdownWaitTime;
}
@@ -54,6 +60,37 @@ public void logSaltFailureAtInterval() {
5460
}
5561
}
5662

63+
public void handleKeysetKeyRefreshResponse(Boolean success) {
64+
if (success) {
65+
keysetKeyFailureStartTime.set(null);
66+
lastKeysetKeyFailureLogTime.set(null);
67+
LOGGER.debug("keyset keys sync successful");
68+
} else {
69+
Instant t = keysetKeyFailureStartTime.get();
70+
if (t == null) {
71+
keysetKeyFailureStartTime.set(clock.instant());
72+
lastKeysetKeyFailureLogTime.set(clock.instant());
73+
LOGGER.warn("keyset keys sync started failing. shutdown timer started");
74+
} else {
75+
Duration elapsed = Duration.between(t, clock.instant());
76+
if (elapsed.compareTo(this.keysetKeyShutdownWaitTime) > 0) {
77+
LOGGER.error("keyset keys have been failing to sync for too long. shutting down operator");
78+
this.shutdownService.Shutdown(1);
79+
} else {
80+
logKeysetKeyFailureProgressAtInterval(t, elapsed);
81+
}
82+
}
83+
}
84+
}
85+
86+
private void logKeysetKeyFailureProgressAtInterval(Instant failureStartTime, Duration elapsed) {
87+
Instant lastLogTime = lastKeysetKeyFailureLogTime.get();
88+
if (lastLogTime == null || clock.instant().isAfter(lastLogTime.plus(KEYSET_KEY_FAILURE_LOG_INTERVAL_MINUTES, ChronoUnit.MINUTES))) {
89+
LOGGER.warn("keyset keys sync still failing - elapsed time: {}d {}h {}m", elapsed.toDays(), elapsed.toHoursPart(), elapsed.toMinutesPart());
90+
lastKeysetKeyFailureLogTime.set(clock.instant());
91+
}
92+
}
93+
5794
public void handleAttestResponse(Pair<AttestationResponseCode, String> response) {
5895
if (response.left() == AttestationResponseCode.AttestationFailure) {
5996
LOGGER.error("core attestation failed with AttestationFailure, shutting down operator, core response: {}", response.right());

0 commit comments

Comments
 (0)