Skip to content

Commit 4d22a34

Browse files
committed
Update exception handling based on review feedback
1 parent 25c10f6 commit 4d22a34

File tree

4 files changed

+143
-52
lines changed
lib/src/main/java/com/scalar/admin/kubernetes/PauseFailedException.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package com.scalar.admin.kubernetes;
2+
3+
public class PauseFailedException extends Exception {
4+
public PauseFailedException(String message) {
5+
super(message);
6+
}
7+
8+
public PauseFailedException(String message, Throwable cause) {
9+
super(message, cause);
10+
}
11+
}

lib/src/main/java/com/scalar/admin/kubernetes/Pauser.java

Lines changed: 110 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import java.io.IOException;
1111
import java.net.InetSocketAddress;
1212
import java.time.Instant;
13+
import java.util.Objects;
1314
import java.util.concurrent.TimeUnit;
1415
import java.util.stream.Collectors;
1516
import javax.annotation.Nullable;
@@ -39,6 +40,14 @@ public class Pauser {
3940

4041
private final Logger logger = LoggerFactory.getLogger(Pauser.class);
4142
private final TargetSelector targetSelector;
43+
private Instant startTime;
44+
private Instant endTime;
45+
private PauseFailedException pauseFailedException;
46+
private UnpauseFailedException unpauseFailedException;
47+
private StatusCheckFailedException statusCheckFailedException;
48+
private boolean pauseSuccessful = false;
49+
private boolean unpauseSuccessful = false;
50+
private boolean compareTargetSuccessful = false;
4251

4352
/**
4453
* @param namespace The namespace where the pods are deployed.
@@ -72,55 +81,45 @@ public Pauser(String namespace, String helmReleaseName) throws PauserException {
7281
* @return The start and end time of the pause operation.
7382
*/
7483
public PausedDuration pause(int pauseDuration, @Nullable Long maxPauseWaitTime)
75-
throws PauserException {
84+
throws PauserException, UnpauseFailedException, PauseFailedException,
85+
StatusCheckFailedException {
7686
if (pauseDuration < 1) {
7787
throw new IllegalArgumentException(
7888
"pauseDuration is required to be greater than 0 millisecond.");
7989
}
8090

81-
TargetSnapshot target;
91+
// Get pods and deployment information before pause.
92+
TargetSnapshot targetBeforePause;
8293
try {
83-
target = getTarget();
94+
targetBeforePause = getTarget();
8495
} catch (Exception e) {
8596
throw new PauserException("Failed to find the target pods to pause.", e);
8697
}
8798

88-
RequestCoordinator coordinator;
99+
// Get RequestCoordinator of Scalar Admin to pause.
100+
RequestCoordinator requestCoordinator;
89101
try {
90-
coordinator = getRequestCoordinator(target);
102+
requestCoordinator = getRequestCoordinator(targetBeforePause);
91103
} catch (Exception e) {
92-
throw new PauserException("Failed to initialize the coordinator.", e);
104+
throw new PauserException("Failed to initialize the request coordinator.", e);
93105
}
94106

95-
Instant startTime;
96-
Instant endTime;
107+
// Run pause operation.
97108
try {
98-
coordinator.pause(true, maxPauseWaitTime);
99-
100-
startTime = Instant.now();
101-
102-
Uninterruptibles.sleepUninterruptibly(pauseDuration, TimeUnit.MILLISECONDS);
103-
104-
endTime = Instant.now();
105-
106-
unpauseWithRetry(coordinator, MAX_UNPAUSE_RETRY_COUNT, target);
109+
pauseSuccessful = pauseInternal(requestCoordinator, pauseDuration, maxPauseWaitTime);
110+
} catch (Exception e) {
111+
pauseFailedException = new PauseFailedException("Pause operation failed.", e);
112+
}
107113

114+
// Run unpause operation.
115+
try {
116+
unpauseSuccessful =
117+
unpauseWithRetry(requestCoordinator, MAX_UNPAUSE_RETRY_COUNT, targetBeforePause);
108118
} catch (Exception e) {
109-
try {
110-
unpauseWithRetry(coordinator, MAX_UNPAUSE_RETRY_COUNT, target);
111-
} catch (PauserException ex) {
112-
throw new PauserException("unpauseWithRetry() method failed twice.", e);
113-
} catch (Exception ex) {
114-
throw new PauserException(
115-
"unpauseWithRetry() method failed twice due to unexpected exception.", e);
116-
}
117-
throw new PauserException(
118-
"The pause operation failed for some reason. However, the unpause operation succeeded"
119-
+ " afterward. Currently, the scalar products are running with the unpause status."
120-
+ " You should retry the pause operation to ensure proper backup.",
121-
e);
119+
unpauseFailedException = new UnpauseFailedException("Unpause operation failed.", e);
122120
}
123121

122+
// Get pods and deployment information after pause.
124123
TargetSnapshot targetAfterPause;
125124
try {
126125
targetAfterPause = getTarget();
@@ -131,41 +130,85 @@ public PausedDuration pause(int pauseDuration, @Nullable Long maxPauseWaitTime)
131130
e);
132131
}
133132

134-
if (!target.getStatus().equals(targetAfterPause.getStatus())) {
135-
throw new PauserException("The target pods were updated during paused. Please retry.");
133+
// Check if pods and deployment information are the same between before pause and after pause.
134+
try {
135+
compareTargetSuccessful = compareTargetStates(targetBeforePause, targetAfterPause);
136+
} catch (Exception e) {
137+
statusCheckFailedException = new StatusCheckFailedException("Status check failed.", e);
136138
}
137139

138-
return new PausedDuration(startTime, endTime);
140+
// If both the pause operation and status check succeeded, you can use the backup that was taken
141+
// during the pause duration.
142+
boolean backupOk = pauseSuccessful && compareTargetSuccessful;
143+
144+
// Return the final result based on each process.
145+
if (backupOk) { // Backup OK
146+
if (unpauseSuccessful) { // Backup OK and Unpause OK
147+
return new PausedDuration(startTime, endTime);
148+
} else { // Backup OK but Unpause NG
149+
String errorMessage =
150+
String.format(
151+
"Unpause operation failed. Scalar products might still be in a paused state. You"
152+
+ " must restart related pods by using the `kubectl rollout restart deployment"
153+
+ " %s` command to unpause all pods. However, the pause operations for taking"
154+
+ " backup succeeded. You can use a backup that was taken during this pause"
155+
+ " duration: Start Time = %s, End Time = %s.",
156+
Objects.requireNonNull(targetBeforePause.getDeployment().getMetadata()).getName(),
157+
startTime,
158+
endTime);
159+
// Users who directly utilize this library, bypassing our CLI, are responsible for proper
160+
// exception handling. However, this scenario represents a critical issue. Consequently,
161+
// we output the error message here regardless of whether the calling code handles the
162+
// exception.
163+
logger.error(errorMessage);
164+
throw new UnpauseFailedException(errorMessage, unpauseFailedException);
165+
}
166+
} else { // Backup NG
167+
if (unpauseSuccessful) { // Backup NG but Unpause OK
168+
if (!pauseSuccessful) { // Backup NG (Pause operation failed) but Unpause OK
169+
String errorMessage =
170+
String.format(
171+
"Pause operation failed. You cannot use the backup that was taken during this"
172+
+ " pause duration. You need to retry the pause operation from the beginning"
173+
+ " to take a backup.");
174+
throw new PauseFailedException(errorMessage, pauseFailedException);
175+
} else { // Backup NG (Status check failed) but Unpause OK
176+
String errorMessage =
177+
String.format(
178+
"Status check failed. You cannot use the backup that was taken during this pause"
179+
+ " duration. You need to retry the pause operation from the beginning to"
180+
+ " take a backup.");
181+
throw new StatusCheckFailedException(errorMessage, statusCheckFailedException);
182+
}
183+
} else { // Backup NG and Unpause NG
184+
String errorMessage =
185+
String.format(
186+
"Pause and unpause operation failed. Scalar products might still be in a paused"
187+
+ " state. You must restart related pods by using the `kubectl rollout restart"
188+
+ " deployment %s` command to unpause all pods.",
189+
Objects.requireNonNull(targetBeforePause.getDeployment().getMetadata()).getName());
190+
// Users who directly utilize this library, bypassing our CLI, are responsible for proper
191+
// exception handling. However, this scenario represents a critical issue. Consequently,
192+
// we output the error message here regardless of whether the calling code handles the
193+
// exception.
194+
logger.error(errorMessage);
195+
throw new UnpauseFailedException(errorMessage, unpauseFailedException);
196+
}
197+
}
139198
}
140199

141200
@VisibleForTesting
142-
void unpauseWithRetry(RequestCoordinator coordinator, int maxRetryCount, TargetSnapshot target)
201+
boolean unpauseWithRetry(RequestCoordinator coordinator, int maxRetryCount, TargetSnapshot target)
143202
throws PauserException {
144203
int retryCounter = 0;
145204

146205
while (true) {
147206
try {
148207
coordinator.unpause();
149-
return;
208+
return true;
150209
} catch (Exception e) {
151210
if (++retryCounter >= maxRetryCount) {
152-
// Users who directly utilize this library, bypassing our CLI, are responsible for proper
153-
// exception handling. However, this scenario represents a critical issue. Consequently,
154-
// we output the error message here regardless of whether the calling code handles the
155-
// exception.
156-
logger.error(
157-
"Failed to unpause Scalar product. They are still in paused. You must restart related"
158-
+ " pods by using the `kubectl rollout restart deployment {}`"
159-
+ " command to unpause all pods.",
160-
target.getDeployment().getMetadata().getName());
161-
// In our CLI, we catch this exception and output the message as an error on the CLI side.
162-
throw new PauserException(
163-
String.format(
164-
"Failed to unpause Scalar product. They are still in paused. You must restart"
165-
+ " related pods by using the `kubectl rollout restart deployment %s` command"
166-
+ " to unpause all pods.",
167-
target.getDeployment().getMetadata().getName()),
168-
e);
211+
throw e;
169212
}
170213
}
171214
}
@@ -183,4 +226,19 @@ RequestCoordinator getRequestCoordinator(TargetSnapshot target) {
183226
.map(p -> new InetSocketAddress(p.getStatus().getPodIP(), target.getAdminPort()))
184227
.collect(Collectors.toList()));
185228
}
229+
230+
private boolean pauseInternal(
231+
RequestCoordinator requestCoordinator, int pauseDuration, @Nullable Long maxPauseWaitTime) {
232+
233+
requestCoordinator.pause(true, maxPauseWaitTime);
234+
startTime = Instant.now();
235+
Uninterruptibles.sleepUninterruptibly(pauseDuration, TimeUnit.MILLISECONDS);
236+
endTime = Instant.now();
237+
238+
return true;
239+
}
240+
241+
private boolean compareTargetStates(TargetSnapshot before, TargetSnapshot after) {
242+
return before.getStatus().equals(after.getStatus());
243+
}
186244
}

lib/src/main/java/com/scalar/admin/kubernetes/StatusCheckFailedException.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package com.scalar.admin.kubernetes;
2+
3+
public class StatusCheckFailedException extends Exception {
4+
public StatusCheckFailedException(String message) {
5+
super(message);
6+
}
7+
8+
public StatusCheckFailedException(String message, Throwable cause) {
9+
super(message, cause);
10+
}
11+
}

lib/src/main/java/com/scalar/admin/kubernetes/UnpauseFailedException.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package com.scalar.admin.kubernetes;
2+
3+
public class UnpauseFailedException extends Exception {
4+
public UnpauseFailedException(String message) {
5+
super(message);
6+
}
7+
8+
public UnpauseFailedException(String message, Throwable cause) {
9+
super(message, cause);
10+
}
11+
}

0 commit comments

Comments
 (0)