Skip to content

Commit 6cb9a73

Browse files
Fix negative reload estimated time and year 2088 rebalance timestamp (#17833)
Two bugs are fixed: 1. Reload status: estimatedTimeRemainingInMinutes goes negative when successCount exceeds totalSegmentCount (due to segments being added/replaced after the reload job started). Fixed by clamping remainingSegments to Math.max(0, ...). Also added a derived "status" field (COMPLETED / COMPLETED_WITH_ERRORS / IN_PROGRESS) to the reload status response so consumers don't have to infer completion from raw counts. 2. Rebalance: timeToFinishInSeconds becomes a Unix timestamp (~1.7B) instead of a duration when startTimeMs is never initialized (stays at default 0). This happens for NO_OP or early-failure rebalances that skip START_TRIGGER. The UI then computes submissionTimeMs + (timestamp * 1000) ≈ year 2088. Fixed by guarding against startTimeMs <= 0 and returning 0 seconds. Made-with: Cursor
1 parent fa8ad70 commit 6cb9a73

File tree

3 files changed

+50
-8
lines changed

3 files changed

+50
-8
lines changed

pinot-controller/src/main/java/org/apache/pinot/controller/api/dto/PinotTableReloadStatusResponse.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
@InterfaceStability.Evolving
2727
public class PinotTableReloadStatusResponse {
28+
private String _status;
2829
private double _timeElapsedInMinutes;
2930
private double _estimatedTimeRemainingInMinutes;
3031
private int _totalSegmentCount;
@@ -35,6 +36,15 @@ public class PinotTableReloadStatusResponse {
3536
private PinotControllerJobMetadataDto _metadata;
3637
private List<SegmentReloadFailureResponse> _segmentReloadFailures;
3738

39+
public String getStatus() {
40+
return _status;
41+
}
42+
43+
public PinotTableReloadStatusResponse setStatus(String status) {
44+
_status = status;
45+
return this;
46+
}
47+
3848
public int getTotalSegmentCount() {
3949
return _totalSegmentCount;
4050
}

pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/ZkBasedTableRebalanceObserver.java

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,8 @@ private void updateOnStart(Map<String, Map<String, String>> currentState,
196196
@Override
197197
public void onNoop(String msg) {
198198
_controllerMetrics.setValueOfTableGauge(_tableNameWithType, ControllerGauge.TABLE_REBALANCE_IN_PROGRESS, 0);
199-
long timeToFinishInSeconds = (System.currentTimeMillis() - _tableRebalanceProgressStats.getStartTimeMs()) / 1000L;
200199
_tableRebalanceProgressStats.setCompletionStatusMsg(msg);
201-
_tableRebalanceProgressStats.setTimeToFinishInSeconds(timeToFinishInSeconds);
200+
_tableRebalanceProgressStats.setTimeToFinishInSeconds(computeElapsedTimeInSeconds());
202201
_tableRebalanceProgressStats.setStatus(RebalanceResult.Status.NO_OP);
203202
trackStatsInZk();
204203
}
@@ -208,9 +207,8 @@ public void onSuccess(String msg) {
208207
Preconditions.checkState(RebalanceResult.Status.DONE != _tableRebalanceProgressStats.getStatus(),
209208
"Table Rebalance already completed");
210209
_controllerMetrics.setValueOfTableGauge(_tableNameWithType, ControllerGauge.TABLE_REBALANCE_IN_PROGRESS, 0);
211-
long timeToFinishInSeconds = (System.currentTimeMillis() - _tableRebalanceProgressStats.getStartTimeMs()) / 1000L;
212210
_tableRebalanceProgressStats.setCompletionStatusMsg(msg);
213-
_tableRebalanceProgressStats.setTimeToFinishInSeconds(timeToFinishInSeconds);
211+
_tableRebalanceProgressStats.setTimeToFinishInSeconds(computeElapsedTimeInSeconds());
214212
_tableRebalanceProgressStats.setStatus(RebalanceResult.Status.DONE);
215213
// Zero out the in_progress convergence stats
216214
TableRebalanceProgressStats.RebalanceStateStats stats = new TableRebalanceProgressStats.RebalanceStateStats();
@@ -226,13 +224,25 @@ public void onSuccess(String msg) {
226224
@Override
227225
public void onError(String errorMsg) {
228226
_controllerMetrics.setValueOfTableGauge(_tableNameWithType, ControllerGauge.TABLE_REBALANCE_IN_PROGRESS, 0);
229-
long timeToFinishInSeconds = (System.currentTimeMillis() - _tableRebalanceProgressStats.getStartTimeMs()) / 1000L;
230-
_tableRebalanceProgressStats.setTimeToFinishInSeconds(timeToFinishInSeconds);
227+
_tableRebalanceProgressStats.setTimeToFinishInSeconds(computeElapsedTimeInSeconds());
231228
_tableRebalanceProgressStats.setStatus(RebalanceResult.Status.FAILED);
232229
_tableRebalanceProgressStats.setCompletionStatusMsg(errorMsg);
233230
trackStatsInZk();
234231
}
235232

233+
/**
234+
* Safely computes elapsed time in seconds since rebalance started.
235+
* Returns 0 if startTimeMs was never set (i.e. still at default 0), which happens
236+
* when the rebalance completes as NO_OP or fails before START_TRIGGER fires.
237+
*/
238+
private long computeElapsedTimeInSeconds() {
239+
long startTimeMs = _tableRebalanceProgressStats.getStartTimeMs();
240+
if (startTimeMs <= 0) {
241+
return 0L;
242+
}
243+
return (System.currentTimeMillis() - startTimeMs) / 1000L;
244+
}
245+
236246
@Override
237247
public void onRollback() {
238248
_tableRebalanceProgressStats.setRebalanceProgressStatsCurrentStep(

pinot-controller/src/main/java/org/apache/pinot/controller/services/PinotTableReloadStatusReporter.java

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ public PinotTableReloadStatusReporter(PinotHelixResourceManager pinotHelixResour
7070

7171
private static double computeEstimatedRemainingTimeInMinutes(PinotTableReloadStatusResponse finalResponse,
7272
double timeElapsedInMinutes) {
73-
int remainingSegments = finalResponse.getTotalSegmentCount() - finalResponse.getSuccessCount();
73+
// Clamp to 0 to handle cases where successCount > totalSegmentCount (e.g. segments added after job started)
74+
int remainingSegments = Math.max(0, finalResponse.getTotalSegmentCount() - finalResponse.getSuccessCount());
7475

7576
double estimatedRemainingTimeInMinutes = -1;
7677
if (finalResponse.getSuccessCount() > 0) {
@@ -80,6 +81,26 @@ private static double computeEstimatedRemainingTimeInMinutes(PinotTableReloadSta
8081
return estimatedRemainingTimeInMinutes;
8182
}
8283

84+
/**
85+
* Derives the overall reload job status from aggregated counts.
86+
* - COMPLETED: all segments reloaded successfully, no server call failures
87+
* - COMPLETED_WITH_ERRORS: reload finished but some segments failed
88+
* - IN_PROGRESS: reload is still running
89+
*/
90+
private static String deriveReloadStatus(PinotTableReloadStatusResponse response) {
91+
int processed = response.getSuccessCount() + (response.getFailureCount() != null
92+
? response.getFailureCount().intValue() : 0);
93+
boolean allProcessed = processed >= response.getTotalSegmentCount();
94+
95+
if (allProcessed && response.getTotalServerCallsFailed() == 0) {
96+
if (response.getFailureCount() != null && response.getFailureCount() > 0) {
97+
return "COMPLETED_WITH_ERRORS";
98+
}
99+
return "COMPLETED";
100+
}
101+
return "IN_PROGRESS";
102+
}
103+
83104
private static double computeTimeElapsedInMinutes(double submissionTime) {
84105
return ((double) System.currentTimeMillis() - submissionTime) / (1000.0 * 60.0);
85106
}
@@ -210,7 +231,8 @@ public PinotTableReloadStatusResponse getReloadJobStatus(String reloadJobId)
210231

211232
return response.setMetadata(reloadJobMetadata)
212233
.setTimeElapsedInMinutes(timeElapsedInMinutes)
213-
.setEstimatedTimeRemainingInMinutes(estimatedRemainingTimeInMinutes);
234+
.setEstimatedTimeRemainingInMinutes(estimatedRemainingTimeInMinutes)
235+
.setStatus(deriveReloadStatus(response));
214236
}
215237

216238
private PinotControllerJobMetadataDto getControllerJobMetadataFromZk(String reloadJobId) {

0 commit comments

Comments
 (0)