Skip to content

Commit 0f4d865

Browse files
authored
[client] Fix infinite retry loop in idempotent writer on lost response with OutOfOrderSequenceException (apache#2827)
1 parent bfcc249 commit 0f4d865

File tree

3 files changed

+132
-6
lines changed

3 files changed

+132
-6
lines changed

fluss-client/src/main/java/org/apache/fluss/client/write/IdempotenceManager.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,23 @@ synchronized boolean canRetry(WriteBatch batch, TableBucket tableBucket, Errors
294294
return false;
295295
}
296296

297+
/**
298+
* Returns true if the batch has already been committed on the server.
299+
*
300+
* <p>If the batch's sequence is less than or equal to {@code lastAckedBatchSequence}, it means
301+
* a higher-sequence batch has already been acknowledged. This implies the previous batch was
302+
* also successfully written on the server (otherwise the higher-sequence batch could not have
303+
* been committed).
304+
*
305+
* @param batch the batch to check
306+
* @param tableBucket the target table-bucket
307+
* @return true if the batch is already committed on the server
308+
*/
309+
synchronized boolean isAlreadyCommitted(WriteBatch batch, TableBucket tableBucket) {
310+
Optional<Integer> lastAcked = lastAckedBatchSequence(tableBucket);
311+
return lastAcked.isPresent() && batch.batchSequence() <= lastAcked.get();
312+
}
313+
297314
void maybeWaitForWriterId(Set<PhysicalTablePath> tablePaths) throws Throwable {
298315
if (isWriterIdValid()) {
299316
return;

fluss-client/src/main/java/org/apache/fluss/client/write/Sender.java

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,27 @@ private Set<PhysicalTablePath> handleWriteBatchException(
532532
ReadyWriteBatch readyWriteBatch, ApiError error) {
533533
Set<PhysicalTablePath> invalidMetadataTables = new HashSet<>();
534534
WriteBatch writeBatch = readyWriteBatch.writeBatch();
535-
if (canRetry(readyWriteBatch, error.error())) {
535+
if (error.error() == Errors.DUPLICATE_SEQUENCE_EXCEPTION) {
536+
// If we have received a duplicate batch sequence error, it means that the batch
537+
// sequence has advanced beyond the sequence of the current batch.
538+
// The only thing we can do is to return success to the user.
539+
completeBatch(readyWriteBatch);
540+
} else if (error.error() == Errors.OUT_OF_ORDER_SEQUENCE_EXCEPTION
541+
&& idempotenceManager.idempotenceEnabled()
542+
&& idempotenceManager.isAlreadyCommitted(
543+
writeBatch, readyWriteBatch.tableBucket())) {
544+
// The batch received OUT_OF_ORDER_SEQUENCE_EXCEPTION but its sequence is already
545+
// <= lastAckedBatchSequence, which means it was successfully written on the server
546+
// but the response was lost. Complete it as success to avoid infinite retry loop.
547+
LOG.warn(
548+
"Batch for table-bucket {} with sequence {} received "
549+
+ "OUT_OF_ORDER_SEQUENCE_EXCEPTION but has already been committed "
550+
+ "(lastAckedBatchSequence={}). Treating as success due to lost response.",
551+
readyWriteBatch.tableBucket(),
552+
writeBatch.batchSequence(),
553+
idempotenceManager.lastAckedBatchSequence(readyWriteBatch.tableBucket()));
554+
completeBatch(readyWriteBatch);
555+
} else if (canRetry(readyWriteBatch, error.error())) {
536556
// if batch failed because of retrievable exception, we need to retry send all those
537557
// batches.
538558
LOG.warn(
@@ -575,11 +595,6 @@ private Set<PhysicalTablePath> handleWriteBatchException(
575595
}
576596
invalidMetadataTables.add(writeBatch.physicalTablePath());
577597
}
578-
} else if (error.error() == Errors.DUPLICATE_SEQUENCE_EXCEPTION) {
579-
// If we have received a duplicate batch sequence error, it means that the batch
580-
// sequence has advanced beyond the sequence of the current batch.
581-
// The only thing we can do is to return success to the user.
582-
completeBatch(readyWriteBatch);
583598
} else {
584599
LOG.warn(
585600
"Get error write response on table bucket {}, fail. Error: {}",

fluss-client/src/test/java/org/apache/fluss/client/write/SenderTest.java

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -610,6 +610,100 @@ void testCorrectHandlingOfOutOfOrderResponsesWhenSecondSucceeds() throws Excepti
610610
assertThat(future1.get()).isNull();
611611
}
612612

613+
/**
614+
* Tests that when a batch's response is lost (e.g., due to request timeout) but the batch was
615+
* successfully written on the server, and subsequent batches with higher sequence numbers are
616+
* acknowledged, the client should treat the retried batch as already committed instead of
617+
* entering an infinite retry loop with {@link
618+
* org.apache.fluss.exception.OutOfOrderSequenceException}.
619+
*
620+
* <p>Detailed scenario:
621+
*
622+
* <ol>
623+
* <li>Send batch1(seq=0) ~ batch5(seq=4). All 5 batches are successfully written on the
624+
* server (server {@code lastBatchSeq=4}).
625+
* <li>batch2~5 (seq=1~4) responses return normally. Client {@code lastAckedBatchSequence=4}.
626+
* <li>Send batch6(seq=5) and ack successfully. Server {@code lastBatchSeq=5}.
627+
* <li>batch1(seq=0) response is lost due to {@code REQUEST_TIME_OUT}. batch1 is re-enqueued
628+
* for retry.
629+
* <li>Client retries batch1(seq=0). Since server {@code lastBatchSeq=5} and {@code 0 != 5+1},
630+
* server returns {@code OUT_OF_ORDER_SEQUENCE_EXCEPTION}.
631+
* <li>Client detects {@code batch1.seq(0) <= lastAckedBatchSequence(5)}: batch1 is already
632+
* committed. Client completes batch1 successfully without further retries.
633+
* </ol>
634+
*/
635+
@Test
636+
void testCorrectHandlingOfOutOfOrderResponsesWhenResponseLostButSubsequentBatchesSucceeded()
637+
throws Exception {
638+
IdempotenceManager idempotenceManager = createIdempotenceManager(true);
639+
Sender sender1 = setupWithIdempotenceState(idempotenceManager);
640+
sender1.runOnce();
641+
assertThat(idempotenceManager.isWriterIdValid()).isTrue();
642+
assertThat(idempotenceManager.nextSequence(tb1)).isEqualTo(0);
643+
644+
// Send batch1 (seq=0): its response will be lost later.
645+
CompletableFuture<Exception> future1 = new CompletableFuture<>();
646+
appendToAccumulator(tb1, row(1, "a"), (tb, leo, e) -> future1.complete(e));
647+
sender1.runOnce();
648+
assertThat(future1.isDone()).isFalse();
649+
assertThat(idempotenceManager.nextSequence(tb1)).isEqualTo(1);
650+
assertThat(idempotenceManager.lastAckedBatchSequence(tb1)).isNotPresent();
651+
652+
// Send batch2~5 (seq=1~4) and collect their futures.
653+
int numFollowingBatches = 4;
654+
List<CompletableFuture<Exception>> followingFutures = new ArrayList<>();
655+
for (int i = 0; i < numFollowingBatches; i++) {
656+
CompletableFuture<Exception> future = new CompletableFuture<>();
657+
followingFutures.add(future);
658+
appendToAccumulator(tb1, row(i + 2, "b"), (tb, leo, e) -> future.complete(e));
659+
sender1.runOnce();
660+
assertThat(future.isDone()).isFalse();
661+
}
662+
assertThat(idempotenceManager.nextSequence(tb1)).isEqualTo(5);
663+
664+
// batch2~5 (seq=1~4) responses return normally.
665+
for (int seq = 1; seq <= numFollowingBatches; seq++) {
666+
finishIdempotentProduceLogRequest(
667+
seq, tb1, 1, createProduceLogResponse(tb1, seq, seq + 1L));
668+
assertThat(idempotenceManager.lastAckedBatchSequence(tb1)).isEqualTo(Optional.of(seq));
669+
assertThat(followingFutures.get(seq - 1).isDone()).isTrue();
670+
assertThat(followingFutures.get(seq - 1).get()).isNull();
671+
}
672+
673+
// Send batch6 (seq=5) and ack successfully.
674+
// Now server lastBatchSeq=5. batch1 (seq=0) is still waiting response.
675+
CompletableFuture<Exception> future6 = new CompletableFuture<>();
676+
appendToAccumulator(tb1, row(6, "f"), (tb, leo, e) -> future6.complete(e));
677+
sender1.runOnce(); // drain and send batch6 (seq=5)
678+
finishIdempotentProduceLogRequest(5, tb1, 1, createProduceLogResponse(tb1, 5L, 6L));
679+
assertThat(idempotenceManager.lastAckedBatchSequence(tb1)).isEqualTo(Optional.of(5));
680+
assertThat(future6.isDone()).isTrue();
681+
assertThat(future6.get()).isNull();
682+
683+
// All 6 batches are written successfully on the server (server lastBatchSeq=5).
684+
// batch1 (seq=0) response is lost, simulated by REQUEST_TIME_OUT.
685+
finishIdempotentProduceLogRequest(
686+
0, tb1, 0, createProduceLogResponse(tb1, Errors.REQUEST_TIME_OUT));
687+
assertThat(future1.isDone()).isFalse();
688+
689+
// Now retry batch1 (seq=0). Server lastBatchSeq=5, so 0 != 5+1,
690+
// server returns OUT_OF_ORDER_SEQUENCE_EXCEPTION.
691+
sender1.runOnce(); // send retried batch1
692+
finishIdempotentProduceLogRequest(
693+
0, tb1, 0, createProduceLogResponse(tb1, Errors.OUT_OF_ORDER_SEQUENCE_EXCEPTION));
694+
695+
// The client should detect that batch1.seq(0) <= lastAckedBatchSequence(5),
696+
// meaning batch1 was already committed on the server (its response was just lost).
697+
// It should complete batch1 successfully instead of entering an infinite retry loop.
698+
assertThat(future1.isDone()).isTrue();
699+
assertThat(future1.get()).isNull();
700+
// lastAckedBatchSequence should remain at 5 (not changed by completing already-committed
701+
// batch1)
702+
assertThat(idempotenceManager.lastAckedBatchSequence(tb1)).isEqualTo(Optional.of(5));
703+
// No more inflight batches
704+
assertThat(sender1.numOfInFlightBatches(tb1)).isEqualTo(0);
705+
}
706+
613707
@Test
614708
void testCorrectHandlingOfDuplicateSequenceError() throws Exception {
615709
IdempotenceManager idempotenceManager = createIdempotenceManager(true);

0 commit comments

Comments
 (0)