Commit 1df2ac5

KAFKA-19012 Fix rare producer message corruption, don't reuse buffers on the client in certain error cases (#21065)
Client versions 2.8.0 and later are affected by a [change](30bc21c) that exposes a latent bug in how BufferPool is used. BufferPool is a client-side class that allocates memory as ByteBuffers; for performance it reuses them, with callers doing manual memory management by calling free when they are done with the memory. The bug is that a pooled ByteBuffer can be freed while it is still in use by the network sending thread; this early freeing can happen when batches expire or brokers disconnect from clients.

The bug has existed for more than a decade (since Kafka 0.x, it seems) but never manifested, because prior to 2.8.0 the pooled ByteBuffer (which contained the record data, i.e. your publishes) was copied into a freshly allocated ByteBuffer before any potential reuse, and that fresh ByteBuffer was what got written over the network to the broker. With a change included in 2.8.0, the pooled ByteBuffer remains as-is inside a MemoryRecords instance, and that pooled ByteBuffer (which in some cases can be reused and overwritten with other data) is what gets written over the network.

Two factors make the impact worse. First, the checksum for Kafka records covers only the key/value/headers and not the topic, so it offers no protection here. Second, also new in the commit that exposed the bug, the produce request header (which includes the topic and partition for a group of message batches) is serialized into a buffer separate from the messages themselves, and only the latter goes into the pooled ByteBuffer. As a result, messages can be misrouted to a random recently used topic rather than merely duplicated on their intended topic.

The key change is in Sender.sendProducerData: the pooled ByteBuffer of an expired in-flight batch must not be reused until the request completes. For these batches we avoid deallocating the buffer in the normal failBatch call, deferring it until completeBatch (or a different path of failBatch) is called.

Automated tests cover this, and manual testing reproduced the issue from KAFKA-19012 and verified that the change is sufficient to stop it.

Reviewers: Justine Olshan <jolshan@confluent.io>, Jun Rao <junrao@gmail.com>, Chia-Ping Tsai <chia7712@gmail.com>
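To illustrate the hazard in isolation, here is a minimal, hypothetical sketch (SimplePool, allocate, and release are invented names, not Kafka's BufferPool API): a pooled ByteBuffer is released while another code path still holds a reference to it, a later allocation reuses and overwrites the same backing memory, and the original "send" then writes the wrong bytes. Deferring the release until the in-flight request completes, as this commit does, avoids exactly this pattern.

```java
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.Deque;

// Toy stand-in for a pooling allocator; NOT Kafka's BufferPool API.
final class SimplePool {
    private final Deque<ByteBuffer> free = new ArrayDeque<>();

    synchronized ByteBuffer allocate(int size) {
        ByteBuffer cached = free.pollFirst();
        // Reuse a previously released buffer when one of sufficient capacity exists.
        return (cached != null && cached.capacity() >= size) ? cached : ByteBuffer.allocate(size);
    }

    synchronized void release(ByteBuffer buffer) {
        buffer.clear();
        free.addFirst(buffer); // the same backing memory can now be handed to another caller
    }
}

public class PooledBufferReuseHazard {
    public static void main(String[] args) {
        SimplePool pool = new SimplePool();

        // Batch A's records are serialized into a pooled buffer that the
        // "network send path" still references.
        ByteBuffer batchA = pool.allocate(16);
        batchA.put("records-for-A..!".getBytes(StandardCharsets.UTF_8)).flip();

        // Bug pattern: batch A expires (or the connection drops) and its buffer
        // is released to the pool even though the send has not completed.
        pool.release(batchA);

        // Batch B reuses the same backing buffer and overwrites its contents.
        ByteBuffer batchB = pool.allocate(16);
        batchB.put("records-for-B..!".getBytes(StandardCharsets.UTF_8)).flip();

        // The send path now writes batch B's bytes under batch A's request,
        // i.e. corrupted or misrouted records.
        byte[] wire = new byte[batchA.remaining()];
        batchA.get(wire);
        System.out.println("Bytes written for batch A: " + new String(wire, StandardCharsets.UTF_8));
    }
}
```

Running the sketch prints batch B's payload for batch A's "send", which is the same class of corruption described above: because the produce request header is serialized separately, those overwritten bytes can end up attributed to a different topic.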
1 parent 5bb44d6 commit 1df2ac5

File tree: 5 files changed, +166 -38 lines changed

clients/src/main/java/org/apache/kafka/clients/producer/internals/ProducerBatch.java

Lines changed: 19 additions & 0 deletions
@@ -72,6 +72,9 @@ private enum FinalState { ABORTED, FAILED, SUCCEEDED }
     private final AtomicInteger attempts = new AtomicInteger(0);
     private final boolean isSplitBatch;
     private final AtomicReference<FinalState> finalState = new AtomicReference<>(null);
+    private boolean bufferDeallocated = false;
+    // Tracks if the batch has been sent to the NetworkClient
+    private boolean inflight = false;
 
     int recordCount;
     int maxRecordSize;

@@ -581,6 +584,22 @@ public boolean sequenceHasBeenReset() {
         return reopened;
     }
 
+    public boolean isBufferDeallocated() {
+        return bufferDeallocated;
+    }
+
+    public void markBufferDeallocated() {
+        bufferDeallocated = true;
+    }
+
+    public boolean isInflight() {
+        return inflight;
+    }
+
+    public void setInflight(boolean inflight) {
+        this.inflight = inflight;
+    }
+
     // VisibleForTesting
     OptionalInt currentLeaderEpoch() {
         return currentLeaderEpoch;

clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java

Lines changed: 38 additions & 6 deletions
@@ -1027,14 +1027,39 @@ BuiltInPartitioner createBuiltInPartitioner(LogContext logContext, String topic,
     }
 
     /**
-     * Deallocate the record batch
+     * Complete and deallocate the record batch
+     */
+    public void completeAndDeallocateBatch(ProducerBatch batch) {
+        completeBatch(batch);
+        deallocate(batch);
+    }
+
+    /**
+     * Only perform deallocation (and not removal from the incomplete set)
      */
     public void deallocate(ProducerBatch batch) {
-        incomplete.remove(batch);
         // Only deallocate the batch if it is not a split batch because split batch are allocated outside the
         // buffer pool.
-        if (!batch.isSplitBatch())
-            free.deallocate(batch.buffer(), batch.initialCapacity());
+        if (!batch.isSplitBatch()) {
+            if (batch.isBufferDeallocated()) {
+                log.warn("Skipping deallocating a batch that has already been deallocated. Batch is {}, created time is {}", batch, batch.createdMs);
+            } else {
+                batch.markBufferDeallocated();
+                if (batch.isInflight()) {
+                    // Create a fresh ByteBuffer to give to BufferPool to reuse since we can't safely call deallocate with the ProduceBatch's buffer
+                    free.deallocate(ByteBuffer.allocate(batch.initialCapacity()));
+                    throw new IllegalStateException("Attempting to deallocate a batch that is inflight. Batch is " + batch);
+                }
+                free.deallocate(batch.buffer(), batch.initialCapacity());
+            }
+        }
+    }
+
+    /**
+     * Remove from the incomplete list but do not free memory yet
+     */
+    public void completeBatch(ProducerBatch batch) {
+        incomplete.remove(batch);
     }
 
     /**

@@ -1132,7 +1157,14 @@ void abortBatches(final RuntimeException reason) {
                     dq.remove(batch);
                 }
                 batch.abort(reason);
-                deallocate(batch);
+                if (batch.isInflight()) {
+                    // KAFKA-19012: if the batch has been sent it might still be in use by the network client so we cannot allow it to be reused yet.
+                    // We skip deallocating it now. When the request in network client completes with a response, either Sender.completeBatch() or
+                    // Sender.failBatch() will be called with deallocateBatch=true. The buffer associated with the batch will be deallocated then.
+                    completeBatch(batch);
+                } else {
+                    completeAndDeallocateBatch(batch);
+                }
             }
         }
 
@@ -1152,7 +1184,7 @@ void abortUndrainedBatches(RuntimeException reason) {
             }
             if (aborted) {
                 batch.abort(reason);
-                deallocate(batch);
+                completeAndDeallocateBatch(batch);
             }
         }
     }

clients/src/main/java/org/apache/kafka/clients/producer/internals/Sender.java

Lines changed: 57 additions & 25 deletions
@@ -171,7 +171,12 @@ private void maybeRemoveFromInflightBatches(ProducerBatch batch) {
 
     private void maybeRemoveAndDeallocateBatch(ProducerBatch batch) {
         maybeRemoveFromInflightBatches(batch);
-        this.accumulator.deallocate(batch);
+        this.accumulator.completeAndDeallocateBatch(batch);
+    }
+
+    private void maybeRemoveAndDeallocateBatchLater(ProducerBatch batch) {
+        maybeRemoveFromInflightBatches(batch);
+        this.accumulator.completeBatch(batch);
     }
 
     /**

@@ -354,6 +359,24 @@ private boolean shouldHandleAuthorizationError(RuntimeException exception) {
         return false;
     }
 
+    private void failExpiredBatches(List<ProducerBatch> expiredBatches, long now, boolean deallocateBuffer) {
+        // Reset the producer id if an expired batch has previously been sent to the broker. Also update the metrics
+        // for expired batches. see the documentation of @TransactionState.resetIdempotentProducerId to understand why
+        // we need to reset the producer id here.
+        if (!expiredBatches.isEmpty())
+            log.trace("Expired {} batches in accumulator", expiredBatches.size());
+        for (ProducerBatch expiredBatch : expiredBatches) {
+            String errorMessage = "Expiring " + expiredBatch.recordCount + " record(s) for " + expiredBatch.topicPartition
+                + ":" + (now - expiredBatch.createdMs) + " ms has passed since batch creation. "
+                + "The request has not been sent, or no server response has been received yet.";
+            failBatch(expiredBatch, new TimeoutException(errorMessage), false, deallocateBuffer);
+            if (transactionManager != null && expiredBatch.inRetry()) {
+                // This ensures that no new batches are drained until the current in flight batches are fully resolved.
+                transactionManager.markSequenceUnresolved(expiredBatch);
+            }
+        }
+    }
+
     private long sendProducerData(long now) {
         MetadataSnapshot metadataSnapshot = metadata.fetchMetadataSnapshot();
         // get the list of partitions with data ready to send

@@ -405,23 +428,10 @@ private long sendProducerData(long now) {
         accumulator.resetNextBatchExpiryTime();
         List<ProducerBatch> expiredInflightBatches = getExpiredInflightBatches(now);
         List<ProducerBatch> expiredBatches = this.accumulator.expiredBatches(now);
-        expiredBatches.addAll(expiredInflightBatches);
 
-        // Reset the producer id if an expired batch has previously been sent to the broker. Also update the metrics
-        // for expired batches. see the documentation of @TransactionState.resetIdempotentProducerId to understand why
-        // we need to reset the producer id here.
-        if (!expiredBatches.isEmpty())
-            log.trace("Expired {} batches in accumulator", expiredBatches.size());
-        for (ProducerBatch expiredBatch : expiredBatches) {
-            String errorMessage = "Expiring " + expiredBatch.recordCount + " record(s) for " + expiredBatch.topicPartition
-                + ":" + (now - expiredBatch.createdMs) + " ms has passed since batch creation. "
-                + "The request has not been sent, or no server response has been received yet.";
-            failBatch(expiredBatch, new TimeoutException(errorMessage), false);
-            if (transactionManager != null && expiredBatch.inRetry()) {
-                // This ensures that no new batches are drained until the current in flight batches are fully resolved.
-                transactionManager.markSequenceUnresolved(expiredBatch);
-            }
-        }
+        failExpiredBatches(expiredBatches, now, true);
+        failExpiredBatches(expiredInflightBatches, now, false);
+
         sensors.updateProduceRequestMetrics(batches);
 
         // If we have any nodes that are ready to send + have sendable data, poll with 0 timeout so this can immediately

@@ -524,6 +534,7 @@ private void maybeAbortBatches(RuntimeException exception) {
         if (accumulator.hasIncomplete()) {
             log.error("Aborting producer batches due to fatal error", exception);
             accumulator.abortBatches(exception);
+            inFlightBatches.clear();
         }
     }
 
@@ -659,6 +670,7 @@ private void handleProduceResponse(ClientResponse response, Map<TopicPartition,
      */
     private void completeBatch(ProducerBatch batch, ProduceResponse.PartitionResponse response, long correlationId,
                                long now, Map<TopicPartition, Metadata.LeaderIdAndEpoch> partitionsWithUpdatedLeaderInfo) {
+        batch.setInflight(false);
         Errors error = response.error;
 
         if (error == Errors.MESSAGE_TOO_LARGE && batch.recordCount > 1 && !batch.isDone() &&

@@ -696,7 +708,7 @@ private void completeBatch(ProducerBatch batch, ProduceResponse.PartitionRespons
                 // tell the user the result of their request. We only adjust sequence numbers if the batch didn't exhaust
                 // its retries -- if it did, we don't know whether the sequence number was accepted or not, and
                 // thus it is not safe to reassign the sequence.
-                failBatch(batch, response, batch.attempts() < this.retries);
+                failBatch(batch, response, batch.attempts() < this.retries, true);
             }
             if (error.exception() instanceof InvalidMetadataException) {
                 if (error.exception() instanceof UnknownTopicOrPartitionException) {

@@ -749,12 +761,16 @@ private void completeBatch(ProducerBatch batch, ProduceResponse.PartitionRespons
 
         if (batch.complete(response.baseOffset, response.logAppendTime)) {
             maybeRemoveAndDeallocateBatch(batch);
+        } else {
+            // Always safe to call deallocate because the batch keeps track of whether or not it was deallocated yet
+            this.accumulator.deallocate(batch);
         }
     }
 
     private void failBatch(ProducerBatch batch,
                            ProduceResponse.PartitionResponse response,
-                           boolean adjustSequenceNumbers) {
+                           boolean adjustSequenceNumbers,
+                           boolean deallocateBatch) {
         final RuntimeException topLevelException;
         if (response.error == Errors.TOPIC_AUTHORIZATION_FAILED)
             topLevelException = new TopicAuthorizationException(Collections.singleton(batch.topicPartition.topic()));

@@ -764,7 +780,7 @@ else if (response.error == Errors.CLUSTER_AUTHORIZATION_FAILED)
             topLevelException = response.error.exception(response.errorMessage);
 
         if (response.recordErrors == null || response.recordErrors.isEmpty()) {
-            failBatch(batch, topLevelException, adjustSequenceNumbers);
+            failBatch(batch, topLevelException, adjustSequenceNumbers, deallocateBatch);
         } else {
             Map<Integer, RuntimeException> recordErrorMap = new HashMap<>(response.recordErrors.size());
             for (ProduceResponse.RecordError recordError : response.recordErrors) {

@@ -803,23 +819,25 @@ else if (response.error == Errors.CLUSTER_AUTHORIZATION_FAILED)
                 }
             };
 
-            failBatch(batch, topLevelException, recordExceptions, adjustSequenceNumbers);
+            failBatch(batch, topLevelException, recordExceptions, adjustSequenceNumbers, deallocateBatch);
         }
     }
 
     private void failBatch(
         ProducerBatch batch,
         RuntimeException topLevelException,
-        boolean adjustSequenceNumbers
+        boolean adjustSequenceNumbers,
+        boolean deallocateBatch
     ) {
-        failBatch(batch, topLevelException, batchIndex -> topLevelException, adjustSequenceNumbers);
+        failBatch(batch, topLevelException, batchIndex -> topLevelException, adjustSequenceNumbers, deallocateBatch);
     }
 
     private void failBatch(
         ProducerBatch batch,
         RuntimeException topLevelException,
         Function<Integer, RuntimeException> recordExceptions,
-        boolean adjustSequenceNumbers
+        boolean adjustSequenceNumbers,
+        boolean deallocateBatch
    ) {
         this.sensors.recordErrors(batch.topicPartition.topic(), batch.recordCount);
 
@@ -833,7 +851,20 @@ private void failBatch(
                     log.debug("Encountered error when transaction manager was handling a failed batch", e);
                 }
             }
-            maybeRemoveAndDeallocateBatch(batch);
+            if (deallocateBatch) {
+                maybeRemoveAndDeallocateBatch(batch);
+            } else {
+                // Fix for KAFKA-19012
+                // The pooled ByteBuffer associated with this batch might still be in use by the network client so we
+                // cannot allow it to be reused yet. We skip deallocating it now. When the request in the network client
+                // completes with a response, either completeBatch() or failBatch() will be called with deallocateBatch=true.
+                // The buffer associated with the batch will be deallocated then.
+                maybeRemoveAndDeallocateBatchLater(batch);
+            }
+        } else {
+            if (deallocateBatch) {
+                this.accumulator.deallocate(batch);
+            }
         }
     }
 
@@ -886,6 +917,7 @@ private void sendProduceRequest(long now, int destination, short acks, int timeo
                     .setIndex(tp.partition())
                     .setRecords(records));
             recordsByPartition.put(tp, batch);
+            batch.setInflight(true);
         }
 
         String transactionalId = null;

clients/src/test/java/org/apache/kafka/clients/producer/internals/RecordAccumulatorTest.java

Lines changed: 4 additions & 4 deletions
@@ -405,7 +405,7 @@ public void testStressfulSituation() throws Exception {
                 for (ProducerBatch batch : batches) {
                     for (@SuppressWarnings("UnusedLocalVariable") Record ignored : batch.records().records())
                         read++;
-                    accum.deallocate(batch);
+                    accum.completeAndDeallocateBatch(batch);
                 }
             }
         }

@@ -669,7 +669,7 @@ public void testFlush() throws Exception {
 
         for (List<ProducerBatch> batches: results.values())
             for (ProducerBatch batch: batches)
-                accum.deallocate(batch);
+                accum.completeAndDeallocateBatch(batch);
 
         // should be complete with no unsent records.
         accum.awaitFlushCompletion();

@@ -1575,7 +1575,7 @@ private int prepareSplitBatches(RecordAccumulator accum, long seed, int recordSi
         assertEquals(1, batches.values().iterator().next().size());
         ProducerBatch batch = batches.values().iterator().next().get(0);
         int numSplitBatches = accum.splitAndReenqueue(batch);
-        accum.deallocate(batch);
+        accum.completeAndDeallocateBatch(batch);
 
         return numSplitBatches;
     }

@@ -1599,7 +1599,7 @@ private BatchDrainedResult completeOrSplitBatches(RecordAccumulator accum, int b
                 } else {
                     batch.complete(0L, 0L);
                 }
-                accum.deallocate(batch);
+                accum.completeAndDeallocateBatch(batch);
             }
         }
     } while (batchDrained);
