apache
diff --git a/‎core/src/main/java/kafka/server/share/SharePartition.java‎
Lines changed: 88 additions & 2 deletions b/‎core/src/main/java/kafka/server/share/SharePartition.java‎
Lines changed: 88 additions & 2 deletions
@@ -106,6 +106,11 @@ public class SharePartition {
 
     private static final Logger log = LoggerFactory.getLogger(SharePartition.class);
 
+    /**
+     * Minimum number of records to deliver when throttling
+     */
+    private static final int MINIMUM_THROTTLE_RECORDS_DELIVERY_LIMIT = 2;
+
     /**
      * empty member id used to indicate when a record is not acquired by any member.
      */
@@ -201,6 +206,11 @@ enum SharePartitionState {
      */
     private final int maxDeliveryCount;
 
+    /**
+     * Records whose delivery count exceeds this are deemed abnormal and the batching of these records
+     * should be reduced. The limit is set to half of maxDeliveryCount rounded up, with a minimum of 2.
+     */
+    private final int throttleRecordsDeliveryLimit;
     /**
      * The group config manager is used to retrieve the values for dynamic group configurations
      */
@@ -362,6 +372,7 @@ enum SharePartitionState {
         this.leaderEpoch = leaderEpoch;
         this.maxInFlightRecords = maxInFlightRecords;
         this.maxDeliveryCount = maxDeliveryCount;
+        this.throttleRecordsDeliveryLimit = Math.max(MINIMUM_THROTTLE_RECORDS_DELIVERY_LIMIT, (int) Math.ceil((double) maxDeliveryCount / 2));
         this.cachedState = new ConcurrentSkipListMap<>();
         this.lock = new ReentrantReadWriteLock();
         this.findNextFetchOffset = false;
@@ -834,7 +845,16 @@ public ShareAcquiredRecords acquire(
                 boolean fullMatch = checkForFullMatch(inFlightBatch, firstBatch.baseOffset(), lastOffsetToAcquire);
                 int numRecordsRemaining = maxRecordsToAcquire - acquiredCount;
                 boolean recordLimitSubsetMatch = isRecordLimitMode && checkForRecordLimitSubsetMatch(inFlightBatch, maxRecordsToAcquire, acquiredCount);
-                if (!fullMatch || inFlightBatch.offsetState() != null || recordLimitSubsetMatch) {
+                boolean throttleRecordsDelivery = shouldThrottleRecordsDelivery(inFlightBatch, firstBatch.baseOffset(), lastOffsetToAcquire);
+                // Stop acquiring more records if records delivery has to be throttled. The throttling prevents
+                // complete batch to be archived in case of a single record being corrupt.
+                // Below check isolates the current batch/offsets to be delivered individually in subsequent fetches.
+                if (throttleRecordsDelivery && acquiredCount > 0) {
+                    // Set the max records to acquire as 0 to prevent further acquisition of records.
+                    maxRecordsToAcquire = 0;
+                    break;
+                }
+                if (!fullMatch || inFlightBatch.offsetState() != null || recordLimitSubsetMatch || throttleRecordsDelivery) {
                     log.trace("Subset or offset tracked batch record found for share partition,"
                             + " batch: {} request offsets - first: {}, last: {} for the share"
                             + " partition: {}-{}", inFlightBatch, firstBatch.baseOffset(),
@@ -859,7 +879,14 @@ public ShareAcquiredRecords acquire(
                     // maxRecordsToAcquire. Hence, pass the remaining number of records that can
                     // be acquired.
                     int acquiredSubsetCount = acquireSubsetBatchRecords(memberId, isRecordLimitMode, numRecordsRemaining, firstBatch.baseOffset(), lastOffsetToAcquire, inFlightBatch, result);
+
                     acquiredCount += acquiredSubsetCount;
+                    // If records are throttled, return immediately and set `maxRecordsToAcquire = 0`
+                    // to prevent acquiring any new records afterwards.
+                    if (throttleRecordsDelivery && acquiredSubsetCount > 0) {
+                        maxRecordsToAcquire = 0;
+                        break;
+                    }
                     continue;
                 }
 
@@ -1866,6 +1893,8 @@ private int acquireSubsetBatchRecords(
     ) {
         lock.writeLock().lock();
         int acquiredCount = 0;
+        long maxFetchRecordsWhileThrottledRecords = -1;
+        boolean hasThrottledRecord = false;
         try {
             for (Map.Entry<Long, InFlightState> offsetState : inFlightBatch.offsetState().entrySet()) {
                 // For the first batch which might have offsets prior to the request base
@@ -1885,7 +1914,29 @@ private int acquireSubsetBatchRecords(
                     continue;
                 }
 
-                InFlightState updateResult =  offsetState.getValue().tryUpdateState(RecordState.ACQUIRED, DeliveryCountOps.INCREASE,
+                int recordDeliveryCount = offsetState.getValue().deliveryCount();
+                // If the record is on last delivery attempt then isolate that record to be delivered alone.
+                // If the respective record is corrupt then it prevents increasing delivery count of multiple
+                // records in a single response batch. Condition below checks if the current record has reached
+                // the delivery limit and already have some records to return in response then skip processing
+                // the current record, which shall be delivered alone in next fetch.
+                if (maxDeliveryCount > 2 && recordDeliveryCount == maxDeliveryCount - 1 && acquiredCount > 0) {
+                    break;
+                }
+
+                // When record delivery count reach the throttle threshold, progressively reduce batch size to isolate records.
+                // The `maxFetchRecordsWhileThrottledRecords` is halved with each additional delivery attempt beyond the throttle limit.
+                // Example:
+                //   - maxDeliveryCount = 6, throttleRecordsDeliveryLimit = 3, batch size = 500
+                //   - deliveryCount = 3: maxFetchRecords = 500 >> (3 - 3 + 1) = 250
+                //   - deliveryCount = 4: maxFetchRecords = 500 >> (4 - 3 + 1) = 125
+                // The `maxFetchRecordsWhileThrottledRecords` is calculated based on the first acquirable record that meets the throttling criteria in the batch.
+                if (recordDeliveryCount >= throttleRecordsDeliveryLimit && maxFetchRecordsWhileThrottledRecords < 0) {
+                    maxFetchRecordsWhileThrottledRecords = Math.max(1, (long) inFlightBatch.offsetState().size() >> (recordDeliveryCount - throttleRecordsDeliveryLimit + 1));
+                    hasThrottledRecord = true;
+                }
+
+                InFlightState updateResult = offsetState.getValue().tryUpdateState(RecordState.ACQUIRED, DeliveryCountOps.INCREASE,
                     maxDeliveryCount, memberId);
                 if (updateResult == null || updateResult.state() != RecordState.ACQUIRED) {
                     log.trace("Unable to acquire records for the offset: {} in batch: {}"
@@ -1904,10 +1955,18 @@ private int acquireSubsetBatchRecords(
                     .setLastOffset(offsetState.getKey())
                     .setDeliveryCount((short) offsetState.getValue().deliveryCount()));
                 acquiredCount++;
+
+                // Delivered alone.
+                if (offsetState.getValue().deliveryCount() == maxDeliveryCount && maxDeliveryCount > 2) {
+                    break;
+                }
                 if (isRecordLimitMode && acquiredCount == maxFetchRecords) {
                     // In record_limit mode, acquire only the requested number of records.
                     break;
                 }
+                if (hasThrottledRecord && acquiredCount == maxFetchRecordsWhileThrottledRecords) {
+                    break;
+                }
             }
         } finally {
             lock.writeLock().unlock();
@@ -1942,6 +2001,33 @@ private boolean checkForStartOffsetWithinBatch(long batchFirstOffset, long batch
         return batchFirstOffset < localStartOffset && batchLastOffset >= localStartOffset;
     }
 
+    /**
+     * Check if the in-flight batch should be throttled based on delivery count.
+     *
+     * @param inFlightBatch       The in-flight batch to check for throttling.
+     * @param requestFirstOffset  The first offset to acquire.
+     * @param requestLastOffset   THe last offset to acquire.
+     * @return True if the batch should be throttled (delivery count >= threshold), false otherwise.
+     */
+    private boolean shouldThrottleRecordsDelivery(InFlightBatch inFlightBatch, long requestFirstOffset, long requestLastOffset) {
+        if (inFlightBatch.offsetState() == null) {
+            return inFlightBatch.batchDeliveryCount() >= throttleRecordsDeliveryLimit;
+        }
+
+        return inFlightBatch.offsetState().entrySet().stream().filter(entry -> {
+            if (entry.getKey() < requestFirstOffset) {
+                return false;
+            }
+            if (entry.getKey() > requestLastOffset) {
+                return false;
+            }
+            if (entry.getValue().state() != RecordState.AVAILABLE) {
+                return false;
+            }
+            return true;
+        }).mapToInt(entry -> entry.getValue().deliveryCount()).max().orElse(0) >= throttleRecordsDeliveryLimit;
+    }
+
     // Visibility for test
     static Map<Long, Byte> fetchAckTypeMapForBatch(ShareAcknowledgementBatch batch) {
         // Client can either send a single entry in acknowledgeTypes which represents the state