Skip to content

Commit 2f86688

Browse files
committed
Remove the minimum cache size of 1024 messages to allow faster
rebalances when message processing times are long
1 parent 4ad7f02 commit 2f86688

File tree

4 files changed

+30
-14
lines changed

4 files changed

+30
-14
lines changed

ci/tests/run_perf_test.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,14 +232,14 @@ async function main() {
232232
consumerKjsMessageMaxLatencyT0T1 = extractValue(outputKjsProducerConsumer, '=== Consumer max E2E latency T0-T1 (eachMessage):');
233233
consumerKjsMessageAvgLatencyT0T2 = extractValue(outputKjsProducerConsumer, '=== Consumer average E2E latency T0-T2 (eachMessage):');
234234
consumerKjsMessageMaxLatencyT0T2 = extractValue(outputKjsProducerConsumer, '=== Consumer max E2E latency T0-T2 (eachMessage):');
235+
consumerKjsTime = extractValue(outputKjsProducerConsumer, '=== Consumption time (eachMessage):');
235236
consumerKjsMessageAverageRSS = extractValue(outputKjsProducerConsumer, '=== Average consumer-each-message RSS KB:');
236237
consumerKjsMessageMaxRSS = extractValue(outputKjsProducerConsumer, '=== Max consumer-each-message RSS KB:');
237238
consumerKjsMessageAverageBrokerLag = extractValue(outputKjsProducerConsumer, `=== Average broker lag (${groupIdEachMessageKafkaJS}):`);
238239
consumerKjsMessageMaxBrokerLag = extractValue(outputKjsProducerConsumer, `=== Max broker lag (${groupIdEachMessageKafkaJS}):`);
239240
consumerKjsMessageTotalLagMeasurements = extractValue(outputKjsProducerConsumer, `=== Sample size for broker lag measurement (${groupIdEachMessageKafkaJS}):`);
240241
}
241242
if (consumerModeAll || consumerModeEachBatch) {
242-
consumerKjsTime = extractValue(outputKjsProducerConsumer, '=== Consumption time (eachMessage):');
243243
consumerKjsBatch = extractValue(outputKjsProducerConsumer, '=== Consumer Rate MB/s (eachBatch):');
244244
consumerKjsBatchRate = extractValue(outputKjsProducerConsumer, '=== Consumer Rate msg/s (eachBatch):');
245245
consumerKjsBatchAvgLatencyT0T1 = extractValue(outputKjsProducerConsumer, '=== Consumer average E2E latency T0-T1 (eachBatch):');

examples/performance/performance-primitives-common.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ async function runConsumer(consumer, topic, warmupMessages, totalMessageCnt, eac
190190

191191
if (actionOnMessages) {
192192
await actionOnMessages(batch.messages);
193-
if (messagesMeasured > 0) {
193+
if (messagesMeasured > 0 && messages.length > 0) {
194194
let i = 1;
195195
const now = Date.now();
196196
for (const message of messages) {

lib/kafkajs/_consumer.js

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,9 @@ class Consumer {
957957

958958
#updateMaxMessageCacheSize() {
959959
if (this.#maxBatchSize === -1) {
960+
// With an unbounded max batch size, each batch returns all available
961+
// messages for a partition. The cache is unbounded as well, since
962+
// only one call is needed to process each partition.
960963
return;
961964
}
962965

@@ -965,15 +968,16 @@ class Consumer {
965968
nowNs > this.#lastFetchClockNs) {
966969
const consumptionDurationSeconds = Number(nowNs - this.#lastFetchClockNs) / 1e9;
967970
const messagesPerSecondSingleWorker = this.#lastFetchedMessageCnt / this.#lastFetchedConcurrency / consumptionDurationSeconds;
968-
// Keep enough messages in the cache for 1.5 seconds of consumption.
971+
// Keep enough messages in the cache for 1.5 seconds of concurrent consumption.
969972
this.#messageCacheMaxSize = Math.round(1.5 * messagesPerSecondSingleWorker) * this.#concurrency;
970973
const minCacheSize = this.#runConfig.eachBatch ? this.#maxBatchesSize : this.#concurrency;
971974
if (this.#messageCacheMaxSize < minCacheSize)
975+
// Keep at least one batch or one message per worker.
976+
// It's possible that fewer workers than requested were active in the previous run.
972977
this.#messageCacheMaxSize = minCacheSize;
973978
else if (this.#messageCacheMaxSize > minCacheSize * 10)
979+
// Keep at most 10 messages or batches per requested worker.
974980
this.#messageCacheMaxSize = minCacheSize * 10;
975-
if (this.#messageCacheMaxSize < 1024)
976-
this.#messageCacheMaxSize = 1024;
977981
}
978982
}
979983

@@ -1018,7 +1022,7 @@ class Consumer {
10181022
const fetchResult = new DeferredPromise();
10191023
this.#logger.debug(`Attempting to fetch ${size} messages to the message cache`,
10201024
this.#createConsumerBindingMessageMetadata());
1021-
1025+
10221026
this.#updateMaxMessageCacheSize();
10231027
this.#internalClient.consume(size, (err, messages) =>
10241028
fetchResult.resolve([err, messages]));

test/promisified/consumer/consumeMessages.spec.js

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,11 @@ describe.each(cases)('Consumer - partitionsConsumedConcurrently = %s -', (partit
412412
partitions: partitions,
413413
});
414414

415+
// If consume time per message is large and messages are consumed one at a time,
416+
// very small batch sizes are needed to keep the concurrency up.
417+
// This avoids an overly large cache, which would postpone the next fetch
418+
// and therefore delay the rebalance too much.
419+
const producer = createProducer({}, {'batch.num.messages': '1'});
415420
await producer.connect();
416421
await consumer.connect();
417422
await consumer.subscribe({ topic: topicName });
@@ -448,6 +453,7 @@ describe.each(cases)('Consumer - partitionsConsumedConcurrently = %s -', (partit
448453
await producer.send({ topic: topicName, messages });
449454
await maxConcurrentWorkersReached;
450455
expect(inProgressMaxValue).toBe(expectedMaxConcurrentWorkers);
456+
await producer.disconnect();
451457
});
452458

453459
it('consume GZIP messages', async () => {
@@ -612,6 +618,7 @@ describe.each(cases)('Consumer - partitionsConsumedConcurrently = %s -', (partit
612618
let assigns = 0;
613619
let revokes = 0;
614620
let lost = 0;
621+
let firstBatchProcessing;
615622
consumer = createConsumer({
616623
groupId,
617624
maxWaitTimeInMs: 100,
@@ -649,14 +656,14 @@ describe.each(cases)('Consumer - partitionsConsumedConcurrently = %s -', (partit
649656
receivedMessages++;
650657

651658
try {
652-
if (event.batch.messages.length >= 32) {
653-
expect(event.isStale()).toEqual(false);
654-
await sleep(7500);
655-
/* 7.5s 'processing'
656-
* doesn't exceed max poll interval.
657-
* Cache reset is transparent */
658-
expect(event.isStale()).toEqual(false);
659-
}
659+
expect(event.isStale()).toEqual(false);
660+
await sleep(7500);
661+
/* 7.5s 'processing'
662+
* doesn't exceed max poll interval.
663+
* Cache reset is transparent */
664+
expect(event.isStale()).toEqual(false);
665+
if (firstBatchProcessing === undefined)
666+
firstBatchProcessing = receivedMessages;
660667
} catch (e) {
661668
console.error(e);
662669
errors = true;
@@ -680,6 +687,8 @@ describe.each(cases)('Consumer - partitionsConsumedConcurrently = %s -', (partit
680687
/* Triggers revocation */
681688
await consumer.disconnect();
682689

690+
expect(firstBatchProcessing).toBeDefined();
691+
expect(receivedMessages).toBeGreaterThan(firstBatchProcessing);
683692
/* First assignment */
684693
expect(assigns).toEqual(1);
685694
/* Revocation on disconnect */
@@ -777,6 +786,9 @@ describe.each(cases)('Consumer - partitionsConsumedConcurrently = %s -', (partit
777786
/* Triggers revocation */
778787
await consumer.disconnect();
779788

789+
expect(firstLongBatchProcessing).toBeDefined();
790+
expect(receivedMessages).toBeGreaterThan(firstLongBatchProcessing);
791+
780792
/* First assignment + assignment after partitions lost */
781793
expect(assigns).toEqual(2);
782794
/* Partitions lost + revocation on disconnect */

0 commit comments

Comments
 (0)