
Commit f2ec063

Create persistent workers for per-partition concurrency, prevents excessive Promise spawning
1 parent b140a48 commit f2ec063
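
At a high level, the commit replaces the old per-message Promise bookkeeping (#runningPromises plus an index map) with a small pool of long-lived worker loops, sized from the requested concurrency and the number of owned partitions. A minimal, standalone sketch of that pattern, assuming illustrative names (createWorkerPool, fetchNext, and process are not the consumer's real API):

// Sketch of the persistent-worker pattern this commit adopts (illustrative only).
function createWorkerPool(concurrency, partitionCount, fetchNext, process) {
  let terminationScheduled = false;

  const worker = async () => {
    while (!terminationScheduled) {
      const msg = await fetchNext();
      if (!msg) {
        // Nothing cached right now; back off briefly instead of spinning.
        await new Promise((resolve) => setTimeout(resolve, 1));
        continue;
      }
      await process(msg);
    }
  };

  // One worker per concurrently consumed partition, capped at the requested concurrency.
  const size = Math.max(1, Math.min(concurrency, partitionCount));
  const workers = Array(size).fill().map(() => worker());

  return {
    size,
    // Ask every worker to wind down once its current message is done.
    scheduleTermination: () => { terminationScheduled = true; },
    // Awaiting this acts as an async barrier: it resolves only after all workers have exited.
    done: Promise.all(workers),
  };
}

The fixed pool bounds the number of in-flight promises to the worker count instead of letting it grow with every message pulled from the cache, which is the "excessive Promise spawning" the commit message refers to.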

2 files changed: +150 −173 lines

lib/kafkajs/_consumer.js

Lines changed: 99 additions & 156 deletions
@@ -131,22 +131,41 @@ class Consumer {
   #userManagedStores = false;
 
   /**
-   * Populated with Promises for each partition that is being processed concurrently.
-   * Each promise might run eachMessage/eachBatch.
+   * Signals an intent to disconnect the consumer.
    */
-  #runningPromises = [];
+  #disconnectStarted = false;
 
   /**
-   * Each message that is consumed has an associated cache index.
-   * This array maps an index within runningPromises to the associated cached index.
-   * ie. runningPromises[i] is associated with the cache index #savedIndexToPromiseIndex[i].
+   * Number of partitions owned by the consumer.
+   * @note This value may or may not be completely accurate; it's more of a hint for spawning concurrent workers.
    */
-  #savedIndexToPromiseIndex = [];
+  #partitionCount = 0;
 
   /**
-   * Signals an intent to disconnect the consumer.
+   * Whether worker termination has been scheduled.
    */
-  #disconnectStarted = false;
+  #workerTerminationScheduled = false;
+
+  /**
+   * The worker functions currently running in the consumer.
+   */
+  #workers = [];
+
+  /**
+   * The number of partitions to consume concurrently, as set by the user, or 1 by default.
+   */
+  #concurrency = 1;
+
+  /**
+   * Whether any call to the internalClient's consume() method is in progress.
+   */
+  #fetchInProgress = false;
+
+  /**
+   * TODO: remove this or make it a bit more reliable.
+   * This is a debug property for this branch.
+   */
+  clientId = null;
 
   /**
    * @constructor
@@ -217,7 +236,6 @@ class Consumer {
    * @param {import("../../types").TopicPartition[]} assignment
    */
   #rebalanceCallback(err, assignment) {
-    // Create the librdkafka error
     err = LibrdKafkaError.create(err);
     const userSpecifiedRebalanceCb = this.#userConfig['rebalance_cb'];

@@ -276,6 +294,8 @@
      * and marked the cache stale. This means that the cache is always expired when a rebalance
      * is triggered.
      * This is applicable both for incremental and non-incremental rebalances.
+     * Multiple consume()s cannot be called together, either, because we make sure that only
+     * one worker is calling into the internal consumer at a time.
      */
 
     try {
@@ -285,10 +305,13 @@
       if (checkPendingSeeks && !assignmentModified)
         assignment = this.#assignAsPerSeekedOffsets(assignment);
 
-      if (this.#internalClient.rebalanceProtocol() === "EAGER")
+      if (this.#internalClient.rebalanceProtocol() === "EAGER") {
         this.#internalClient.assign(assignment);
-      else
+        this.#partitionCount = assignment.length;
+      } else {
         this.#internalClient.incrementalAssign(assignment);
+        this.#partitionCount += assignment.length;
+      }
 
       if (checkPendingSeeks) {
         const offsetsToCommit = assignment
@@ -313,9 +336,11 @@
       if (this.#internalClient.rebalanceProtocol() === "EAGER") {
         this.#internalClient.unassign();
         this.#messageCache.removeTopicPartitions();
+        this.#partitionCount = 0;
       } else {
         this.#internalClient.incrementalUnassign(assignment);
         this.#messageCache.removeTopicPartitions(assignment);
+        this.#partitionCount -= assignment.length;
       }
     }
   } catch (e) {
@@ -324,6 +349,18 @@
       this.#internalClient.emit('rebalance.error', e);
     }
   }
+
+  /**
+   * Schedule worker termination here, in case the number of workers is not equal to the target concurrency.
+   * We need to do this so that we respawn workers with the correct concurrency count.
+   */
+  const workersToSpawn = Math.max(1, Math.min(this.#concurrency, this.#partitionCount));
+  if (workersToSpawn !== this.#workers.length) {
+    this.#workerTerminationScheduled = true;
+    /* We don't need to await the workers here. We are OK if the termination and respawning
+     * occur later, since even if we have a few more or a few fewer workers for a while, it's
+     * not a big deal. */
+  }
 });
 }
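
Because the pool size is derived from min(concurrency, partitionCount), a rebalance that changes the partition count can leave the wrong number of workers running. Rather than resizing the pool in place, the callback only flips #workerTerminationScheduled and lets the polling loop rebuild the pool at the new size. Roughly, in terms of the hypothetical createWorkerPool sketched above:

// Sketch: resize by terminate-and-respawn rather than by adjusting workers in place.
function onAssignmentChanged(pool, concurrency, partitionCount) {
  const target = Math.max(1, Math.min(concurrency, partitionCount));
  if (target !== pool.size) {
    // Not awaited: running slightly over or under target for a moment is fine;
    // the polling loop respawns the pool at the new target size.
    pool.scheduleTermination();
  }
}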

@@ -338,6 +375,8 @@
         { code: error.ErrorCodes.ERR__INVALID_ARG });
     }
     const rdKafkaConfig = kafkaJSToRdKafkaConfig(kjsConfig);
+    this.clientId = rdKafkaConfig['client.id'];
+    this.#logger = new DefaultLogger();
 
     /* Consumer specific configuration */
     if (Object.hasOwn(kjsConfig, 'groupId')) {
@@ -663,8 +702,14 @@
       return null;
     }
 
+    if (this.#fetchInProgress) {
+      return null;
+    }
+
+    this.#fetchInProgress = true;
     return new Promise((resolve, reject) => {
       this.#internalClient.consume(this.#messageCache.maxSize, (err, messages) => {
+        this.#fetchInProgress = false;
         if (err) {
           reject(createKafkaJsErrorFromLibRdKafkaError(err));
           return;
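
With several workers calling into #consumeSingleCached concurrently, only one of them should ever be inside the internal client's consume() at a time; the new #fetchInProgress flag is a non-blocking guard, so a second caller simply gets null back and retries later instead of queueing another fetch. The same idea in isolation (fetchOnce and fetchFromClient are assumed names, not the actual binding call):

// Sketch of the single-fetch guard (assumed shape, not the consumer's exact code).
let fetchInProgress = false;

async function fetchOnce(fetchFromClient) {
  if (fetchInProgress) {
    return null; // someone else is already fetching; the caller backs off and retries
  }
  fetchInProgress = true;
  try {
    return await fetchFromClient();
  } finally {
    // Clear the flag even if the fetch fails, so later calls can proceed.
    fetchInProgress = false;
  }
}

In the diff the flag is cleared at the top of the consume() callback rather than in a finally block, which covers both the success and the error path of that call.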
@@ -846,14 +891,10 @@
 
     const rdKafkaConfig = this.#config();
     const maxPollInterval = rdKafkaConfig['max.poll.interval.ms'] ?? 300000;
-    this.#messageCache = new MessageCache(Math.floor(maxPollInterval * 0.8), config.partitionsConsumedConcurrently);
+    this.#messageCache = new MessageCache(Math.floor(maxPollInterval * 0.8), config.partitionsConsumedConcurrently, this.#logger);
 
-    /* We deliberately don't await this. */
-    if (config.eachMessage) {
-      this.#runInternalEachMessage(config);
-    } else {
-      this.#runInternalEachBatch(config);
-    }
+    /* We deliberately don't await this because we want to return from this method immediately. */
+    this.#runInternal(config);
   }
 
   /**
@@ -960,8 +1001,7 @@
       /* The value of eachBatchAutoResolve is not important. The only place where a message is marked processed
        * despite an error is if the user says so, and the user can use resolveOffsets for both the possible
        * values eachBatchAutoResolve can take. */
-      if (config.eachBatch)
-        eachMessageProcessed = payload._messageResolved
+      eachMessageProcessed = payload._messageResolved;
     }
 
     /* If the message is unprocessed, due to an error, or because the user has not resolved it, we seek back. */
@@ -999,68 +1039,25 @@
   }
 
   /**
-   * Awaits the completion of a single message's processing.
+   * Starts a worker to fetch messages/batches from the internal consumer and process them.
    *
-   * @returns {Promise<number>} the cache index of the message in the cache that was processed.
-   */
-  async waitOne() {
-    const savedIndex = await Promise.any(this.#runningPromises);
-    const promiseIndex = this.#savedIndexToPromiseIndex.findIndex(p => p === savedIndex);
-    if (promiseIndex === -1) {
-      console.error("Promise not found in runningPromises");
-      throw new Error("Promise not found in runningPromises");
-    }
-    this.#runningPromises[promiseIndex] = this.#runningPromises[this.#runningPromises.length - 1];
-    this.#savedIndexToPromiseIndex[promiseIndex] = this.#savedIndexToPromiseIndex[this.#savedIndexToPromiseIndex.length - 1];
-    this.#runningPromises.pop();
-    this.#savedIndexToPromiseIndex.pop();
-
-    return savedIndex;
-  }
-
-  /**
-   * Awaits the completion of all messages that are being processed.
+   * A worker runs until it's told to stop.
+   * Conditions where the worker is told to stop:
+   * 1. Cache is globally stale.
+   * 2. Disconnect is initiated.
+   * 3. A rebalance is triggered.
+   * 4. Some other worker has started terminating.
    *
-   * @returns {Promise<number[]>} a list of cache indices of the messages that were processed.
+   * Worker termination acts as an async barrier.
    */
-  async waitAll() {
-    const indices = await Promise.all(this.#runningPromises);
-    this.#runningPromises = [];
-    this.#savedIndexToPromiseIndex = [];
-    return indices;
-  }
-
-  /* Internal polling loop.
-   * It accepts the same config object that `run` accepts, but config.eachMessage must be set. */
-  async #runInternalEachMessage(config) {
-    const concurrency = config.partitionsConsumedConcurrently;
+  async #worker(config, perMessageProcessor, id) {
     let nextIdx = -1;
-    while (!(await acquireOrLog(this.#lock, this.#logger)));
-
-    while (this.#state === ConsumerState.CONNECTED) {
-      /* Release lock and cleanup if we intend to disconnect. */
-      if (this.#disconnectStarted) {
-        const indices = await this.waitAll();
-        indices.forEach(idx => this.#messageCache.return(idx));
-        if (nextIdx !== -1) {
-          this.#messageCache.return(nextIdx);
-        }
-        nextIdx = -1;
-        this.#lock.release();
-        break;
-      }
-
+    while (!this.#workerTerminationScheduled) {
       /* Invalidate the message cache if needed */
      const locallyStale = this.#messageCache.popLocallyStale();
      if (this.#messageCache.isStale()) { /* global staleness */
-        const indices = await this.waitAll();
-        indices.forEach(idx => this.#messageCache.return(idx));
-        if (nextIdx !== -1) {
-          this.#messageCache.return(nextIdx);
-        }
-        nextIdx = -1;
-        await this.#clearCacheAndResetPositions();
-        continue;
+        this.#workerTerminationScheduled = true;
+        break;
      } else if (locallyStale.length !== 0) { /* local staleness */
        // TODO: is it correct to await some concurrent promises for eachMessage here?
        // to be safe we can do it, but I don't think we really need to do that for
@@ -1079,99 +1076,44 @@
       nextIdx = -1;
 
       if (!m) {
-        // await any concurrency related promises right here if this is null, if any such promise exists.
-        // see note in consumeSingleCached
-        if (this.#runningPromises.length) {
-          nextIdx = await this.waitOne();
-        }
-        continue;
-      }
-
-      const p = this.#messageProcessor(m, config);
-      this.#runningPromises.push(p);
-      this.#savedIndexToPromiseIndex.push(m.index);
-
-      if (this.#runningPromises.length < concurrency) {
+        /* Back off a little. If m is null, we might be fetching from the internal consumer (fetch in progress),
+         * and calling consumeSingleCached in a tight loop will help no one. */
+        await new Promise((resolve) => setTimeout(resolve, 1));
         continue;
       }
 
-      nextIdx = await this.waitOne();
+      nextIdx = await perMessageProcessor(m, config);
+    }
 
-      /* TODO: another check we need to do here is to see how kafkaJS is handling
-       * commits. Are they committing after a message is _processed_?
-       * In that case we need to turn off librdkafka's auto-commit, and commit
-       * inside this function.
-       */
+    if (nextIdx !== -1) {
+      this.#messageCache.return(nextIdx);
     }
   }
 
-  /* Internal polling loop.
-   * It accepts the same config object that `run` accepts, but config.eachBatch must be set. */
-  async #runInternalEachBatch(config) {
-    const concurrency = config.partitionsConsumedConcurrently;
-    let nextIdx = -1;
+  /**
+   * Internal polling loop.
+   * Spawns and awaits workers until disconnect is initiated.
+   */
+  async #runInternal(config) {
+    this.#concurrency = config.partitionsConsumedConcurrently;
+    const perMessageProcessor = config.eachMessage ? this.#messageProcessor : this.#batchProcessor;
+    this.#workers = [];
     while (!(await acquireOrLog(this.#lock, this.#logger)));
 
-    while (this.#state === ConsumerState.CONNECTED) {
-      /* Release lock and cleanup if we intend to disconnect. */
-      if (this.#disconnectStarted) {
-        const indices = await this.waitAll();
-        indices.forEach(idx => this.#messageCache.return(idx));
-        if (nextIdx !== -1) {
-          this.#messageCache.return(nextIdx);
-        }
-        nextIdx = -1;
-        this.#lock.release();
-        break;
-      }
+    while (!this.#disconnectStarted) {
+      this.#workerTerminationScheduled = false;
+      const workersToSpawn = Math.max(1, Math.min(this.#concurrency, this.#partitionCount));
+      this.#workers = Array(workersToSpawn).fill().map((_, i) => this.#worker(config, perMessageProcessor.bind(this), i));
+      await Promise.all(this.#workers);
 
-      /* Invalidate the message cache if needed */
-      const locallyStale = this.#messageCache.popLocallyStale();
-      if (this.#messageCache.isStale()) { /* global staleness */
-        const indices = await this.waitAll();
-        indices.forEach(idx => this.#messageCache.return(idx));
-        if (nextIdx !== -1) {
-          this.#messageCache.return(nextIdx);
-        }
-        nextIdx = -1;
+      /* One of the possible reasons for the workers to end is that the cache is globally stale.
+       * We need to take care of expiring it. */
+      if (this.#messageCache.isStale()) {
        await this.#clearCacheAndResetPositions();
-        continue;
-      } else if (locallyStale.length !== 0) { /* local staleness */
-        // TODO: is it correct to await some concurrent promises for eachMessage here?
-        // to be safe we can do it, but I don't think we really need to do that for
-        // any correctness reason.
-        await this.#clearCacheAndResetPositions(locallyStale);
-        continue;
       }
-
-      const m = await this.#consumeSingleCached(nextIdx).catch(e => {
-        /* Since this error cannot be exposed to the user in the current situation, just log and retry.
-         * This is due to restartOnFailure being set to always true. */
-        if (this.#logger)
-          this.#logger.error(`Consumer encountered error while consuming. Retrying. Error details: ${JSON.stringify(e)}`);
-      });
-
-      nextIdx = -1;
-
-      if (!m) {
-        // await any concurrency related promises right here if this is null, if any such promise exists.
-        // see note in consumeSingleCached
-        if (this.#runningPromises.length) {
-          nextIdx = await this.waitOne();
-        }
-        continue;
-      }
-
-      const p = this.#batchProcessor(m, config);
-      this.#runningPromises.push(p);
-      this.#savedIndexToPromiseIndex.push(m.index);
-
-      if (this.#runningPromises.length < concurrency) {
-        continue;
-      }
-
-      nextIdx = await this.waitOne();
     }
+
+    this.#lock.release();
   }
 
   /**
@@ -1582,6 +1524,7 @@
     }
 
     this.#disconnectStarted = true;
+    this.#workerTerminationScheduled = true;
    while (!(await acquireOrLog(this.#lock, this.#logger))); /* Just retry... */
 
    this.#state = ConsumerState.DISCONNECTING;
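
Taken together, #runInternal, #worker, and the disconnect path form a supervise-and-respawn loop: every worker watches the same termination flag, Promise.all is the barrier, the next iteration rebuilds the pool at a freshly computed size, and setting #disconnectStarted (together with the termination flag, as in the last hunk) ends the outer loop for good. A compact sketch of that outer shape, again using hypothetical names rather than the consumer's internals:

// Sketch only: the supervise-and-respawn loop around the worker pool.
async function superviseConsumer(state, makePool, expireStaleCache) {
  while (!state.disconnectStarted) {
    const pool = makePool();      // sized from the current partition count and concurrency
    await pool.done;              // async barrier: every worker has exited
    if (state.cacheStale) {
      await expireStaleCache();   // global staleness is one of the reasons workers stop
    }
  }
}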
