Add partition level concurrency to faux-eachBatch

milindl · milindl · commit b140a48caf0e · 2024-06-17T15:48:21.000+05:30
diff --git a/lib/kafkajs/_consumer.js b/lib/kafkajs/_consumer.js
@@ -665,7 +665,6 @@ class Consumer {
 
     return new Promise((resolve, reject) => {
       this.#internalClient.consume(this.#messageCache.maxSize, (err, messages) => {
-
         if (err) {
           reject(createKafkaJsErrorFromLibRdKafkaError(err));
           return;
@@ -882,6 +881,9 @@ class Consumer {
        * So - do nothing but a debug log, but at this point eachMessageProcessed is false.
        */
       this.#logger.debug(`Consumer encountered error while processing message. Error details: ${e}: ${e.stack}. The same message may be reprocessed.`);
+
+      /* TODO: log error if error type is not KafkaJSError and if no pause() has been called */
+      this.#logger.error(`Consumer encountered error while processing message. Error details: ${e}: ${e.stack}. The same message may be reprocessed.`);
     }
 
     /* If the message is unprocessed, due to an error, or because the user has not resolved it, we seek back. */
@@ -920,6 +922,82 @@ class Consumer {
     return m.index;
   }
 
+  /**
+   * Processes a batch message (a single message as of now): runs config.eachBatch on it, then seeks back or stores its offset.
+   *
+   * @param m Message as obtained from #consumeSingleCached; its topic, partition, offset, leaderEpoch and index are read.
+   * @param config Config as passed to run(); eachBatch and eachBatchAutoResolve are read.
+   * @returns {Promise<number>} the cache index (m.index) of the message that was processed.
+   */
+  async #batchProcessor(m, config) {
+    let eachMessageProcessed = false;
+    const payload = this.#createBatchPayload(m);
+    try {
+      await config.eachBatch(payload);
+      if (config.eachBatchAutoResolve) {
+        eachMessageProcessed = true;
+      } else {
+        eachMessageProcessed = payload._messageResolved;
+      }
+    } catch (e) {
+      /* It's not only possible, but expected that an error will be thrown by eachBatch.
+       * This is especially true since the pattern of pause() followed by throwing an error
+       * is encouraged. To meet the API contract, we seek one offset backward (which
+       * means seeking to the message offset).
+       * However, we don't do this inside the catch, but just outside it. This is because throwing an
+       * error is not the only case where we might want to seek back. We might also want to seek back
+       * if the user has not called `resolveOffset` manually when using eachBatch without
+       * eachBatchAutoResolve being set.
+       *
+       * So - do nothing here except log the error; at this point eachMessageProcessed needs to be false
+       * unless the user has explicitly marked the message as resolved.
+       */
+      this.#logger.debug(`Consumer encountered error while processing message. Error details: ${e}: ${e.stack}. The same message may be reprocessed.`);
+
+      /* TODO: log error if error type is not KafkaJSError and if no pause() has been called */
+      this.#logger.error(`Consumer encountered error while processing message. Error details: ${e}: ${e.stack}. The same message may be reprocessed.`);
+
+      /* The value of eachBatchAutoResolve is not important. The only case where a message is marked processed
+       * despite an error is if the user says so, and the user can use `resolveOffset` for both of the possible
+       * values eachBatchAutoResolve can take. */
+      if (config.eachBatch)
+        eachMessageProcessed = payload._messageResolved
+    }
+
+    /* If the message is unprocessed, due to an error, or because the user has not resolved it, we seek back. */
+    if (!eachMessageProcessed) {
+      await this.seek({
+        topic: m.topic,
+        partition: m.partition,
+        offset: m.offset,
+      });
+    }
+
+    /* Store the offsets we need to store, or at least record them for cache invalidation reasons. */
+    if (eachMessageProcessed) {
+      try {
+        if (!this.#userManagedStores) {
+          this.#internalClient.offsetsStore([{
+            topic: m.topic, partition: m.partition, offset: Number(m.offset) + 1, leaderEpoch: m.leaderEpoch
+          }]);
+        }
+        this.#lastConsumedOffsets.set(partitionKey(m), Number(m.offset) + 1);
+      } catch (e) {
+        /* Not much we can do, except log the error. */
+        if (this.#logger)
+          this.#logger.error(`Consumer encountered error while storing offset. Error details: ${JSON.stringify(e)}`);
+      }
+    }
+
+    /* Force an immediate seek here. It's possible that there are no more messages to be passed to the user,
+     * but the user seeked in the call to eachBatch, or else we encountered the error catch block.
+     * In that case, the results of that seek will never be reflected unless we do this. */
+    if (this.#checkPendingSeeks)
+      await this.#seekInternal();
+
+    return m.index;
+  }
+
   /**
    * Awaits the completion of a single message's processing.
    *
@@ -1030,130 +1108,69 @@ class Consumer {
   /* Internal polling loop.
    * It accepts the same config object that `run` accepts, but config.eachBatch must be set. */
   async #runInternalEachBatch(config) {
-    let savedIdx = -1;
-    while (this.#state === ConsumerState.CONNECTED) {
+    const concurrency = config.partitionsConsumedConcurrently;
+    let nextIdx = -1;
+    while (!(await acquireOrLog(this.#lock, this.#logger)));
 
-      /* We need to acquire a lock here, because we need to ensure that we don't
-      * disconnect while in the middle of processing a message. */
-      if (!(await acquireOrLog(this.#lock, this.#logger)))
-        continue;
+    while (this.#state === ConsumerState.CONNECTED) {
+      /* Release the lock and clean up if a disconnect has been started. */
+      if (this.#disconnectStarted) {
+        const indices = await this.waitAll();
+        indices.forEach(idx => this.#messageCache.return(idx));
+        if (nextIdx !== -1) {
+          this.#messageCache.return(nextIdx);
+        }
+        nextIdx = -1;
+        this.#lock.release();
+        break;
+      }
 
       /* Invalidate the message cache if needed */
       const locallyStale = this.#messageCache.popLocallyStale();
       if (this.#messageCache.isStale()) { /* global staleness */
+        const indices = await this.waitAll();
+        indices.forEach(idx => this.#messageCache.return(idx));
+        if (nextIdx !== -1) {
+          this.#messageCache.return(nextIdx);
+        }
+        nextIdx = -1;
         await this.#clearCacheAndResetPositions();
-        await this.#lock.release();
         continue;
       } else if (locallyStale.length !== 0) { /* local staleness */
+        // TODO: is it correct to await some of the concurrent eachBatch promises here?
+        // To be safe we could do it, but I don't think we really need to do that for
+        // any correctness reason.
         await this.#clearCacheAndResetPositions(locallyStale);
-        await this.#lock.release();
         continue;
       }
 
-      const m = await this.#consumeSingleCached(savedIdx).catch(e => {
+      const m = await this.#consumeSingleCached(nextIdx).catch(e => {
         /* Since this error cannot be exposed to the user in the current situation, just log and retry.
          * This is due to restartOnFailure being set to always true. */
         if (this.#logger)
           this.#logger.error(`Consumer encountered error while consuming. Retrying. Error details: ${JSON.stringify(e)}`);
       });
 
-      if (!m) {
-        savedIdx = -1;
-        await this.#lock.release();
-        continue;
-      }
-      savedIdx = m.index;
-
-      /* TODO: add partitionsConsumedConcurrently-based concurrency here.
-      * If we maintain a map of topic partitions to promises, and a counter,
-      * we can probably achieve it with the correct guarantees of ordering
-      * though to maximize performance, we need to consume only from partitions for which
-      * an eachMessage call is not already going.
-      * It's risky to consume, and then store the message in something like an
-      * array/list until it can be processed, because librdkafka marks it as
-      * 'stored'... but anyway - we can implement something like this.
-      */
-
-      /* Make pending seeks 'concrete'. */
-      if (this.#checkPendingSeeks) {
-        const invalidateMessage = await this.#seekInternal({ topic: m.topic, partition: m.partition });
-        if (invalidateMessage) {
-          /* Don't pass this message on to the user if this topic partition was seeked to. */
-          this.#lock.release();
-          continue;
-        }
-      }
+      nextIdx = -1;
 
-      let eachMessageProcessed = false;
-      const payload = this.#createBatchPayload(m);
-      try {
-        await config.eachBatch(payload);
-        if (config.eachBatchAutoResolve) {
-          eachMessageProcessed = true;
-        } else {
-          eachMessageProcessed = payload._messageResolved;
+      if (!m) {
+        // No message was returned: if any concurrent promises are still running, await one of them right here.
+        // See the note in #consumeSingleCached.
+        if (this.#runningPromises.length) {
+          nextIdx = await this.waitOne();
         }
-      } catch (e) {
-        /* It's not only possible, but expected that an error will be thrown by eachBatch.
-         * This is especially true since the pattern of pause() followed by throwing an error
-         * is encouraged. To meet the API contract, we seek one offset backward (which
-         * means seeking to the message offset).
-         * However, we don't do this inside the catch, but just outside it. This is because throwing an
-         * error is not the only case where we might want to seek back. We might want to seek back
-         * if the user has not called `resolveOffset` manually in case of using eachBatch without
-         * eachBatchAutoResolve being set.
-         *
-         * So - do nothing but a debug log, but at this point eachMessageProcessed needs to be false unless
-         * the user has explicitly marked it as true.
-         */
-        this.#logger.debug(`Consumer encountered error while processing message. Error details: ${e}: ${e.stack}. The same message may be reprocessed.`);
-
-        /* The value of eachBatchAutoResolve is not important. The only place where a message is marked processed
-         * despite an error is if the user says so, and the user can use resolveOffsets for both the possible
-         * values eachBatchAutoResolve can take. */
-        if (config.eachBatch)
-          eachMessageProcessed = payload._messageResolved
+        continue;
       }
 
-      /* If the message is unprocessed, due to an error, or because the user has not resolved it, we seek back. */
-      if (!eachMessageProcessed) {
-        await this.seek({
-          topic: m.topic,
-          partition: m.partition,
-          offset: m.offset,
-        });
-      }
+      const p = this.#batchProcessor(m, config);
+      this.#runningPromises.push(p);
+      this.#savedIndexToPromiseIndex.push(m.index);
 
-      /* Store the offsets we need to store, or at least record them for cache invalidation reasons. */
-      if (eachMessageProcessed) {
-        try {
-          if (!this.#userManagedStores) {
-            this.#internalClient.offsetsStore([{
-              topic: m.topic, partition: m.partition, offset: Number(m.offset) + 1, leaderEpoch: m.leaderEpoch
-            }]);
-          }
-          this.#lastConsumedOffsets.set(partitionKey(m), Number(m.offset) + 1);
-        } catch (e) {
-          /* Not much we can do, except log the error. */
-          if (this.#logger)
-            this.#logger.error(`Consumer encountered error while storing offset. Error details: ${JSON.stringify(e)}`);
-        }
+      if (this.#runningPromises.length < concurrency) {
+        continue;
       }
 
-      /* Force a immediate seek here. It's possible that there are no more messages to be passed to the user,
-       * but the user seeked in the call to eachMessage, or else we encountered the error catch block.
-       * In that case, the results of that seek will never be reflected unless we do this. */
-      if (this.#checkPendingSeeks)
-        await this.#seekInternal();
-
-      /* TODO: another check we need to do here is to see how kafkaJS is handling
-       * commits. Are they commmitting after a message is _processed_?
-       * In that case we need to turn off librdkafka's auto-commit, and commit
-       * inside this function.
-       */
-
-      /* Release the lock so that any pending disconnect can go through. */
-      await this.#lock.release();
+      nextIdx = await this.waitOne();
     }
   }