fix: maxPerJobConcurrency was not respected because status of pending chunks was not updated in redis/local queue

bdshadow · bdshadow · commit 81817f6bf926 · 2025-12-18T11:55:12.000+01:00
Signed-off-by: dmitrii.bocharov <bdshadow@gmail.com>
diff --git a/backend/app/src/test/kotlin/io/tolgee/batch/AbstractBatchJobsGeneralTest.kt b/backend/app/src/test/kotlin/io/tolgee/batch/AbstractBatchJobsGeneralTest.kt
@@ -343,4 +343,19 @@ abstract class AbstractBatchJobsGeneralTest :
       .id.assert
       .isNotEqualTo(anotherJobId)
   }
+
+  @Test
+  fun `mt job respects maxPerJobConcurrency`() {
+    val mtJob = util.runMtJob(100)
+
+    // For now, MT jobs have maxPerJobConcurrency = 1, and this probably won't change:
+    // if chunks ran in parallel, we would start hitting the OpenAI rate limits and even exceed them.
+    // Also, at the beginning we only check that the organization has availableCredits > 0;
+    // only once we have the result from OpenAI can we calculate how many credits to charge the organization.
+    val maxConcurrency = 1
+    util.assertAllowedMaxPerJobConcurrency(mtJob, maxConcurrency)
+    util.assertMaxPerJobConcurrencyIsLessThanOrEqualTo(maxConcurrency)
+    util.waitForJobSuccess(mtJob)
+    util.assertJobUnlocked()
+  }
 }
diff --git a/backend/app/src/test/kotlin/io/tolgee/batch/BatchJobTestUtil.kt b/backend/app/src/test/kotlin/io/tolgee/batch/BatchJobTestUtil.kt
@@ -7,6 +7,7 @@ import io.tolgee.batch.processors.DeleteKeysChunkProcessor
 import io.tolgee.batch.processors.PreTranslationByTmChunkProcessor
 import io.tolgee.batch.request.AutomationBjRequest
 import io.tolgee.batch.request.DeleteKeysRequest
+import io.tolgee.batch.request.MachineTranslationRequest
 import io.tolgee.batch.request.PreTranslationByTmRequest
 import io.tolgee.batch.state.BatchJobStateProvider
 import io.tolgee.component.CurrentDateProvider
@@ -43,6 +44,7 @@ import org.springframework.context.ApplicationContext
 import org.springframework.transaction.PlatformTransactionManager
 import java.time.Duration
 import java.util.Date
+import kotlin.apply
 import kotlin.coroutines.CoroutineContext
 
 class BatchJobTestUtil(
@@ -447,6 +449,49 @@ class BatchJobTestUtil(
     }
   }
 
+  fun runMtJob(
+    keyCount: Int,
+    author: UserAccount = testData.user,
+  ): BatchJob {
+    return executeInNewTransaction(transactionManager) {
+      batchJobService.startJob(
+        request =
+          MachineTranslationRequest().apply {
+            keyIds = (1L..keyCount).map { it }
+            targetLanguageIds =
+              listOf(
+                testData.projectBuilder
+                  .getLanguageByTag("cs")!!
+                  .self.id,
+              )
+          },
+        project = testData.projectBuilder.self,
+        author = author,
+        type = BatchJobType.MACHINE_TRANSLATE,
+        isHidden = false,
+      )
+    }
+  }
+
+  fun assertAllowedMaxPerJobConcurrency(
+    job: BatchJob,
+    maxConcurrency: Int,
+  ) {
+    batchJobService
+      .getJobDto(job.id)
+      .maxPerJobConcurrency.assert
+      .isEqualTo(maxConcurrency)
+  }
+
+  fun assertMaxPerJobConcurrencyIsLessThanOrEqualTo(maxConcurrency: Int) {
+    waitFor(pollTime = 100) {
+      batchJobConcurrentLauncher.runningJobs.assert
+        .size()
+        .isLessThanOrEqualTo(maxConcurrency)
+      batchJobChunkExecutionQueue.size == 0
+    }
+  }
+
   fun getSingleJob(): BatchJob = entityManager.createQuery("""from BatchJob""", BatchJob::class.java).singleResult
 
   private val batchJobProjectLockingManager: BatchJobProjectLockingManager
diff --git a/backend/data/src/main/kotlin/io/tolgee/batch/BatchJobConcurrentLauncher.kt b/backend/data/src/main/kotlin/io/tolgee/batch/BatchJobConcurrentLauncher.kt
@@ -263,12 +263,16 @@ class BatchJobConcurrentLauncher(
 
   private fun ExecutionQueueItem.trySetRunningState(): Boolean {
     return progressManager.trySetExecutionRunning(this.chunkExecutionId, this.jobId) {
+      val maxPerJobConcurrency = batchJobService.getJobDto(this.jobId).maxPerJobConcurrency
+      if (maxPerJobConcurrency == -1) {
+        return@trySetExecutionRunning true
+      }
       val count =
         it.values.count { executionState -> executionState.status == BatchJobChunkExecutionStatus.RUNNING }
       if (count == 0) {
         return@trySetExecutionRunning true
       }
-      batchJobService.getJobDto(this.jobId).maxPerJobConcurrency > count
+      maxPerJobConcurrency > count
     }
   }
 }
diff --git a/backend/data/src/main/kotlin/io/tolgee/batch/ProgressManager.kt b/backend/data/src/main/kotlin/io/tolgee/batch/ProgressManager.kt
@@ -42,6 +42,7 @@ class ProgressManager(
     return batchJobStateProvider.updateState(batchJobId) {
       if (canRunFn(it)) {
         if (it[executionId] != null) {
+          it[executionId]?.status = BatchJobChunkExecutionStatus.RUNNING
           return@updateState true
         }
         it[executionId] =
diff --git a/backend/data/src/main/kotlin/io/tolgee/batch/processors/MachineTranslationChunkProcessor.kt b/backend/data/src/main/kotlin/io/tolgee/batch/processors/MachineTranslationChunkProcessor.kt
@@ -47,6 +47,13 @@ class MachineTranslationChunkProcessor(
     }
   }
 
+  /**
+   * For MT jobs, this is set to 1 because we might otherwise start hitting rate limits on OpenAI,
+   * so we do not want a single job to consume them in parallel. Moreover, at the beginning we only check that
+   * the organization has `availableCredits > 0`, and only when we have the result from OpenAI are we able to
+   * calculate how many credits to charge the organization. If the job were parallelized too much,
+   * it could therefore end up well over the limit.
+   */
   override fun getMaxPerJobConcurrency(): Int {
     return 1
   }
diff --git a/backend/data/src/main/kotlin/io/tolgee/model/batch/BatchJob.kt b/backend/data/src/main/kotlin/io/tolgee/model/batch/BatchJob.kt
@@ -63,6 +63,11 @@ class BatchJob :
 
   val chunkedTarget get() = chunkTarget(chunkSize, target)
 
+  /**
+   * The maximum number of coroutines that may process this job at the same time, counted across all Tolgee
+   * instances (e.g. k8s pods). The default value is -1, which means that the concurrency is not limited.
+   * See `BatchJobConcurrentLauncher#ExecutionQueueItem.trySetRunningState()` for more info.
+   */
   var maxPerJobConcurrency: Int = -1
 
   @Enumerated(STRING)

Original file line number	Diff line number	Diff line change
`@@ -263,12 +263,16 @@ class BatchJobConcurrentLauncher(`
`263`	`263`
`264`	`264`	`private fun ExecutionQueueItem.trySetRunningState(): Boolean {`
`265`	`265`	`return progressManager.trySetExecutionRunning(this.chunkExecutionId, this.jobId) {`
	`266`	`+ val maxPerJobConcurrency = batchJobService.getJobDto(this.jobId).maxPerJobConcurrency`
	`267`	`+ if (maxPerJobConcurrency == -1) {`
	`268`	`+ return@trySetExecutionRunning true`
	`269`	`+ }`
`266`	`270`	`val count =`
`267`	`271`	`it.values.count { executionState -> executionState.status == BatchJobChunkExecutionStatus.RUNNING }`
`268`	`272`	`if (count == 0) {`
`269`	`273`	`return@trySetExecutionRunning true`
`270`	`274`	`}`
`271`		`- batchJobService.getJobDto(this.jobId).maxPerJobConcurrency > count`
	`275`	`+ maxPerJobConcurrency > count`
`272`	`276`	`}`
`273`	`277`	`}`
`274`	`278`	`}`
Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,7 @@ class ProgressManager(`
`42`	`42`	`return batchJobStateProvider.updateState(batchJobId) {`
`43`	`43`	`if (canRunFn(it)) {`
`44`	`44`	`if (it[executionId] != null) {`
	`45`	`+ it[executionId]?.status = BatchJobChunkExecutionStatus.RUNNING`
`45`	`46`	`return@updateState true`
`46`	`47`	`}`
`47`	`48`	`it[executionId] =`
Original file line number	Diff line number	Diff line change
`@@ -47,6 +47,13 @@ class MachineTranslationChunkProcessor(`
`47`	`47`	`}`
`48`	`48`	`}`
`49`	`49`
	`50`	`+ /**`
	`51`	`+ * For MT jobs, this is set to 1 because we might otherwise start hitting rate limits on OpenAI,`
	`52`	`+ * so we do not want a single job to consume them in parallel. Moreover, at the beginning we only check that`
	`53`	``+ * the organization has `availableCredits > 0`, and only when we have the result from OpenAI are we able to``
	`54`	`+ * calculate how many credits to charge the organization. If the job were parallelized too much,`
	`55`	`+ * it could therefore end up well over the limit.`
	`56`	`+ */`
`50`	`57`	`override fun getMaxPerJobConcurrency(): Int {`
`51`	`58`	`return 1`
`52`	`59`	`}`