fix: PLT-844: Dynamic batch size based on annotation.result (#8187)

triklozoid · robot-ci-heartex · web-flow · commit 694150e4051d · 2025-08-21T11:53:56.000+04:00
Co-authored-by: robot-ci-heartex <robot-ci-heartex@users.noreply.github.com>
Co-authored-by: triklozoid <triklozoid@users.noreply.github.com>
diff --git a/label_studio/projects/models.py b/label_studio/projects/models.py
@@ -1109,14 +1109,41 @@ def _update_tasks_counters_and_task_states(
 
         return objs
 
+    def get_max_annotation_result_size(self):
+        """Get the maximum annotation result size for this project"""
+        # For SQLite, return 0: the query below uses PostgreSQL-style syntax (result::text)
+        if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
+            return 0
+
+        # Using raw SQL to ensure we use the specific index annotation_proj_result_octlen_idx
+        # which is optimized for this query pattern (project_id, octet_length DESC)
+        with connection.cursor() as cursor:
+            cursor.execute(
+                """
+                SELECT id,
+                       octet_length(result::text) AS bytes
+                FROM   task_completion
+                WHERE  project_id = %s
+                ORDER  BY octet_length(result::text) DESC
+                LIMIT  1
+            """,
+                [self.id],
+            )
+
+            row = cursor.fetchone()
+            if not row or not row[1]:
+                return 0
+
+            return row[1]
+
     def get_task_batch_size(self):
-        """Calculate optimal batch size based on task data size"""
+        """Calculate optimal batch size based on task data size and annotation result size"""
         # For SQLite, use default MAX_TASK_BATCH_SIZE
         if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
             return settings.MAX_TASK_BATCH_SIZE
 
-        # Using raw SQL to ensure we use the specific index task_proj_octlen_idx
-        # which is optimized for this query pattern (project_id, octet_length DESC)
+        # Get maximum task data size using the optimized index
+        max_task_size = 0
         with connection.cursor() as cursor:
             cursor.execute(
                 """
@@ -1131,10 +1158,17 @@ def get_task_batch_size(self):
             )
 
             row = cursor.fetchone()
-            if not row or not row[1]:
-                return settings.MAX_TASK_BATCH_SIZE
+            if row and row[1]:
+                max_task_size = row[1]
+
+        # Get maximum annotation result size using the new optimized index
+        max_annotation_size = self.get_max_annotation_result_size()
+
+        # Use the larger of the two sizes for batch calculation
+        max_data_size = max(max_task_size, max_annotation_size)
 
-            max_data_size = row[1]
+        if max_data_size == 0:
+            return settings.MAX_TASK_BATCH_SIZE
 
         batch_size = settings.TASK_DATA_PER_BATCH // max_data_size
 
@@ -1143,7 +1177,11 @@ def get_task_batch_size(self):
         elif batch_size < 1:
             batch_size = 1
 
-        logger.info(f'Project {self.id}: max task data size {max_data_size} bytes, calculated batch size {batch_size}')
+        logger.info(
+            f'Project {self.id}: max task size {max_task_size} bytes, '
+            f'max annotation size {max_annotation_size} bytes, '
+            f'calculated batch size {batch_size}'
+        )
         return batch_size
 
     def __str__(self):
diff --git a/label_studio/tasks/migrations/0057_annotation_proj_result_octlen_idx_async.py b/label_studio/tasks/migrations/0057_annotation_proj_result_octlen_idx_async.py
@@ -0,0 +1,71 @@
+# Generated by Django 5.1.9 on 2025-08-18 12:00
+
+from django.db import migrations
+from django.conf import settings
+from core.models import AsyncMigrationStatus
+from core.redis import start_job_async_or_sync
+import logging
+logger = logging.getLogger(__name__)
+
+IS_SQLITE = settings.DJANGO_DB == settings.DJANGO_DB_SQLITE
+
+migration_name = '0057_annotation_proj_result_octlen_idx_async'
+
+sql_create_index = (
+    'CREATE INDEX CONCURRENTLY IF NOT EXISTS annotation_proj_result_octlen_idx '
+    'ON task_completion (project_id, octet_length(result::text) DESC) '
+    'INCLUDE (id);'
+)
+sql_drop_index = (
+    'DROP INDEX CONCURRENTLY IF EXISTS annotation_proj_result_octlen_idx;'
+)
+
+def forward_migration(migration_name):
+    migration, created = AsyncMigrationStatus.objects.get_or_create(
+        name=migration_name,
+        defaults={'status': AsyncMigrationStatus.STATUS_STARTED},
+    )
+    if not created:
+        return
+    
+    logger.info(f'Start async migration {migration_name}')
+    from django.db import connection
+    cursor = connection.cursor()
+    cursor.execute(sql_create_index)
+    migration.status = AsyncMigrationStatus.STATUS_FINISHED
+    migration.save()
+    logger.info(f'Async migration {migration_name} complete')
+
+def backward_migration(migration_name):
+    migration = AsyncMigrationStatus.objects.create(
+        name=migration_name,
+        status=AsyncMigrationStatus.STATUS_STARTED,
+    )
+    logger.info(f'Start revert of async migration {migration_name}')
+    from django.db import connection
+    cursor = connection.cursor()
+    cursor.execute(sql_drop_index)
+    migration.status = AsyncMigrationStatus.STATUS_FINISHED
+    migration.save()
+    logger.info(f'Async migration {migration_name} revert complete')
+
+def forwards(apps, schema_editor):
+    if IS_SQLITE:
+        logger.info('SQLite execution')
+        logger.info('Skipping async index creation for non-PostgreSQL databases')
+        return
+
+    start_job_async_or_sync(forward_migration, migration_name=migration_name)
+
+def backwards(apps, schema_editor):
+    start_job_async_or_sync(backward_migration, migration_name=migration_name)
+
+class Migration(migrations.Migration):
+    atomic = False
+
+    dependencies = [
+        ("tasks", "0056_task_prediction_result_proj_gin_idx_async"),
+    ]
+    operations = [
+        migrations.RunPython(forwards, backwards),
+    ]