
Commit 144fd7c

triklozoid and farioas authored
fix: PLT-838: Reimport memory usage optimization (#8105)
Co-authored-by: Sergey Zhuk <[email protected]>
1 parent e65787a · commit 144fd7c

4 files changed: +302 additions, -41 deletions

label_studio/core/settings/base.py

Lines changed: 2 additions & 0 deletions
@@ -555,6 +555,8 @@
 # Total size of task data (in bytes) to process per batch - used to calculate dynamic batch sizes
 # For example: if task data is 10MB, batch will be ~5 tasks to stay under 50MB limit
 TASK_DATA_PER_BATCH = int(get_env('TASK_DATA_PER_BATCH', 50 * 1024 * 1024))  # 50 MB in bytes
+# Batch size for streaming reimport operations to reduce memory usage
+REIMPORT_BATCH_SIZE = int(get_env('REIMPORT_BATCH_SIZE', 1000))
 # Batch size for processing prediction imports to avoid memory issues with large datasets
 PREDICTION_IMPORT_BATCH_SIZE = int(get_env('PREDICTION_IMPORT_BATCH_SIZE', 500))
 PROJECT_TITLE_MIN_LEN = 3
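
As context for the two settings above: TASK_DATA_PER_BATCH is a byte budget used elsewhere to derive dynamic batch sizes, while the new REIMPORT_BATCH_SIZE is a fixed task count consumed by the streaming reimport introduced below. A minimal sketch of the byte-budget arithmetic described in the comment (the helper name is ours, not code from this commit):

# Illustrative only: the arithmetic behind the settings comment above.
# estimate_batch_size() is a hypothetical helper, not part of this commit.
TASK_DATA_PER_BATCH = 50 * 1024 * 1024  # 50 MB, the default above

def estimate_batch_size(per_task_bytes: int) -> int:
    """Roughly how many tasks fit under the per-batch byte budget."""
    return max(1, TASK_DATA_PER_BATCH // max(1, per_task_bytes))

print(estimate_batch_size(10 * 1024 * 1024))  # ~5 tasks when each task carries 10 MB of data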

label_studio/data_import/functions.py

Lines changed: 168 additions & 41 deletions
@@ -3,10 +3,12 @@
 import traceback
 from typing import Callable, Optional
 
+from core.feature_flags import flag_set
 from core.utils.common import load_func
 from django.conf import settings
 from django.db import transaction
 from projects.models import ProjectImport, ProjectReimport, ProjectSummary
+from tasks.models import Task
 from users.models import User
 from webhooks.models import WebhookAction
 from webhooks.utils import emit_webhooks_for_instance
@@ -131,6 +133,125 @@ def reformat_predictions(tasks, preannotated_from_fields):
 post_process_reimport = load_func(settings.POST_PROCESS_REIMPORT)
 
 
+def _async_reimport_background_streaming(reimport, project, organization_id, user):
+    """Streaming version of reimport that processes tasks in batches to reduce memory usage"""
+    try:
+        # Get batch size from settings or use default
+        batch_size = settings.REIMPORT_BATCH_SIZE
+
+        # Initialize counters
+        total_task_count = 0
+        total_annotation_count = 0
+        total_prediction_count = 0
+        all_found_formats = {}
+        all_data_columns = set()
+        all_created_task_ids = []
+
+        # Remove old tasks once before starting
+        with transaction.atomic():
+            project.remove_tasks_by_file_uploads(reimport.file_upload_ids)
+
+        # Process tasks in batches
+        batch_number = 0
+        for batch_tasks, batch_formats, batch_columns in FileUpload.load_tasks_from_uploaded_files_streaming(
+            project, reimport.file_upload_ids, files_as_tasks_list=reimport.files_as_tasks_list, batch_size=batch_size
+        ):
+            if not batch_tasks:
+                logger.info(f'Empty batch received for reimport {reimport.id}')
+                continue
+
+            batch_number += 1
+            logger.info(f'Processing batch {batch_number} with {len(batch_tasks)} tasks for reimport {reimport.id}')
+
+            # Process batch in transaction
+            with transaction.atomic():
+                # Lock summary for update to avoid race conditions
+                summary = ProjectSummary.objects.select_for_update().get(project=project)
+
+                # Serialize and save batch
+                serializer = ImportApiSerializer(
+                    data=batch_tasks, many=True, context={'project': project, 'user': user}
+                )
+                serializer.is_valid(raise_exception=True)
+                batch_db_tasks = serializer.save(project_id=project.id)
+
+                # Collect task IDs for later use
+                all_created_task_ids.extend([t.id for t in batch_db_tasks])
+
+                # Update batch counters
+                batch_task_count = len(batch_db_tasks)
+                batch_annotation_count = len(serializer.db_annotations)
+                batch_prediction_count = len(serializer.db_predictions)
+
+                total_task_count += batch_task_count
+                total_annotation_count += batch_annotation_count
+                total_prediction_count += batch_prediction_count
+
+                # Update formats and columns
+                all_found_formats.update(batch_formats)
+                if batch_columns:
+                    if not all_data_columns:
+                        all_data_columns = batch_columns
+                    else:
+                        all_data_columns &= batch_columns
+
+                # Update data columns in summary
+                summary.update_data_columns(batch_db_tasks)
+
+            logger.info(
+                f'Batch {batch_number} processed successfully: {batch_task_count} tasks, '
+                f'{batch_annotation_count} annotations, {batch_prediction_count} predictions'
+            )
+
+        # After all batches are processed, emit webhooks and update task states once
+        if all_created_task_ids:
+            logger.info(
+                f'Finalizing reimport: emitting webhooks and updating task states for {len(all_created_task_ids)} tasks'
+            )
+
+            # Emit webhooks for all tasks at once (passing list of IDs)
+            emit_webhooks_for_instance(organization_id, project, WebhookAction.TASKS_CREATED, all_created_task_ids)
+
+            # Update task states for all tasks at once
+            all_tasks_queryset = Task.objects.filter(id__in=all_created_task_ids)
+            recalculate_stats_counts = {
+                'task_count': total_task_count,
+                'annotation_count': total_annotation_count,
+                'prediction_count': total_prediction_count,
+            }
+
+            project.update_tasks_counters_and_task_states(
+                tasks_queryset=all_tasks_queryset,
+                maximum_annotations_changed=False,
+                overlap_cohort_percentage_changed=False,
+                tasks_number_changed=True,
+                recalculate_stats_counts=recalculate_stats_counts,
+            )
+            logger.info('Tasks bulk_update finished (async streaming reimport)')
+
+        # Update reimport with final statistics
+        reimport.task_count = total_task_count
+        reimport.annotation_count = total_annotation_count
+        reimport.prediction_count = total_prediction_count
+        reimport.found_formats = all_found_formats
+        reimport.data_columns = list(all_data_columns)
+        reimport.status = ProjectReimport.Status.COMPLETED
+        reimport.save()
+
+        logger.info(f'Streaming reimport {reimport.id} completed: {total_task_count} tasks imported')
+
+        # Run post-processing
+        post_process_reimport(reimport)
+
+    except Exception as e:
+        logger.error(f'Error in streaming reimport {reimport.id}: {str(e)}', exc_info=True)
+        reimport.status = ProjectReimport.Status.FAILED
+        reimport.traceback = traceback.format_exc()
+        reimport.error = str(e)
+        reimport.save()
+        raise
+
+
 def async_reimport_background(reimport_id, organization_id, user, **kwargs):
 
     with transaction.atomic():
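
The function above keeps memory bounded by holding only one batch of task dicts at a time: each batch is validated and saved in its own transaction with the ProjectSummary row locked, cheap aggregates (counters, formats, created task IDs) are carried across batches, and webhooks plus counter recalculation run once at the end. A framework-free sketch of that accumulation pattern (load_batches and process_batch are illustrative stand-ins, not Label Studio code):

from typing import Iterable, Iterator


def load_batches(tasks: Iterable[dict], batch_size: int) -> Iterator[list]:
    """Yield tasks in lists of at most batch_size, so only one batch lives in memory."""
    batch = []
    for task in tasks:
        batch.append(task)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        yield batch


def process_batch(batch: list) -> int:
    """Stand-in for the per-batch transaction; returns how many rows were 'written'."""
    return len(batch)


total_task_count = 0
all_created_ids = []
for batch in load_batches(({'id': i, 'data': {'text': f'task {i}'}} for i in range(2500)), batch_size=1000):
    total_task_count += process_batch(batch)        # per-batch work
    all_created_ids.extend(t['id'] for t in batch)  # cheap bookkeeping kept across batches

print(total_task_count, len(all_created_ids))  # 2500 2500; finalization happens once, after the loop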
@@ -147,50 +268,56 @@ def async_reimport_background(reimport_id, organization_id, user, **kwargs):
 
     project = reimport.project
 
-    tasks, found_formats, data_columns = FileUpload.load_tasks_from_uploaded_files(
-        reimport.project, reimport.file_upload_ids, files_as_tasks_list=reimport.files_as_tasks_list
-    )
+    # Check feature flag for memory improvement
+    if flag_set('fflag_fix_back_plt_838_reimport_memory_improvement_05082025_short', user='auto'):
+        logger.info(f'Using streaming reimport for project {project.id}')
+        _async_reimport_background_streaming(reimport, project, organization_id, user)
+    else:
+        # Original implementation
+        tasks, found_formats, data_columns = FileUpload.load_tasks_from_uploaded_files(
+            reimport.project, reimport.file_upload_ids, files_as_tasks_list=reimport.files_as_tasks_list
+        )
 
-    with transaction.atomic():
-        # Lock summary for update to avoid race conditions
-        summary = ProjectSummary.objects.select_for_update().get(project=project)
+        with transaction.atomic():
+            # Lock summary for update to avoid race conditions
+            summary = ProjectSummary.objects.select_for_update().get(project=project)
 
-        project.remove_tasks_by_file_uploads(reimport.file_upload_ids)
-        serializer = ImportApiSerializer(data=tasks, many=True, context={'project': project, 'user': user})
-        serializer.is_valid(raise_exception=True)
-        tasks = serializer.save(project_id=project.id)
-        emit_webhooks_for_instance(organization_id, project, WebhookAction.TASKS_CREATED, tasks)
+            project.remove_tasks_by_file_uploads(reimport.file_upload_ids)
+            serializer = ImportApiSerializer(data=tasks, many=True, context={'project': project, 'user': user})
+            serializer.is_valid(raise_exception=True)
+            tasks = serializer.save(project_id=project.id)
+            emit_webhooks_for_instance(organization_id, project, WebhookAction.TASKS_CREATED, tasks)
 
-        task_count = len(tasks)
-        annotation_count = len(serializer.db_annotations)
-        prediction_count = len(serializer.db_predictions)
-
-        recalculate_stats_counts = {
-            'task_count': task_count,
-            'annotation_count': annotation_count,
-            'prediction_count': prediction_count,
-        }
-
-        # Update counters (like total_annotations) for new tasks and after bulk update tasks stats. It should be a
-        # single operation as counters affect bulk is_labeled update
-        project.update_tasks_counters_and_task_states(
-            tasks_queryset=tasks,
-            maximum_annotations_changed=False,
-            overlap_cohort_percentage_changed=False,
-            tasks_number_changed=True,
-            recalculate_stats_counts=recalculate_stats_counts,
-        )
-        logger.info('Tasks bulk_update finished (async reimport)')
+            task_count = len(tasks)
+            annotation_count = len(serializer.db_annotations)
+            prediction_count = len(serializer.db_predictions)
 
-        summary.update_data_columns(tasks)
-        # TODO: summary.update_created_annotations_and_labels
+            recalculate_stats_counts = {
+                'task_count': task_count,
+                'annotation_count': annotation_count,
+                'prediction_count': prediction_count,
+            }
+
+            # Update counters (like total_annotations) for new tasks and after bulk update tasks stats. It should be a
+            # single operation as counters affect bulk is_labeled update
+            project.update_tasks_counters_and_task_states(
+                tasks_queryset=tasks,
+                maximum_annotations_changed=False,
+                overlap_cohort_percentage_changed=False,
+                tasks_number_changed=True,
+                recalculate_stats_counts=recalculate_stats_counts,
+            )
+            logger.info('Tasks bulk_update finished (async reimport)')
+
+            summary.update_data_columns(tasks)
+            # TODO: summary.update_created_annotations_and_labels
 
-    reimport.task_count = task_count
-    reimport.annotation_count = annotation_count
-    reimport.prediction_count = prediction_count
-    reimport.found_formats = found_formats
-    reimport.data_columns = list(data_columns)
-    reimport.status = ProjectReimport.Status.COMPLETED
-    reimport.save()
+        reimport.task_count = task_count
+        reimport.annotation_count = annotation_count
+        reimport.prediction_count = prediction_count
+        reimport.found_formats = found_formats
+        reimport.data_columns = list(data_columns)
+        reimport.status = ProjectReimport.Status.COMPLETED
+        reimport.save()
 
-    post_process_reimport(reimport)
+        post_process_reimport(reimport)
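
The gate above only routes reimports through `_async_reimport_background_streaming` when the `fflag_fix_back_plt_838_reimport_memory_improvement_05082025_short` flag is on; otherwise the original in-memory path runs unchanged. A stripped-down model of that dispatch, with an environment-variable stub standing in for `core.feature_flags.flag_set` (the stub and both handler names are ours, not Label Studio code):

import os


def is_flag_on(flag_name: str) -> bool:
    """Stand-in for flag_set(); here the flag is simply read from the environment."""
    return os.getenv(flag_name, 'false').lower() == 'true'


def streaming_reimport(batches) -> int:
    return sum(len(batch) for batch in batches)  # consumes one batch at a time


def legacy_reimport(batches) -> int:
    tasks = [task for batch in batches for task in batch]  # materializes everything first
    return len(tasks)


def reimport(batches) -> int:
    if is_flag_on('FFLAG_REIMPORT_MEMORY_IMPROVEMENT'):
        return streaming_reimport(batches)
    return legacy_reimport(batches)


batches = ([{'data': {'text': f'task {i}'}} for i in range(5)] for _ in range(3))
print(reimport(batches))  # 15 on either path; only the streaming path avoids holding all 15 at once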

label_studio/data_import/models.py

Lines changed: 57 additions & 0 deletions
@@ -197,6 +197,63 @@ def load_tasks_from_uploaded_files(
 
         return tasks, dict(Counter(fileformats)), common_data_fields
 
+    @classmethod
+    def load_tasks_from_uploaded_files_streaming(
+        cls, project, file_upload_ids=None, formats=None, files_as_tasks_list=True, batch_size=5000
+    ):
+        """Stream tasks from uploaded files in batches to reduce memory usage"""
+        fileformats = []
+        common_data_fields = set()
+        batch = []
+        total_yielded = 0
+
+        # scan all files
+        file_uploads = FileUpload.objects.filter(project=project)
+        if file_upload_ids:
+            file_uploads = file_uploads.filter(id__in=file_upload_ids)
+
+        for file_upload in file_uploads:
+            file_format = file_upload.format
+            if formats and file_format not in formats:
+                continue
+
+            new_tasks = file_upload.read_tasks(files_as_tasks_list)
+            fileformats.append(file_format)
+
+            # Validate data fields consistency
+            if new_tasks:
+                new_data_fields = set(new_tasks[0]['data'].keys())
+                if not common_data_fields:
+                    common_data_fields = new_data_fields
+                elif not common_data_fields.intersection(new_data_fields):
+                    raise ValidationError(
+                        _old_vs_new_data_keys_inconsistency_message(
+                            new_data_fields, common_data_fields, file_upload.file.name
+                        )
+                    )
+                else:
+                    common_data_fields &= new_data_fields
+
+            # Add file_upload_id to tasks and batch them
+            for task in new_tasks:
+                task['file_upload_id'] = file_upload.id
+                batch.append(task)
+
+                # Yield batch when it reaches the size limit
+                if len(batch) >= batch_size:
+                    yield batch, dict(Counter(fileformats)), common_data_fields
+                    total_yielded += len(batch)
+                    batch = []
+
+        # Yield remaining tasks if any
+        if batch:
+            yield batch, dict(Counter(fileformats)), common_data_fields
+            total_yielded += len(batch)
+
+        # If no tasks were yielded, return empty batch with metadata
+        if total_yielded == 0:
+            yield [], dict(Counter(fileformats)), common_data_fields
+
 
 def _old_vs_new_data_keys_inconsistency_message(new_data_keys, old_data_keys, current_file):
     new_data_keys_list = ','.join(new_data_keys)
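
Each yield from `load_tasks_from_uploaded_files_streaming` is a `(batch_tasks, found_formats, common_data_fields)` tuple whose metadata reflects only the files scanned so far, which is why the streaming reimport above merges formats and intersects columns across batches. A framework-free model of that contract (stream_tasks and the sample files are illustrative, not repository code):

from collections import Counter


def stream_tasks(files, batch_size=2):
    """Yield (batch, formats_so_far, common_fields_so_far), mimicking the generator's shape."""
    fileformats, common_fields, batch = [], set(), []
    for file_format, tasks in files:
        fileformats.append(file_format)
        fields = set(tasks[0]['data']) if tasks else set()
        common_fields = fields if not common_fields else common_fields & fields
        for task in tasks:
            batch.append(task)
            if len(batch) >= batch_size:
                yield batch, dict(Counter(fileformats)), common_fields
                batch = []
    if batch:
        yield batch, dict(Counter(fileformats)), common_fields


files = [
    ('.json', [{'data': {'text': 'a'}}, {'data': {'text': 'b'}}, {'data': {'text': 'c'}}]),
    ('.csv', [{'data': {'text': 'd'}}]),
]
for tasks, formats, columns in stream_tasks(files, batch_size=2):
    print(len(tasks), formats, sorted(columns))
# 2 {'.json': 1} ['text']            <- metadata covers only the first file so far
# 2 {'.json': 1, '.csv': 1} ['text']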
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+"""Test streaming import functionality for memory optimization"""
+from unittest.mock import MagicMock, patch
+
+import pytest
+from data_import.models import FileUpload
+from organizations.tests.factories import OrganizationFactory
+from projects.tests.factories import ProjectFactory
+from users.tests.factories import UserFactory
+
+pytestmark = pytest.mark.django_db
+
+
+class TestStreamingImport:
+    @pytest.fixture
+    def user(self):
+        return UserFactory()
+
+    @pytest.fixture
+    def organization(self):
+        return OrganizationFactory()
+
+    @pytest.fixture
+    def project(self, user, organization):
+        return ProjectFactory(
+            created_by=user,
+            organization=organization,
+            label_config='<View><Text name="text" value="$text"/><Choices name="label" toName="text"><Choice value="pos"/><Choice value="neg"/></Choices></View>',
+        )
+
+    def test_load_tasks_from_uploaded_files_streaming_basic(self, user, project):
+        """Test basic streaming functionality with small batches"""
+        # Mock file upload objects
+        with patch.object(FileUpload.objects, 'filter') as mock_filter:
+            mock_file_upload1 = MagicMock()
+            mock_file_upload1.format = '.json'
+            mock_file_upload1.id = 1
+            mock_file_upload1.read_tasks.return_value = [{'data': {'text': f'Task {i}'}} for i in range(10)]
+
+            mock_file_upload2 = MagicMock()
+            mock_file_upload2.format = '.json'
+            mock_file_upload2.id = 2
+            mock_file_upload2.read_tasks.return_value = [{'data': {'text': f'Task {i+10}'}} for i in range(10)]
+
+            mock_filter.return_value = [mock_file_upload1, mock_file_upload2]
+
+            # Test streaming with batch size 5
+            batches = list(FileUpload.load_tasks_from_uploaded_files_streaming(project, batch_size=5))
+
+            # Should have 4 batches (20 tasks / 5 per batch)
+            assert len(batches) == 4
+
+            # Check batch sizes
+            assert len(batches[0][0]) == 5  # First batch tasks
+            assert len(batches[1][0]) == 5  # Second batch tasks
+            assert len(batches[2][0]) == 5  # Third batch tasks
+            assert len(batches[3][0]) == 5  # Fourth batch tasks
+
+            # Check that all tasks have file_upload_id
+            for batch_tasks, _, _ in batches:
+                for task in batch_tasks:
+                    assert 'file_upload_id' in task
+
+    def test_load_tasks_from_uploaded_files_streaming_empty(self, project):
+        """Test streaming with no file uploads"""
+        # Mock empty file uploads
+        with patch.object(FileUpload.objects, 'filter') as mock_filter:
+            mock_filter.return_value = []
+
+            batches = list(FileUpload.load_tasks_from_uploaded_files_streaming(project, batch_size=5))
+
+            # Should have one empty batch with metadata
+            assert len(batches) == 1
+            assert len(batches[0][0]) == 0  # Empty tasks
+            assert batches[0][1] == {}  # Empty formats
+            assert batches[0][2] == set()  # Empty columns