Commit ec141cd

🐛 Bugfix: uploading files with the same name now adds a suffix automatically to avoid duplication
🐛 Bugfix: knowledge base tasks now show up at the same time
🐛 Bugfix: no longer exceed max tokens for jina-embedding-v3 & jina-embedding-v4 when creating a knowledge base
1 parent fe346dc commit ec141cd

19 files changed: +2,272 −262 lines

backend/apps/file_management_app.py

Lines changed: 4 additions & 2 deletions
@@ -34,13 +34,15 @@ async def upload_files(
         destination: str = Form(...,
                                 description="Upload destination: 'local' or 'minio'"),
         folder: str = Form(
-            "attachments", description="Storage folder path for MinIO (optional)")
+            "attachments", description="Storage folder path for MinIO (optional)"),
+        index_name: Optional[str] = Form(
+            None, description="Knowledge base index for conflict resolution")
 ):
     if not file:
         raise HTTPException(status_code=HTTPStatus.BAD_REQUEST,
                             detail="No files in the request")
 
-    errors, uploaded_file_paths, uploaded_filenames = await upload_files_impl(destination, file, folder)
+    errors, uploaded_file_paths, uploaded_filenames = await upload_files_impl(destination, file, folder, index_name)
 
     if uploaded_file_paths:
         return JSONResponse(
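
The new index_name form field feeds the duplicate-name handling announced in the commit message; the renaming itself happens inside upload_files_impl, which is not part of this diff. A minimal sketch of suffix-based conflict resolution under that assumption (make_unique_filename and the example set of existing names are illustrative, not taken from the repository):

```python
import os
from typing import Set


def make_unique_filename(filename: str, existing: Set[str]) -> str:
    """Return filename unchanged, or name_1.ext, name_2.ext, ... until the name is unused."""
    if filename not in existing:
        return filename
    stem, ext = os.path.splitext(filename)
    counter = 1
    while f"{stem}_{counter}{ext}" in existing:
        counter += 1
    return f"{stem}_{counter}{ext}"


# Conflicts are resolved against the names already present in the target index.
existing_names = {"report.pdf", "report_1.pdf"}
print(make_unique_filename("report.pdf", existing_names))  # -> report_2.pdf
```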

backend/consts/const.py

Lines changed: 7 additions & 0 deletions
@@ -92,6 +92,13 @@
 REDIS_BACKEND_URL = os.getenv("REDIS_BACKEND_URL")
 REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
 FLOWER_PORT = int(os.getenv("FLOWER_PORT", "5555"))
+DP_REDIS_CHUNKS_WAIT_TIMEOUT_S = int(
+    os.getenv("DP_REDIS_CHUNKS_WAIT_TIMEOUT_S", "30"))
+DP_REDIS_CHUNKS_POLL_INTERVAL_MS = int(
+    os.getenv("DP_REDIS_CHUNKS_POLL_INTERVAL_MS", "200"))
+FORWARD_REDIS_RETRY_DELAY_S = int(
+    os.getenv("FORWARD_REDIS_RETRY_DELAY_S", "5"))
+FORWARD_REDIS_RETRY_MAX = int(os.getenv("FORWARD_REDIS_RETRY_MAX", "12"))
 
 
 # Ray Configuration
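
The two FORWARD_REDIS_* values are consumed by the forward task further down; the two DP_REDIS_CHUNKS_* values are not referenced in the hunks shown here, so the following is only a sketch of how a bounded wait on the chunks key could be derived from them (wait_for_chunks and its client argument are hypothetical):

```python
import time

from consts.const import (
    DP_REDIS_CHUNKS_POLL_INTERVAL_MS,
    DP_REDIS_CHUNKS_WAIT_TIMEOUT_S,
)


def wait_for_chunks(client, redis_key: str):
    """Poll Redis for redis_key until it appears or the configured timeout elapses."""
    deadline = time.monotonic() + DP_REDIS_CHUNKS_WAIT_TIMEOUT_S
    interval = DP_REDIS_CHUNKS_POLL_INTERVAL_MS / 1000.0
    while time.monotonic() < deadline:
        value = client.get(redis_key)
        if value is not None:
            return value
        time.sleep(interval)
    return None  # caller decides how to treat a timeout
```

With the defaults this amounts to roughly 30 s of waiting at 200 ms intervals, i.e. about 150 polls.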

backend/data_process/app.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@
     result_backend_always_retry=True,  # Always retry backend operations
     result_backend_max_retries=10,  # Max retries for backend operations
     task_time_limit=3600,  # 1 hour time limit per task
-    worker_prefetch_multiplier=4,  # Allow prefetching for better throughput
+    worker_prefetch_multiplier=1,  # Fair scheduling; avoid batchy prefetch
     worker_max_tasks_per_child=1000,  # Reduce restart frequency
     # Important for task chains
     task_acks_late=True,  # Tasks are acknowledged after completion
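
Dropping worker_prefetch_multiplier from 4 to 1 pairs with the task_acks_late=True setting already in this config and with the -Ofair flag added to the worker command line later in this commit (backend/data_process/worker.py). A condensed sketch of the three settings working together, assuming the Celery app object is named app as in the diff:

```python
from celery import Celery

app = Celery("data_process")
app.conf.update(
    task_acks_late=True,           # acknowledge only after the task finishes
    worker_prefetch_multiplier=1,  # each worker slot reserves one message at a time
    task_time_limit=3600,          # hard per-task limit (1 hour)
)
# Together with `-Ofair` on the worker, long-running tasks no longer hold a
# batch of prefetched messages, so short tasks queued behind them are not starved.
```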

backend/data_process/ray_actors.py

Lines changed: 59 additions & 6 deletions
@@ -1,9 +1,10 @@
 import logging
+import json
 from typing import Any, Dict, List, Optional
 
 import ray
 
-from consts.const import RAY_ACTOR_NUM_CPUS
+from consts.const import RAY_ACTOR_NUM_CPUS, REDIS_BACKEND_URL
 from database.attachment_db import get_file_stream
 from nexent.data_process import DataProcessCore
 
@@ -47,7 +48,7 @@ def process_file(
             List[Dict[str, Any]]: A list of dictionaries representing the processed chunks.
         """
         logger.info(
-            f"[RayActor] Processing file: {source}, destination: {destination}")
+            f"[RayActor] Processing start: source='{source}', destination='{destination}', strategy='{chunking_strategy}', task_id='{task_id}'")
 
         if task_id:
             params['task_id'] = task_id
@@ -69,11 +70,63 @@ def process_file(
             **params
         )
 
-        if not chunks:
+        if chunks is None:
             logger.warning(
-                f"[RayActor] file_process returned no chunks for {source}")
+                f"[RayActor] file_process returned None for source='{source}'")
+            return []
+        if not isinstance(chunks, list):
+            logger.error(
+                f"[RayActor] file_process returned non-list type {type(chunks)} for source='{source}'")
+            return []
+        if len(chunks) == 0:
+            logger.warning(
+                f"[RayActor] file_process returned empty list for source='{source}'")
             return []
 
-        logger.debug(
-            f"[RayActor] file_process returned {len(chunks)} chunks, returning as is.")
+        logger.info(
+            f"[RayActor] Processing done: produced {len(chunks)} chunks for source='{source}'")
         return chunks
+
+    def store_chunks_in_redis(self, redis_key: str, chunks: List[Dict[str, Any]]) -> bool:
+        """
+        Store processed chunks into Redis under a given key.
+
+        This is used to decouple Celery task execution from Ray processing, allowing
+        Celery to submit work and return immediately while Ray persists results for
+        a subsequent step to retrieve.
+        """
+        if not REDIS_BACKEND_URL:
+            logger.error(
+                "REDIS_BACKEND_URL is not configured; cannot store chunks.")
+            return False
+        try:
+            import redis
+            client = redis.Redis.from_url(
+                REDIS_BACKEND_URL, decode_responses=True)
+            # Use a compact JSON for storage
+            if chunks is None:
+                logger.error(
+                    f"[RayActor] store_chunks_in_redis received None chunks for key '{redis_key}'")
+                serialized = json.dumps([])
+            else:
+                try:
+                    serialized = json.dumps(chunks, ensure_ascii=False)
+                except Exception as ser_exc:
+                    logger.error(
+                        f"[RayActor] JSON serialization failed for key '{redis_key}': {ser_exc}")
+                    # Fallback to empty list to avoid poisoning Redis with invalid data
+                    serialized = json.dumps([])
+            client.set(redis_key, serialized)
+            # Optionally set an expiration to avoid leaks (e.g., 2 hours)
+            client.expire(redis_key, 2 * 60 * 60)
+            try:
+                count_logged = len(chunks) if isinstance(chunks, list) else 0
+            except Exception:
+                count_logged = 0
+            logger.info(
+                f"[RayActor] Stored {count_logged} chunks in Redis at key '{redis_key}', value_len={len(serialized)}")
            return True
+        except Exception as exc:
+            logger.error(
+                f"Failed to store chunks in Redis at key {redis_key}: {exc}")
+            return False
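
Because store_chunks_in_redis runs inside the Ray actor, the caller can hand it the ObjectRef returned by process_file and never materialize the chunks itself; Ray resolves a top-level ObjectRef argument before the actor method executes. A minimal usage sketch of that pattern (the file path, strategy and task id are illustrative, and the actor is assumed to be constructible without arguments here):

```python
import ray

from data_process.ray_actors import DataProcessorRayActor

actor = DataProcessorRayActor.remote()

# Returns an ObjectRef immediately; nothing blocks in the submitting process.
chunks_ref = actor.process_file.remote(
    "/tmp/example.pdf", "basic", destination="local", task_id="task-123")

# Ray resolves chunks_ref on the actor side, so the chunk payload never
# travels back through the caller.
ok_ref = actor.store_chunks_in_redis.remote("dp:task-123:chunks", chunks_ref)

print(ray.get(ok_ref))  # only the small boolean result is fetched back
```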

backend/data_process/tasks.py

Lines changed: 82 additions & 9 deletions
@@ -12,11 +12,17 @@
 import aiohttp
 import ray
 from celery import Task, chain, states
+from celery.exceptions import Retry
 
 from consts.const import ELASTICSEARCH_SERVICE
 from utils.file_management_utils import get_file_size
 from .app import app
 from .ray_actors import DataProcessorRayActor
+from consts.const import (
+    REDIS_BACKEND_URL,
+    FORWARD_REDIS_RETRY_DELAY_S,
+    FORWARD_REDIS_RETRY_MAX,
+)
 
 
 logger = logging.getLogger("data_process.tasks")
@@ -195,15 +201,21 @@ def process(
                 f"[{self.request.id}] PROCESS TASK: File size: {file_size_mb:.2f}MB")
 
             # The unified actor call, mapping 'file' source_type to 'local' destination
+            # Submit Ray work and do not block here
+            logger.debug(
+                f"[{self.request.id}] PROCESS TASK: Submitting Ray processing for source='{source}', strategy='{chunking_strategy}', destination='{source_type}'")
             chunks_ref = actor.process_file.remote(
                 source,
                 chunking_strategy,
                 destination=source_type,
                 task_id=task_id,
                 **params
             )
-
-            chunks = ray.get(chunks_ref)
+            # Persist chunks into Redis via Ray to decouple Celery
+            redis_key = f"dp:{task_id}:chunks"
+            actor.store_chunks_in_redis.remote(redis_key, chunks_ref)
+            logger.debug(
+                f"[{self.request.id}] PROCESS TASK: Scheduled store_chunks_in_redis for key '{redis_key}'")
 
             end_time = time.time()
             elapsed_time = end_time - start_time
@@ -217,14 +229,20 @@ def process(
                 f"[{self.request.id}] PROCESS TASK: Processing from URL: {source}")
 
             # For URL source, core.py expects a non-local destination to trigger URL fetching
+            logger.debug(
+                f"[{self.request.id}] PROCESS TASK: Submitting Ray processing for URL='{source}', strategy='{chunking_strategy}', destination='{source_type}'")
             chunks_ref = actor.process_file.remote(
                 source,
                 chunking_strategy,
                 destination=source_type,
                 task_id=task_id,
                 **params
             )
-            chunks = ray.get(chunks_ref)
+            # Persist chunks into Redis via Ray to decouple Celery
+            redis_key = f"dp:{task_id}:chunks"
+            actor.store_chunks_in_redis.remote(redis_key, chunks_ref)
+            logger.debug(
+                f"[{self.request.id}] PROCESS TASK: Scheduled store_chunks_in_redis for key '{redis_key}'")
             end_time = time.time()
             elapsed_time = end_time - start_time
             logger.info(
@@ -235,11 +253,11 @@ def process(
             raise NotImplementedError(
                 f"Source type '{source_type}' not yet supported")
 
-        # Update task state to SUCCESS with metadata
+        # Update task state to SUCCESS with metadata (without materializing chunks here)
        self.update_state(
            state=states.SUCCESS,
            meta={
-                'chunks_count': len(chunks),
+                'chunks_count': None,
                'processing_time': elapsed_time,
                'source': source,
                'index_name': index_name,
@@ -252,11 +270,12 @@ def process(
        )
 
        logger.info(
-            f"[{self.request.id}] PROCESS TASK: Successfully processed {len(chunks)} chunks in {elapsed_time:.2f}s")
+            f"[{self.request.id}] PROCESS TASK: Submitted for Ray processing; result will be fetched by forward")
 
-        # Prepare data for the next task in the chain
+        # Prepare data for the next task in the chain; pass redis_key
        returned_data = {
-            'chunks': chunks,
+            'redis_key': f"dp:{task_id}:chunks",
+            'chunks': None,
            'source': source,
            'index_name': index_name,
            'original_filename': original_filename,
@@ -329,6 +348,60 @@ def forward(
 
    try:
        chunks = processed_data.get('chunks')
+        # If chunks are not in payload, try loading from Redis via the redis_key
+        if (not chunks) and processed_data.get('redis_key'):
+            redis_key = processed_data.get('redis_key')
+            if not REDIS_BACKEND_URL:
+                raise Exception(json.dumps({
+                    "message": "REDIS_BACKEND_URL not configured to retrieve chunks",
+                    "index_name": original_index_name,
+                    "task_name": "forward",
+                    "source": original_source,
+                    "original_filename": filename
+                }, ensure_ascii=False))
+            try:
+                import redis
+                client = redis.Redis.from_url(
+                    REDIS_BACKEND_URL, decode_responses=True)
+                cached = client.get(redis_key)
+                if cached:
+                    try:
+                        logger.debug(
+                            f"[{self.request.id}] FORWARD TASK: Retrieved Redis key '{redis_key}', payload_length={len(cached)}")
+                        chunks = json.loads(cached)
+                    except json.JSONDecodeError as jde:
+                        # Log raw prefix to help diagnose incorrect writes
+                        raw_preview = cached[:120] if isinstance(
+                            cached, str) else str(type(cached))
+                        logger.error(
+                            f"[{self.request.id}] FORWARD TASK: JSON decode error for key '{redis_key}': {str(jde)}; raw_prefix={raw_preview!r}")
+                        raise
+                else:
+                    # No busy-wait: release the worker slot and retry later
+                    retry_num = getattr(self.request, 'retries', 0)
+                    logger.info(
+                        f"[{self.request.id}] FORWARD TASK: Chunks not yet available for key {redis_key}. Retry {retry_num + 1}/{FORWARD_REDIS_RETRY_MAX} in {FORWARD_REDIS_RETRY_DELAY_S}s")
+                    raise self.retry(
+                        countdown=FORWARD_REDIS_RETRY_DELAY_S,
+                        max_retries=FORWARD_REDIS_RETRY_MAX,
+                        exc=Exception(json.dumps({
+                            "message": "Chunks not ready in Redis; will retry",
+                            "index_name": original_index_name,
+                            "task_name": "forward",
+                            "source": original_source,
+                            "original_filename": filename
+                        }, ensure_ascii=False))
+                    )
+            except Retry:
+                raise
+            except Exception as exc:
+                raise Exception(json.dumps({
+                    "message": f"Failed to retrieve chunks from Redis: {str(exc)}",
+                    "index_name": original_index_name,
+                    "task_name": "forward",
+                    "source": original_source,
+                    "original_filename": filename
+                }, ensure_ascii=False))
        if processed_data.get('source'):
            original_source = processed_data.get('source')
        if processed_data.get('index_name'):
@@ -357,7 +430,7 @@ def forward(
                "index_name": original_index_name,
                "task_name": "forward",
                "source": original_source,
-                "original_filename": filename
+                "original_filename": original_filename
            }, ensure_ascii=False))
        if len(chunks) == 0:
            logger.warning(
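
One subtlety in the forward changes: Celery's self.retry() signals the retry by raising celery.exceptions.Retry, which is exactly why the broad except Exception is preceded by except Retry: raise. A self-contained illustration of that idiom (the poll_for_value task, value_ready helper and broker URL are hypothetical, not part of this commit):

```python
from celery import Celery
from celery.exceptions import Retry

app = Celery("demo", broker="redis://localhost:6379/0")


def value_ready(key: str):
    """Hypothetical check; stands in for the Redis GET in the real forward task."""
    return None


@app.task(bind=True)
def poll_for_value(self, key: str):
    try:
        value = value_ready(key)
        if value is None:
            # self.retry() raises Retry; execution never continues past this line.
            raise self.retry(countdown=5, max_retries=12)
        return value
    except Retry:
        # Let the retry signal propagate instead of swallowing it below.
        raise
    except Exception as exc:
        # Any other failure is wrapped and reported as a real error.
        raise RuntimeError(f"lookup failed for {key}: {exc}")
```

With the defaults added in consts/const.py, forward waits at most about 12 × 5 s = 60 s for the chunks key, well inside the 2-hour TTL the Ray actor places on it.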

backend/data_process/worker.py

Lines changed: 2 additions & 1 deletion
@@ -311,7 +311,8 @@ def start_worker():
         f'--hostname={worker_name}@%h',
         f'--concurrency={concurrency}',
         '--pool=threads',
-        '--task-events'
+        '--task-events',
+        '-Ofair'
     ]
 
     try:

backend/services/data_process_service.py

Lines changed: 20 additions & 10 deletions
@@ -14,14 +14,14 @@
 import redis
 import torch
 from PIL import Image
-from celery import states
+from celery import states, chain
 from transformers import CLIPProcessor, CLIPModel
 from nexent.data_process.core import DataProcessCore
 
 from consts.const import CLIP_MODEL_PATH, IMAGE_FILTER, REDIS_BACKEND_URL, REDIS_URL
 from consts.model import BatchTaskRequest
 from data_process.app import app as celery_app
-from data_process.tasks import process_and_forward
+from data_process.tasks import process, forward
 from data_process.utils import get_task_info, get_all_task_ids_from_redis
 
 # Configure logging
@@ -467,16 +467,26 @@ async def create_batch_tasks_impl(self, authorization: Optional[str], request: BatchTaskRequest):
                     f"Missing required field 'index_name' in source config: {source_config}")
                 continue
 
-            # Create individual task for this source
-            task_result = process_and_forward.delay(
-                source=source,
-                source_type=source_type,
-                chunking_strategy=chunking_strategy,
-                index_name=index_name,
-                original_filename=original_filename,
-                authorization=authorization
+            # Create and submit a chain: process -> forward
+            task_chain = chain(
+                process.s(
+                    source=source,
+                    source_type=source_type,
+                    chunking_strategy=chunking_strategy,
+                    index_name=index_name,
+                    original_filename=original_filename
+                ).set(queue='process_q'),
+                forward.s(
+                    index_name=index_name,
+                    source=source,
+                    source_type=source_type,
+                    original_filename=original_filename,
+                    authorization=authorization
+                ).set(queue='forward_q')
             )
 
+            task_result = task_chain.apply_async()
+
            task_ids.append(task_result.id)
            logger.debug(f"Created task {task_result.id} for source: {source}")
        logger.info(
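
One consequence of switching from process_and_forward.delay(...) to a chain: apply_async() on a chain returns the AsyncResult of the last link, so the ids collected in task_ids now identify the forward task, with the preceding process task reachable via .parent. A sketch of the same chain shape with that in mind (all argument values are illustrative):

```python
from celery import chain

from data_process.tasks import forward, process

task_chain = chain(
    process.s(source="/tmp/report.pdf", source_type="file",
              chunking_strategy="basic", index_name="kb_demo",
              original_filename="report.pdf").set(queue="process_q"),
    forward.s(index_name="kb_demo", source="/tmp/report.pdf",
              source_type="file", original_filename="report.pdf",
              authorization=None).set(queue="forward_q"),
)

result = task_chain.apply_async()
print("forward id:", result.id)         # what ends up in task_ids
print("process id:", result.parent.id)  # the first link of the chain
```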
