Commit ec141cd

🐛 Bugfix: uploading files with the same name now adds a suffix automatically to avoid duplication
🐛 Bugfix: knowledge base tasks now show up at the same time
🐛 Bugfix: no longer exceed max tokens for jina-embedding-v3 & jina-embedding-v4 when creating a knowledge base
1 parent fe346dc commit ec141cd

19 files changed: +2,272 −262 lines

backend/apps/file_management_app.py

Lines changed: 4 additions & 2 deletions
@@ -34,13 +34,15 @@ async def upload_files(
         destination: str = Form(...,
                                 description="Upload destination: 'local' or 'minio'"),
         folder: str = Form(
-            "attachments", description="Storage folder path for MinIO (optional)")
+            "attachments", description="Storage folder path for MinIO (optional)"),
+        index_name: Optional[str] = Form(
+            None, description="Knowledge base index for conflict resolution")
 ):
     if not file:
         raise HTTPException(status_code=HTTPStatus.BAD_REQUEST,
                             detail="No files in the request")
 
-    errors, uploaded_file_paths, uploaded_filenames = await upload_files_impl(destination, file, folder)
+    errors, uploaded_file_paths, uploaded_filenames = await upload_files_impl(destination, file, folder, index_name)
 
     if uploaded_file_paths:
         return JSONResponse(
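
The new index_name form field feeds the duplicate-name handling announced in the commit message; the renaming itself happens inside upload_files_impl, which is not part of this diff. A minimal sketch of suffix-based conflict resolution under that assumption (make_unique_filename and the example set of existing names are illustrative, not taken from the repository):

```python
import os
from typing import Set


def make_unique_filename(filename: str, existing: Set[str]) -> str:
    """Return filename unchanged, or name_1.ext, name_2.ext, ... until the name is unused."""
    if filename not in existing:
        return filename
    stem, ext = os.path.splitext(filename)
    counter = 1
    while f"{stem}_{counter}{ext}" in existing:
        counter += 1
    return f"{stem}_{counter}{ext}"


# Conflicts are resolved against the names already present in the target index.
existing_names = {"report.pdf", "report_1.pdf"}
print(make_unique_filename("report.pdf", existing_names))  # -> report_2.pdf
```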

backend/consts/const.py

Lines changed: 7 additions & 0 deletions
@@ -92,6 +92,13 @@
 REDIS_BACKEND_URL = os.getenv("REDIS_BACKEND_URL")
 REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
 FLOWER_PORT = int(os.getenv("FLOWER_PORT", "5555"))
+DP_REDIS_CHUNKS_WAIT_TIMEOUT_S = int(
+    os.getenv("DP_REDIS_CHUNKS_WAIT_TIMEOUT_S", "30"))
+DP_REDIS_CHUNKS_POLL_INTERVAL_MS = int(
+    os.getenv("DP_REDIS_CHUNKS_POLL_INTERVAL_MS", "200"))
+FORWARD_REDIS_RETRY_DELAY_S = int(
+    os.getenv("FORWARD_REDIS_RETRY_DELAY_S", "5"))
+FORWARD_REDIS_RETRY_MAX = int(os.getenv("FORWARD_REDIS_RETRY_MAX", "12"))
 
 
 # Ray Configuration
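
The two FORWARD_REDIS_* values are consumed by the forward task further down; the two DP_REDIS_CHUNKS_* values are not referenced in the hunks shown here, so the following is only a sketch of how a bounded wait on the chunks key could be derived from them (wait_for_chunks and its client argument are hypothetical):

```python
import time

from consts.const import (
    DP_REDIS_CHUNKS_POLL_INTERVAL_MS,
    DP_REDIS_CHUNKS_WAIT_TIMEOUT_S,
)


def wait_for_chunks(client, redis_key: str):
    """Poll Redis for redis_key until it appears or the configured timeout elapses."""
    deadline = time.monotonic() + DP_REDIS_CHUNKS_WAIT_TIMEOUT_S
    interval = DP_REDIS_CHUNKS_POLL_INTERVAL_MS / 1000.0
    while time.monotonic() < deadline:
        value = client.get(redis_key)
        if value is not None:
            return value
        time.sleep(interval)
    return None  # caller decides how to treat a timeout
```

With the defaults this amounts to roughly 30 s of waiting at 200 ms intervals, i.e. about 150 polls.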

backend/data_process/app.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@
     result_backend_always_retry=True,  # Always retry backend operations
     result_backend_max_retries=10,  # Max retries for backend operations
     task_time_limit=3600,  # 1 hour time limit per task
-    worker_prefetch_multiplier=4,  # Allow prefetching for better throughput
+    worker_prefetch_multiplier=1,  # Fair scheduling; avoid batchy prefetch
     worker_max_tasks_per_child=1000,  # Reduce restart frequency
     # Important for task chains
     task_acks_late=True,  # Tasks are acknowledged after completion
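
Dropping worker_prefetch_multiplier from 4 to 1 pairs with the task_acks_late=True setting already in this config and with the -Ofair flag added to the worker command line later in this commit (backend/data_process/worker.py). A condensed sketch of the three settings working together, assuming the Celery app object is named app as in the diff:

```python
from celery import Celery

app = Celery("data_process")
app.conf.update(
    task_acks_late=True,           # acknowledge only after the task finishes
    worker_prefetch_multiplier=1,  # each worker slot reserves one message at a time
    task_time_limit=3600,          # hard per-task limit (1 hour)
)
# Together with `-Ofair` on the worker, long-running tasks no longer hold a
# batch of prefetched messages, so short tasks queued behind them are not starved.
```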

backend/data_process/ray_actors.py

Lines changed: 59 additions & 6 deletions
@@ -1,9 +1,10 @@
 import logging
+import json
 from typing import Any, Dict, List, Optional
 
 import ray
 
-from consts.const import RAY_ACTOR_NUM_CPUS
+from consts.const import RAY_ACTOR_NUM_CPUS, REDIS_BACKEND_URL
 from database.attachment_db import get_file_stream
 from nexent.data_process import DataProcessCore
 
@@ -47,7 +48,7 @@ def process_file(
             List[Dict[str, Any]]: A list of dictionaries representing the processed chunks.
         """
         logger.info(
-            f"[RayActor] Processing file: {source}, destination: {destination}")
+            f"[RayActor] Processing start: source='{source}', destination='{destination}', strategy='{chunking_strategy}', task_id='{task_id}'")
 
         if task_id:
             params['task_id'] = task_id
@@ -69,11 +70,63 @@ def process_file(
             **params
         )
 
-        if not chunks:
+        if chunks is None:
             logger.warning(
-                f"[RayActor] file_process returned no chunks for {source}")
+                f"[RayActor] file_process returned None for source='{source}'")
+            return []
+        if not isinstance(chunks, list):
+            logger.error(
+                f"[RayActor] file_process returned non-list type {type(chunks)} for source='{source}'")
+            return []
+        if len(chunks) == 0:
+            logger.warning(
+                f"[RayActor] file_process returned empty list for source='{source}'")
             return []
 
-        logger.debug(
-            f"[RayActor] file_process returned {len(chunks)} chunks, returning as is.")
+        logger.info(
+            f"[RayActor] Processing done: produced {len(chunks)} chunks for source='{source}'")
         return chunks
+
+    def store_chunks_in_redis(self, redis_key: str, chunks: List[Dict[str, Any]]) -> bool:
+        """
+        Store processed chunks into Redis under a given key.
+
+        This is used to decouple Celery task execution from Ray processing, allowing
+        Celery to submit work and return immediately while Ray persists results for
+        a subsequent step to retrieve.
+        """
+        if not REDIS_BACKEND_URL:
+            logger.error(
+                "REDIS_BACKEND_URL is not configured; cannot store chunks.")
+            return False
+        try:
+            import redis
+            client = redis.Redis.from_url(
+                REDIS_BACKEND_URL, decode_responses=True)
+            # Use a compact JSON for storage
+            if chunks is None:
+                logger.error(
+                    f"[RayActor] store_chunks_in_redis received None chunks for key '{redis_key}'")
+                serialized = json.dumps([])
+            else:
+                try:
+                    serialized = json.dumps(chunks, ensure_ascii=False)
+                except Exception as ser_exc:
+                    logger.error(
+                        f"[RayActor] JSON serialization failed for key '{redis_key}': {ser_exc}")
+                    # Fallback to empty list to avoid poisoning Redis with invalid data
+                    serialized = json.dumps([])
+            client.set(redis_key, serialized)
+            # Optionally set an expiration to avoid leaks (e.g., 2 hours)
+            client.expire(redis_key, 2 * 60 * 60)
+            try:
+                count_logged = len(chunks) if isinstance(chunks, list) else 0
+            except Exception:
+                count_logged = 0
+            logger.info(
+                f"[RayActor] Stored {count_logged} chunks in Redis at key '{redis_key}', value_len={len(serialized)}")
            return True
+        except Exception as exc:
+            logger.error(
+                f"Failed to store chunks in Redis at key {redis_key}: {exc}")
+            return False
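
Because store_chunks_in_redis runs inside the Ray actor, the caller can hand it the ObjectRef returned by process_file and never materialize the chunks itself; Ray resolves a top-level ObjectRef argument before the actor method executes. A minimal usage sketch of that pattern (the file path, strategy and task id are illustrative, and the actor is assumed to be constructible without arguments here):

```python
import ray

from data_process.ray_actors import DataProcessorRayActor

actor = DataProcessorRayActor.remote()

# Returns an ObjectRef immediately; nothing blocks in the submitting process.
chunks_ref = actor.process_file.remote(
    "/tmp/example.pdf", "basic", destination="local", task_id="task-123")

# Ray resolves chunks_ref on the actor side, so the chunk payload never
# travels back through the caller.
ok_ref = actor.store_chunks_in_redis.remote("dp:task-123:chunks", chunks_ref)

print(ray.get(ok_ref))  # only the small boolean result is fetched back
```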

backend/data_process/tasks.py

Lines changed: 82 additions & 9 deletions
@@ -12,11 +12,17 @@
 import aiohttp
 import ray
 from celery import Task, chain, states
+from celery.exceptions import Retry
 
 from consts.const import ELASTICSEARCH_SERVICE
 from utils.file_management_utils import get_file_size
 from .app import app
 from .ray_actors import DataProcessorRayActor
+from consts.const import (
+    REDIS_BACKEND_URL,
+    FORWARD_REDIS_RETRY_DELAY_S,
+    FORWARD_REDIS_RETRY_MAX,
+)
 
 
 logger = logging.getLogger("data_process.tasks")
@@ -195,15 +201,21 @@ def process(
                 f"[{self.request.id}] PROCESS TASK: File size: {file_size_mb:.2f}MB")
 
             # The unified actor call, mapping 'file' source_type to 'local' destination
+            # Submit Ray work and do not block here
+            logger.debug(
+                f"[{self.request.id}] PROCESS TASK: Submitting Ray processing for source='{source}', strategy='{chunking_strategy}', destination='{source_type}'")
             chunks_ref = actor.process_file.remote(
                 source,
                 chunking_strategy,
                 destination=source_type,
                 task_id=task_id,
                 **params
             )
-
-            chunks = ray.get(chunks_ref)
+            # Persist chunks into Redis via Ray to decouple Celery
+            redis_key = f"dp:{task_id}:chunks"
+            actor.store_chunks_in_redis.remote(redis_key, chunks_ref)
+            logger.debug(
+                f"[{self.request.id}] PROCESS TASK: Scheduled store_chunks_in_redis for key '{redis_key}'")
 
             end_time = time.time()
             elapsed_time = end_time - start_time
@@ -217,14 +229,20 @@ def process(
                 f"[{self.request.id}] PROCESS TASK: Processing from URL: {source}")
 
             # For URL source, core.py expects a non-local destination to trigger URL fetching
+            logger.debug(
+                f"[{self.request.id}] PROCESS TASK: Submitting Ray processing for URL='{source}', strategy='{chunking_strategy}', destination='{source_type}'")
             chunks_ref = actor.process_file.remote(
                 source,
                 chunking_strategy,
                 destination=source_type,
                 task_id=task_id,
                 **params
             )
-            chunks = ray.get(chunks_ref)
+            # Persist chunks into Redis via Ray to decouple Celery
+            redis_key = f"dp:{task_id}:chunks"
+            actor.store_chunks_in_redis.remote(redis_key, chunks_ref)
+            logger.debug(
+                f"[{self.request.id}] PROCESS TASK: Scheduled store_chunks_in_redis for key '{redis_key}'")
             end_time = time.time()
             elapsed_time = end_time - start_time
             logger.info(
@@ -235,11 +253,11 @@ def process(
             raise NotImplementedError(
                 f"Source type '{source_type}' not yet supported")
 
-        # Update task state to SUCCESS with metadata
+        # Update task state to SUCCESS with metadata (without materializing chunks here)
        self.update_state(
            state=states.SUCCESS,
            meta={
-                'chunks_count': len(chunks),
+                'chunks_count': None,
                'processing_time': elapsed_time,
                'source': source,
                'index_name': index_name,
@@ -252,11 +270,12 @@ def process(
        )
 
        logger.info(
-            f"[{self.request.id}] PROCESS TASK: Successfully processed {len(chunks)} chunks in {elapsed_time:.2f}s")
+            f"[{self.request.id}] PROCESS TASK: Submitted for Ray processing; result will be fetched by forward")
 
-        # Prepare data for the next task in the chain
+        # Prepare data for the next task in the chain; pass redis_key
        returned_data = {
-            'chunks': chunks,
+            'redis_key': f"dp:{task_id}:chunks",
+            'chunks': None,
            'source': source,
            'index_name': index_name,
            'original_filename': original_filename,
@@ -329,6 +348,60 @@ def forward(
 
    try:
        chunks = processed_data.get('chunks')
+        # If chunks are not in payload, try loading from Redis via the redis_key
+        if (not chunks) and processed_data.get('redis_key'):
+            redis_key = processed_data.get('redis_key')
+            if not REDIS_BACKEND_URL:
+                raise Exception(json.dumps({
+                    "message": "REDIS_BACKEND_URL not configured to retrieve chunks",
+                    "index_name": original_index_name,
+                    "task_name": "forward",
+                    "source": original_source,
+                    "original_filename": filename
+                }, ensure_ascii=False))
+            try:
+                import redis
+                client = redis.Redis.from_url(
+                    REDIS_BACKEND_URL, decode_responses=True)
+                cached = client.get(redis_key)
+                if cached:
+                    try:
+                        logger.debug(
+                            f"[{self.request.id}] FORWARD TASK: Retrieved Redis key '{redis_key}', payload_length={len(cached)}")
+                        chunks = json.loads(cached)
+                    except json.JSONDecodeError as jde:
+                        # Log raw prefix to help diagnose incorrect writes
+                        raw_preview = cached[:120] if isinstance(
+                            cached, str) else str(type(cached))
+                        logger.error(
+                            f"[{self.request.id}] FORWARD TASK: JSON decode error for key '{redis_key}': {str(jde)}; raw_prefix={raw_preview!r}")
+                        raise
+                else:
+                    # No busy-wait: release the worker slot and retry later
+                    retry_num = getattr(self.request, 'retries', 0)
+                    logger.info(
+                        f"[{self.request.id}] FORWARD TASK: Chunks not yet available for key {redis_key}. Retry {retry_num + 1}/{FORWARD_REDIS_RETRY_MAX} in {FORWARD_REDIS_RETRY_DELAY_S}s")
+                    raise self.retry(
+                        countdown=FORWARD_REDIS_RETRY_DELAY_S,
+                        max_retries=FORWARD_REDIS_RETRY_MAX,
+                        exc=Exception(json.dumps({
+                            "message": "Chunks not ready in Redis; will retry",
+                            "index_name": original_index_name,
+                            "task_name": "forward",
+                            "source": original_source,
+                            "original_filename": filename
+                        }, ensure_ascii=False))
+                    )
+            except Retry:
+                raise
+            except Exception as exc:
+                raise Exception(json.dumps({
+                    "message": f"Failed to retrieve chunks from Redis: {str(exc)}",
+                    "index_name": original_index_name,
+                    "task_name": "forward",
+                    "source": original_source,
+                    "original_filename": filename
+                }, ensure_ascii=False))
        if processed_data.get('source'):
            original_source = processed_data.get('source')
        if processed_data.get('index_name'):
@@ -357,7 +430,7 @@ def forward(
                "index_name": original_index_name,
                "task_name": "forward",
                "source": original_source,
-                "original_filename": filename
+                "original_filename": original_filename
            }, ensure_ascii=False))
        if len(chunks) == 0:
            logger.warning(
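
One subtlety in the forward changes: Celery's self.retry() signals the retry by raising celery.exceptions.Retry, which is exactly why the broad except Exception is preceded by except Retry: raise. A self-contained illustration of that idiom (the poll_for_value task, value_ready helper and broker URL are hypothetical, not part of this commit):

```python
from celery import Celery
from celery.exceptions import Retry

app = Celery("demo", broker="redis://localhost:6379/0")


def value_ready(key: str):
    """Hypothetical check; stands in for the Redis GET in the real forward task."""
    return None


@app.task(bind=True)
def poll_for_value(self, key: str):
    try:
        value = value_ready(key)
        if value is None:
            # self.retry() raises Retry; execution never continues past this line.
            raise self.retry(countdown=5, max_retries=12)
        return value
    except Retry:
        # Let the retry signal propagate instead of swallowing it below.
        raise
    except Exception as exc:
        # Any other failure is wrapped and reported as a real error.
        raise RuntimeError(f"lookup failed for {key}: {exc}")
```

With the defaults added in consts/const.py, forward waits at most about 12 × 5 s = 60 s for the chunks key, well inside the 2-hour TTL the Ray actor places on it.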

backend/data_process/worker.py

Lines changed: 2 additions & 1 deletion
@@ -311,7 +311,8 @@ def start_worker():
         f'--hostname={worker_name}@%h',
         f'--concurrency={concurrency}',
         '--pool=threads',
-        '--task-events'
+        '--task-events',
+        '-Ofair'
     ]
 
     try:

backend/services/data_process_service.py

Lines changed: 20 additions & 10 deletions
@@ -14,14 +14,14 @@
 import redis
 import torch
 from PIL import Image
-from celery import states
+from celery import states, chain
 from transformers import CLIPProcessor, CLIPModel
 from nexent.data_process.core import DataProcessCore
 
 from consts.const import CLIP_MODEL_PATH, IMAGE_FILTER, REDIS_BACKEND_URL, REDIS_URL
 from consts.model import BatchTaskRequest
 from data_process.app import app as celery_app
-from data_process.tasks import process_and_forward
+from data_process.tasks import process, forward
 from data_process.utils import get_task_info, get_all_task_ids_from_redis
 
 # Configure logging
@@ -467,16 +467,26 @@ async def create_batch_tasks_impl(self, authorization: Optional[str], request: BatchTaskRequest):
                     f"Missing required field 'index_name' in source config: {source_config}")
                 continue
 
-            # Create individual task for this source
-            task_result = process_and_forward.delay(
-                source=source,
-                source_type=source_type,
-                chunking_strategy=chunking_strategy,
-                index_name=index_name,
-                original_filename=original_filename,
-                authorization=authorization
+            # Create and submit a chain: process -> forward
+            task_chain = chain(
+                process.s(
+                    source=source,
+                    source_type=source_type,
+                    chunking_strategy=chunking_strategy,
+                    index_name=index_name,
+                    original_filename=original_filename
+                ).set(queue='process_q'),
+                forward.s(
+                    index_name=index_name,
+                    source=source,
+                    source_type=source_type,
+                    original_filename=original_filename,
+                    authorization=authorization
+                ).set(queue='forward_q')
             )
 
+            task_result = task_chain.apply_async()
+
            task_ids.append(task_result.id)
            logger.debug(f"Created task {task_result.id} for source: {source}")
        logger.info(
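
One consequence of switching from process_and_forward.delay(...) to a chain: apply_async() on a chain returns the AsyncResult of the last link, so the ids collected in task_ids now identify the forward task, with the preceding process task reachable via .parent. A sketch of the same chain shape with that in mind (all argument values are illustrative):

```python
from celery import chain

from data_process.tasks import forward, process

task_chain = chain(
    process.s(source="/tmp/report.pdf", source_type="file",
              chunking_strategy="basic", index_name="kb_demo",
              original_filename="report.pdf").set(queue="process_q"),
    forward.s(index_name="kb_demo", source="/tmp/report.pdf",
              source_type="file", original_filename="report.pdf",
              authorization=None).set(queue="forward_q"),
)

result = task_chain.apply_async()
print("forward id:", result.id)         # what ends up in task_ids
print("process id:", result.parent.id)  # the first link of the chain
```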
