 import uuid
 import logging
 import traceback
+import zipfile
 from pathlib import Path
 from typing import Optional, List
 from datetime import datetime
@@ -313,6 +314,180 @@ async def upload_multiple_files(
     )


+def run_zip_indexing(job_id: str, zip_path: Path, collection_dir: Path, collection: str):
+    """
+    Extract ZIP and run indexing pipeline.
+
+    Args:
+        job_id: Job identifier for tracking
+        zip_path: Path to uploaded ZIP file
+        collection_dir: Target directory for extracted files
+        collection: Target collection name
+    """
+    logger.info(f"[{job_id}] Starting ZIP extraction from {zip_path}")
+
+    try:
+        jobs[job_id]["status"] = "running"
+        jobs[job_id]["stage"] = "extracting_zip"
+        jobs[job_id]["message"] = "Extracting ZIP archive"
+        jobs[job_id]["progress"] = 0.05
+
+        # Extract ZIP, filtering out macOS metadata and hidden files
+        extracted_files = []
+        with zipfile.ZipFile(zip_path, 'r') as zf:
+            for name in zf.namelist():
+                # Skip macOS metadata, hidden files, and directories
+                if name.startswith('__MACOSX') or name.startswith('.') or name.endswith('/'):
+                    continue
+                # Skip nested hidden files (e.g., folder/.hidden)
+                if '/.' in name:
+                    continue
+
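+                # Note: ZipFile.extract() strips absolute paths and drops ".." components,
+                # which mitigates zip-slip when extracting untrusted archives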
+                zf.extract(name, collection_dir)
+                extracted_files.append(name)
+
+        # Remove ZIP after extraction
+        zip_path.unlink()
+
+        logger.info(f"[{job_id}] Extracted {len(extracted_files)} files from ZIP")
+        jobs[job_id]["message"] = f"Extracted {len(extracted_files)} files"
+        jobs[job_id]["progress"] = 0.15
+        jobs[job_id]["filename"] = f"{len(extracted_files)} files"
+
+        if not extracted_files:
+            jobs[job_id]["status"] = "completed"
+            jobs[job_id]["stage"] = "completed"
+            jobs[job_id]["message"] = "ZIP was empty or contained only hidden files"
+            jobs[job_id]["completed_at"] = datetime.utcnow().isoformat()
+            return
+
+        # Import RagifyPipeline
+        from ragify import RagifyPipeline
+        from lib.config import RagifyConfig
+        from lib.tika_check import check_tika_available
+
+        # Configure
+        config = RagifyConfig.default()
+        config.qdrant.collection = collection
+
+        # Check Tika availability
+        tika_status = check_tika_available()
+        use_tika = tika_status['can_use_tika']
+        logger.info(f"[{job_id}] Tika available: {use_tika}")
+
+        # Progress callback
+        def update_progress(stage: str, progress: float):
+            # Scale progress: extraction was 0-0.15, pipeline is 0.15-1.0
+            scaled_progress = 0.15 + (progress * 0.85)
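+            # e.g. pipeline progress 0.5 maps to 0.15 + 0.5 * 0.85 = 0.575 overall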
+            jobs[job_id]["stage"] = stage
+            jobs[job_id]["progress"] = scaled_progress
+
+        # Run pipeline
+        pipeline = RagifyPipeline(config, use_tika=use_tika)
+        stats = pipeline.process_directory(collection_dir, progress_callback=update_progress)
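+        # stats is expected to carry 'processed', 'failed', and 'chunks' counts (used below)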
+
+        # Update job with results
+        jobs[job_id]["progress"] = 1.0
+        jobs[job_id]["status"] = "completed"
+        jobs[job_id]["stage"] = "completed"
+        jobs[job_id]["message"] = (
+            f"Indexed {stats['processed']}/{stats['processed'] + stats['failed']} files, "
+            f"{stats['chunks']} chunks"
+        )
+        jobs[job_id]["completed_at"] = datetime.utcnow().isoformat()
+
+        logger.info(f"[{job_id}] ZIP indexing COMPLETED: {stats['processed']} files, {stats['chunks']} chunks")
+
+    except Exception as e:
+        error_msg = str(e)
+        logger.error(f"[{job_id}] ZIP indexing FAILED: {error_msg}")
+        logger.error(f"[{job_id}] Stack trace:\n{traceback.format_exc()}")
+
+        # Clean up the ZIP if it still exists
+        if zip_path.exists():
+            try:
+                zip_path.unlink()
+            except Exception:
+                pass
+
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["stage"] = "failed"
+        jobs[job_id]["message"] = error_msg
+        jobs[job_id]["completed_at"] = datetime.utcnow().isoformat()
+
+
+@router.post("/upload-zip")
+async def upload_zip(
+    background_tasks: BackgroundTasks,
+    file: UploadFile = File(...),
+    collection: str = Form(default="documentation")
+):
+    """
+    Upload a ZIP file for extraction and indexing.
+
+    The ZIP is extracted server-side, then all files are processed
+    by RagifyPipeline as a single job.
+
+    Args:
+        file: ZIP file to upload
+        collection: Target collection name
+
+    Returns:
+        dict: Job information
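+
+    Example (sketch; the exact URL depends on how this router is mounted):
+        curl -F "file=@docs.zip" -F "collection=documentation" \
+             http://localhost:8000/upload-zip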
| 437 | + """ |
| 438 | + # Trigger cleanup |
| 439 | + cleanup_old_files() |
| 440 | + |
| 441 | + # Validate file |
| 442 | + if not file.filename: |
| 443 | + raise HTTPException(status_code=400, detail="No filename provided") |
| 444 | + |
| 445 | + if not file.filename.endswith('.zip'): |
| 446 | + raise HTTPException(status_code=400, detail="File must be a ZIP archive") |
| 447 | + |
| 448 | + # Create collection directory |
| 449 | + collection_dir = COLLECTIONS_DIR / collection |
| 450 | + collection_dir.mkdir(parents=True, exist_ok=True) |
| 451 | + |
| 452 | + # Save ZIP temporarily with unique name |
| 453 | + zip_path = collection_dir / f"_upload_{uuid.uuid4().hex}.zip" |
| 454 | + try: |
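+        # file.read() loads the whole upload into memory; fine for modest archives,
+        # but very large ZIPs would need a streamed copy instead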
+        content = await file.read()
+        zip_path.write_bytes(content)
+        logger.info(f"Saved ZIP: {zip_path} ({len(content)} bytes)")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to save ZIP: {e}")
+
+    # Create job record
+    job_id = str(uuid.uuid4())
+    jobs[job_id] = {
+        "job_id": job_id,
+        "status": "pending",
+        "stage": "pending",
+        "collection": collection,
+        "filename": "ZIP archive",
+        "progress": 0.0,
+        "message": "ZIP uploaded, extraction starting",
+        "created_at": datetime.utcnow().isoformat(),
+        "completed_at": None
+    }
+
+    # Start background processing
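+    # BackgroundTasks runs this after the response is sent, so the client gets the job_id immediately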
+    background_tasks.add_task(
+        run_zip_indexing,
+        job_id,
+        zip_path,
+        collection_dir,
+        collection
+    )
+
+    return JobCreate(
+        job_id=job_id,
+        status="pending",
+        message=f"ZIP uploaded to collection '{collection}', extraction and indexing started"
+    )
+
+
 @router.get("/jobs/{job_id}")
 async def get_job_status(job_id: str):
     """