@@ -15,19 +15,106 @@ class DocumentService:
1515
def __init__(self):
    """Initialize the document service.

    Chooses the object-storage client and records the bucket name.

    Environment variables read:
        SKIP_MINIO: when it equals "true" (case-insensitive) an in-memory
            mock client is used and the bucket-existence check is skipped.
        MINIO_URL, MINIO_ACCESS_KEY, MINIO_SECRET_KEY: connection settings
            for the real client; each falls back to a local default.
        MINIO_BUCKET: bucket name, defaulting to 'concepts'.
    """
    # Evaluate the flag exactly once so that the client selection and the
    # bucket check below can never disagree with each other.
    skip_minio = os.getenv("SKIP_MINIO", "false").lower() == "true"

    if skip_minio:
        print("SKIP_MINIO is set. Mocking MinIO client for tests.")
        self.s3_client = self._create_mock_s3_client()
    else:
        # Initialize S3/MinIO client
        self.s3_client = boto3.client(
            's3',
            endpoint_url=os.getenv('MINIO_URL', 'http://localhost:9000'),
            aws_access_key_id=os.getenv('MINIO_ACCESS_KEY', 'minioadmin'),
            aws_secret_access_key=os.getenv('MINIO_SECRET_KEY', 'minioadmin'),
            region_name='us-east-1',
            config=boto3.session.Config(signature_version='s3v4'),
        )

    self.bucket_name = os.getenv('MINIO_BUCKET', 'concepts')

    # Only a real backend has a bucket that may need creating.
    if not skip_minio:
        self._ensure_bucket_exists()
38+
def _create_mock_s3_client(self):
    """Create an in-memory stand-in for the S3/MinIO client, for tests.

    The returned object implements only the client calls defined below:
    ``create_bucket``, ``head_bucket``, ``upload_fileobj``,
    ``list_objects_v2``, ``delete_objects``, ``delete_object`` and
    ``get_paginator``. Uploaded objects are tracked per bucket in the
    ``mock_objects`` dict; the uploaded bytes themselves are not read or
    kept (every object reports a placeholder content and a size of 100).

    Returns:
        A fresh mock client with no stored objects.
    """

    class MockSession:
        """Stand-in for the object reached via ``client.meta.session``."""

        def close(self):
            # Must accept ``self``: a zero-argument callable stored on the
            # class would raise TypeError when called on an instance.
            return None

    class MockMeta:
        """Stand-in for ``client.meta``; exposes only ``session``."""

        def __init__(self):
            self.session = MockSession()

    class MockPaginator:
        """Paginator that serves everything in a single page."""

        def __init__(self, client, operation):
            self.client = client
            self.operation = operation

        def paginate(self, **kwargs):
            # For list_objects_v2, return a single page with all objects.
            # Any other operation yields no pages at all.
            if self.operation == 'list_objects_v2':
                yield self.client.list_objects_v2(**kwargs)

    class MockS3Client:
        """Dictionary-backed subset of the S3 client interface."""

        def __init__(self):
            # Maps bucket name -> {object key -> stored attributes}.
            self.mock_objects = {}
            self.meta = MockMeta()

        def create_bucket(self, **kwargs):
            return {}

        def head_bucket(self, **kwargs):
            return {}

        def upload_fileobj(self, file_obj, bucket, key,
                           ExtraArgs=None, Callback=None, Config=None):
            # The optional arguments are accepted (and ignored) so calls
            # that pass them to a real client also work against the mock.
            self.mock_objects.setdefault(bucket, {})[key] = {
                'content': 'mock_content',
                'size': 100,
                'last_modified': datetime.datetime.now(),
            }
            return {}

        def list_objects_v2(self, **kwargs):
            stored = self.mock_objects.get(kwargs.get('Bucket'))
            if not stored:
                return {}

            # Treat an explicit Prefix=None the same as "no prefix".
            prefix = kwargs.get('Prefix') or ''
            contents = [
                {
                    'Key': key,
                    'Size': obj['size'],
                    'LastModified': obj['last_modified'],
                }
                for key, obj in stored.items()
                if key.startswith(prefix)
            ]
            # The 'Contents' key is omitted entirely when nothing matches.
            return {'Contents': contents} if contents else {}

        def delete_objects(self, **kwargs):
            bucket = kwargs.get('Bucket')
            objects = kwargs.get('Delete', {}).get('Objects', [])

            stored = self.mock_objects.get(bucket)
            if stored is not None:
                for obj in objects:
                    stored.pop(obj.get('Key'), None)

            # Every requested key is reported as deleted, present or not.
            return {'Deleted': [{'Key': obj.get('Key')} for obj in objects]}

        def delete_object(self, **kwargs):
            stored = self.mock_objects.get(kwargs.get('Bucket'))
            if stored is not None:
                stored.pop(kwargs.get('Key'), None)
            return {}

        def get_paginator(self, operation_name):
            return MockPaginator(self, operation_name)

    return MockS3Client()
31118
32119 def _ensure_bucket_exists (self ):
33120 """Ensure the S3/MinIO bucket exists"""
@@ -62,15 +149,18 @@ def process_document(self, file, concept_id: str) -> ProcessedDocument:
62149 # Split text into chunks
63150 chunks = self ._split_text (text )
64151
152+ # Ensure we have at least one chunk even if text extraction failed
153+ if not chunks :
154+ chunks = [f"Failed to extract meaningful text from { filename } " ]
155+
65156 # Store chunks in vector database
66- if chunks :
67- metadatas = [{
68- "document_id" : document_id ,
69- "concept_id" : concept_id ,
70- "filename" : filename
71- } for _ in chunks ]
72-
73- vector_store_service .add_texts (texts = chunks , metadatas = metadatas )
157+ metadatas = [{
158+ "document_id" : document_id ,
159+ "concept_id" : concept_id ,
160+ "filename" : filename
161+ } for _ in chunks ]
162+
163+ vector_store_service .add_texts (texts = chunks , metadatas = metadatas )
74164
75165 # Create and return processed document info
76166 # Determine document type based on file extension
0 commit comments