@@ -17,6 +17,9 @@
 from tempfile import NamedTemporaryFile
 import re
 from langchain_community.document_loaders import YoutubeLoader
+from langchain.document_loaders import WikipediaLoader
+import warnings
+warnings.filterwarnings("ignore")
 
 load_dotenv()
 logging.basicConfig(format='%(asctime)s - %(message)s', level='INFO')
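The new module-level `warnings.filterwarnings("ignore")` silences every warning for the entire process, including deprecation notices from the langchain imports. If that ever proves too broad, a scoped alternative from the standard library is the context-manager form (a sketch only; `noisy_call` is a hypothetical placeholder):

```python
import warnings

# Suppress warnings only around one noisy call instead of process-wide.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    noisy_call()  # hypothetical function that emits warnings
```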
@@ -126,7 +129,7 @@ def check_url_source(url):
     except Exception as e:
         raise e
 
-def create_source_node_graph_url(uri, userName, password, source_url, max_limit, query_source, model, aws_access_key_id=None, aws_secret_access_key=None):
+def create_source_node_graph_url(uri, userName, password, source_url, max_limit, wiki_query, model, aws_access_key_id=None, aws_secret_access_key=None):
     """
     Creates a source node in Neo4jGraph and sets properties.
 
@@ -141,9 +144,6 @@ def create_source_node_graph_url(uri, userName, password, source_url, max_limit,
       Success or Failed message of node creation
     """
     try:
-        # if aws_access_key_id != None and aws_secret_access_key != None:
-        #     os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
-        #     os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
         graph = Neo4jGraph(url=uri, username=userName, password=password)
         source_type = check_url_source(source_url)
         print(f"source type URL:{source_type}")
@@ -164,7 +164,7 @@ def create_source_node_graph_url(uri, userName, password, source_url, max_limit,
             file_size = file_info['file_size_bytes']
             s3_file_path = str(source_url + file_name)
             try:
-                create_source_node(graph, file_name, file_size, file_type, source_type, model, s3_file_path, aws_access_key_id)
+                create_source_node(graph, file_name.split('/')[-1], file_size, file_type, source_type, model, s3_file_path, aws_access_key_id)
                 success_count += 1
             except Exception as e:
                 err_flag = 1
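The `file_name.split('/')[-1]` change matters because S3 listings return the full object key, prefixes included; splitting on `/` stores only the base filename on the source node. A one-line illustration (the key below is made up):

```python
# Hypothetical S3 object key: only the last path segment becomes the node's file name.
print("documents/reports/2023/quarterly.pdf".split('/')[-1])  # -> quarterly.pdf
```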
@@ -206,10 +206,12 @@ def file_into_chunks(pages: List[Document]):
     logging.info("Split file into smaller chunks")
     text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
     chunks = text_splitter.split_documents(pages)
+    # print('Before chunks', len(chunks))
+    chunks = chunks[:10]
     return chunks
 
 def get_s3_pdf_content(s3_url, aws_access_key_id=None, aws_secret_access_key=None):
-    # try:
+    try:
         # Extract bucket name and directory from the S3 URL
         parsed_url = urlparse(s3_url)
         bucket_name = parsed_url.netloc
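The hunk above also caps processing at the first 10 chunks, so only the beginning of a large document is extracted. A minimal sketch of the splitter plus that cap, assuming a made-up single-page document:

```python
from langchain.docstore.document import Document
from langchain.text_splitter import TokenTextSplitter

# Hypothetical page long enough to yield many 200-token chunks.
pages = [Document(page_content="example text " * 2000)]
splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
chunks = splitter.split_documents(pages)[:10]  # same cap as in the diff
print(len(chunks))  # at most 10
```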
@@ -222,10 +224,22 @@ def get_s3_pdf_content(s3_url,aws_access_key_id=None,aws_secret_access_key=None)
         else:
             return None
 
-    # except Exception as e:
-    #     return None
+    except Exception as e:
+        return None
+
+def wiki_loader(wiki_query, max_sources, max_wiki_pages=2):
+
+    searches = wiki_query.split(',')
+    searches = searches[:max_sources]
+    pages = []
+    for query in searches:
+        pages.extend(WikipediaLoader(query=query, load_all_available_meta=False).load())
+    pages = pages[:max_wiki_pages]
+    return pages
 
-def extract_graph_from_file(uri, userName, password, model, file=None, source_url=None, aws_access_key_id=None, aws_secret_access_key=None):
+
+
+def extract_graph_from_file(uri, userName, password, model, file=None, source_url=None, aws_access_key_id=None, aws_secret_access_key=None, wiki_query=None, max_sources=None, max_wiki_pages=2):
     """
     Extracts a Neo4jGraph from a PDF file based on the model.
 
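For context, a hedged sketch of how the new `wiki_loader` might be called (the query string is made up). Note that the `[:max_wiki_pages]` slice caps the combined result across all queries, not each query individually:

```python
# Two comma-separated searches, at most 2 sources, at most 2 pages overall.
pages = wiki_loader("Neo4j,Graph database", max_sources=2, max_wiki_pages=2)
for page in pages:
    print(page.metadata.get('title'), len(page.page_content))
```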
@@ -350,22 +364,16 @@ def get_documents_from_file(file):
     return file_name, file_key, pages
 
 def get_documents_from_s3(s3_url, aws_access_key_id, aws_secret_access_key):
-    # if aws_access_key_id != None and aws_secret_access_key != None:
-    #     os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
-    #     os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
 
     parsed_url = urlparse(s3_url)
     bucket = parsed_url.netloc
     file_key = parsed_url.path.lstrip('/')
     file_name = file_key.split('/')[-1]
     s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
     response = s3.head_object(Bucket=bucket, Key=file_key)
-    # response = s3.get_object(Bucket=bucket, Key=file_key)
     file_size = response['ContentLength']
 
     logging.info(f'bucket : {bucket}, file key : {file_key}, file size : {file_size}')
-
-    # loader = S3FileLoader(bucket, file_key)
     pages = get_s3_pdf_content(s3_url, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
     return file_name, file_key, pages
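As a reference for the parsing above, a minimal sketch of how `urlparse` and `head_object` combine to read an object's size without downloading it (bucket and key are hypothetical, and credentials are assumed to come from the environment):

```python
import boto3
from urllib.parse import urlparse

# For s3:// URLs, netloc is the bucket and path (minus the leading '/') is the key.
parsed = urlparse("s3://my-bucket/docs/report.pdf")
bucket, key = parsed.netloc, parsed.path.lstrip('/')

s3 = boto3.client('s3')
head = s3.head_object(Bucket=bucket, Key=key)  # fetches metadata only, no download
print(head['ContentLength'])  # object size in bytes
```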