@@ -160,7 +160,7 @@ def check_url_source(url):
160160 logging .error (f"Error in recognize URL: { e } " )
161161 raise Exception (e )
162162
163- def create_source_node_graph_url (uri , userName , password , source_url ,model , db_name = None ,aws_access_key_id = None ,aws_secret_access_key = None ):
163+ def create_source_node_graph_url (uri , userName , password ,model , source_url = None , db_name = None , wiki_query : List [ str ] = None ,aws_access_key_id = None ,aws_secret_access_key = None ):
164164 """
165165 Creates a source node in Neo4jGraph and sets properties.
166166
@@ -176,60 +176,87 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
176176 Success or Failed message of node creation
177177 """
178178 try :
179- source_type ,youtube_url = check_url_source (source_url )
180179 graph = Neo4jGraph (url = uri , database = db_name , username = userName , password = password )
181- logging .info (f"source type URL:{ source_type } " )
182- if source_type == "s3 bucket" :
183- lst_s3_file_name = []
184- files_info = get_s3_files_info (source_url ,aws_access_key_id = aws_access_key_id ,aws_secret_access_key = aws_secret_access_key )
185- if isinstance (files_info ,dict ):
186- return files_info
187- elif len (files_info )== 0 :
188- return create_api_response ('Failed' ,success_count = 0 ,Failed_count = 0 ,message = 'No pdf files found.' )
189- logging .info (f'files info : { files_info } ' )
190- err_flag = 0
191- success_count = 0
192- Failed_count = 0
193- file_type = 'pdf'
194- for file_info in files_info :
195- job_status = "New"
196- file_name = file_info ['file_key' ]
197- file_size = file_info ['file_size_bytes' ]
198- s3_file_path = str (source_url + file_name )
199- try :
200- create_source_node (graph ,file_name .split ('/' )[- 1 ],file_size ,file_type ,source_type ,model ,s3_file_path ,aws_access_key_id )
201- success_count += 1
202- lst_s3_file_name .append ({'fileName' :file_name .split ('/' )[- 1 ],'fileSize' :file_size ,'url' :s3_file_path })
203-
204- except Exception as e :
205- err_flag = 1
206- Failed_count += 1
207- error_message = str (e )
208- if err_flag == 1 :
209- job_status = "Failed"
210- message = "Unable to create source node for s3 bucket files"
211- return create_api_response (job_status ,message = message ,error = error_message ,success_count = success_count ,Failed_count = Failed_count ,file_source = 's3 bucket' )
212- return create_api_response ("Success" ,message = "Source Node created successfully" ,success_count = success_count ,Failed_count = Failed_count ,file_source = 's3 bucket' ,file_name = lst_s3_file_name )
213- elif source_type == 'youtube' :
214- source_url = youtube_url
215- match = re .search (r'(?:v=)([0-9A-Za-z_-]{11})\s*' ,source_url )
216- logging .info (f"match value{ match } " )
217- file_name = YouTube (source_url ).title
218- transcript = get_youtube_transcript (match .group (1 ))
219- if transcript == None or len (transcript )== 0 :
220- file_size = ''
221- job_status = "Failed"
222- message = f"Youtube transcript is not available for : { file_name } "
223- error_message = str (e )
224- logging .exception (f'Exception Stack trace:' )
225- return create_api_response (job_status ,message = message ,error = error_message ,file_source = source_type )
226- else :
227- file_size = sys .getsizeof (transcript )
228- file_type = 'text'
229- aws_access_key_id = ''
230- job_status = "Completed"
231- create_source_node (graph ,file_name ,file_size ,file_type ,source_type ,model ,source_url ,aws_access_key_id )
232- return create_api_response (job_status ,file_name = {'fileName' :file_name ,'fileSize' :file_size ,'url' :source_url })
180+ if source_url :
181+ source_type ,youtube_url = check_url_source (source_url )
182+ logging .info (f"source type URL:{ source_type } " )
183+ if source_type == "s3 bucket" :
184+ lst_s3_file_name = []
185+ files_info = get_s3_files_info (source_url ,aws_access_key_id = aws_access_key_id ,aws_secret_access_key = aws_secret_access_key )
186+ if isinstance (files_info ,dict ):
187+ return files_info
188+ elif len (files_info )== 0 :
189+ return create_api_response ('Failed' ,success_count = 0 ,Failed_count = 0 ,message = 'No pdf files found.' )
190+ logging .info (f'files info : { files_info } ' )
191+ err_flag = 0
192+ success_count = 0
193+ Failed_count = 0
194+ file_type = 'pdf'
195+ for file_info in files_info :
196+ job_status = "New"
197+ file_name = file_info ['file_key' ]
198+ file_size = file_info ['file_size_bytes' ]
199+ s3_file_path = str (source_url + file_name )
200+ try :
201+ create_source_node (graph ,file_name .split ('/' )[- 1 ],file_size ,file_type ,source_type ,model ,s3_file_path ,aws_access_key_id )
202+ success_count += 1
203+ lst_s3_file_name .append ({'fileName' :file_name .split ('/' )[- 1 ],'fileSize' :file_size ,'url' :s3_file_path })
204+
205+ except Exception as e :
206+ err_flag = 1
207+ Failed_count += 1
208+ error_message = str (e )
209+ if err_flag == 1 :
210+ job_status = "Failed"
211+ message = "Unable to create source node for s3 bucket files"
212+ return create_api_response (job_status ,message = message ,error = error_message ,success_count = success_count ,Failed_count = Failed_count ,file_source = 's3 bucket' )
213+ return create_api_response ("Success" ,message = "Source Node created successfully" ,success_count = success_count ,Failed_count = Failed_count ,file_source = 's3 bucket' ,file_name = lst_s3_file_name )
214+ elif source_type == 'youtube' :
215+ source_url = youtube_url
216+ match = re .search (r'(?:v=)([0-9A-Za-z_-]{11})\s*' ,source_url )
217+ logging .info (f"match value{ match } " )
218+ file_name = YouTube (source_url ).title
219+ transcript = get_youtube_transcript (match .group (1 ))
220+ if transcript == None or len (transcript )== 0 :
221+ file_size = ''
222+ job_status = "Failed"
223+ message = f"Youtube transcript is not available for : { file_name } "
224+ error_message = str (e )
225+ logging .exception (f'Exception Stack trace:' )
226+ return create_api_response (job_status ,message = message ,error = error_message ,file_source = source_type )
227+ else :
228+ file_size = sys .getsizeof (transcript )
229+ file_type = 'text'
230+ aws_access_key_id = ''
231+ job_status = "Completed"
232+ create_source_node (graph ,file_name ,file_size ,file_type ,source_type ,model ,source_url ,aws_access_key_id )
233+ return create_api_response (job_status ,file_name = {'fileName' :file_name ,'fileSize' :file_size ,'url' :source_url })
234+
235+ elif wiki_query :
236+ success_count = 0
237+ Failed_count = 0
238+ lst_file_metadata = []
239+ queries = wiki_query .split (',' )
240+ for query in queries :
241+ logging .info (f"Creating source node for { query .strip ()} " )
242+ pages = WikipediaLoader (query = query .strip (), load_max_docs = 1 , load_all_available_meta = True ).load ()
243+ file_name = query .strip ()
244+ file_size = sys .getsizeof (pages [0 ].page_content )
245+ file_type = 'text'
246+ source_url = pages [0 ].metadata ['source' ]
247+ aws_access_key_id = ''
248+ source_type = 'Wikipedia'
249+ job_status = 'Completed'
250+ try :
251+ create_source_node (graph ,file_name ,file_size ,file_type ,source_type ,model ,source_url ,aws_access_key_id )
252+ success_count += 1
253+ lst_file_metadata .append ({'fileName' :file_name ,'fileSize' :file_size ,'url' :source_url })
254+ except Exception as e :
255+ job_status = "Failed"
256+ Failed_count += 1
257+ error_message = str (e )
258+ return create_api_response (job_status ,message = "SUnable to create source node for Wikipedia source" ,file_name = lst_file_metadata , success_count = success_count , Failed_count = Failed_count )
259+ return create_api_response (job_status ,message = "Source Node created successfully" ,file_name = lst_file_metadata , success_count = success_count , Failed_count = Failed_count )
233260 else :
234261 job_status = "Failed"
235262 return create_api_response (job_status ,message = 'Invalid URL' )
@@ -281,25 +308,6 @@ def get_s3_pdf_content(s3_url,aws_access_key_id=None,aws_secret_access_key=None)
281308 logging .error (f"getting error while reading content from s3 files:{ e } " )
282309 raise Exception (e )
283310
def get_wikipedia_content(wiki_query, max_sources):
    """
    Loads one Wikipedia page per comma-separated search term.

    Args:
        wiki_query: Comma-separated search terms; each term is stripped before use.
        max_sources: Maximum number of terms to look up; when falsy, the first 2 terms are used.

    Returns:
        List with the documents returned by WikipediaLoader for every selected term.

    Raises:
        Exception: Wraps any error raised while splitting, limiting or loading.
    """
    try:
        terms = wiki_query.split(',')
        # Fall back to two sources when no explicit limit is supplied.
        limit = int(max_sources) if max_sources else 2
        collected = []
        for term in terms[:limit]:
            loader = WikipediaLoader(query=term.strip(), load_max_docs=1, load_all_available_meta=False)
            collected.extend(loader.load())

        logging.info(f"Total Pages from Wikipedia = {len(collected)}")
        return collected
    except Exception as e:
        logging.error(f"Not finding wiki content:{e}")
        raise Exception(e)
301-
302-
303311
304312def extract_graph_from_file (uri , userName , password , model , db_name = None , file = None ,source_url = None ,aws_access_key_id = None ,aws_secret_access_key = None ,wiki_query = None ,max_sources = None ):
305313 """
@@ -329,6 +337,9 @@ def extract_graph_from_file(uri, userName, password, model, db_name=None, file=N
329337 if file != None :
330338 file_name , file_key , pages = get_documents_from_file (file )
331339
340+ elif wiki_query :
341+ file_name , file_key , pages = get_documents_from_Wikipedia (wiki_query )
342+
332343 elif source_type == 's3 bucket' :
333344 if (aws_access_key_id == None or aws_secret_access_key == None ):
334345 job_status = "Failed"
@@ -339,9 +350,6 @@ def extract_graph_from_file(uri, userName, password, model, db_name=None, file=N
339350 logging .info (f"filename { file_name } file_key: { file_key } pages:{ pages } " )
340351 elif source_type == 'youtube' :
341352 file_name , file_key , pages = get_documents_from_youtube (source_url )
342- if wiki_query is not None :
343- logging .info (f"Wikipedia query source = { wiki_query } " )
344- pages .extend (get_wikipedia_content (wiki_query , max_sources ))
345353
346354 else :
347355 job_status = "Failed"
@@ -483,6 +491,21 @@ def get_documents_from_youtube(url):
483491 logging .exception (f'Exception in reading transcript from youtube:{ error_message } ' )
484492 raise Exception (error_message )
485493
def get_documents_from_Wikipedia(wiki_query: str):
    """
    Loads the Wikipedia page for a single search query.

    Args:
        wiki_query: Search term handed to WikipediaLoader; surrounding whitespace is stripped.

    Returns:
        Tuple of (file_name, file_key, pages). file_name and file_key are both the
        stripped query; pages is the list returned by WikipediaLoader(...).load().

    Raises:
        Exception: If the query cannot be processed. The message is the text of the
        underlying error, which is kept as the exception's cause.
    """
    # Bind the name before the try block so the error handler can always log it,
    # even when the loader fails before anything else has been assigned.
    file_name = wiki_query.strip()
    try:
        pages = WikipediaLoader(query=file_name, load_max_docs=1, load_all_available_meta=False).load()
        logging.info(f"Total Pages from Wikipedia = {len(pages)}")
        return file_name, file_name, pages
    except Exception as e:
        error_message = str(e)
        logging.error(f"Failed To Process Wikipedia Query: {file_name}")
        logging.exception(f'Exception Stack trace: {error_message}')
        # Raise rather than return an API-response dict: the result of this function
        # is unpacked as a 3-tuple, so a dict return would fail in a confusing way.
        raise Exception(error_message) from e
508+
486509def get_source_list_from_graph (uri ,userName ,password ,db_name = None ):
487510 """
488511 Args:
@@ -529,6 +552,27 @@ def update_graph(uri,userName,password,db_name):
529552 error_message = str (e )
530553 logging .exception (f'Exception in update KNN graph:{ error_message } ' )
531554 raise Exception (error_message )
555+
def connection_check(uri, userName, password, db_name):
    """
    Tries to open a Neo4jGraph with the given credentials and reports the outcome.

    Args:
        uri: URI of the graph database to connect to
        userName: Username used for the connection
        password: Password used for the connection
        db_name: Name of the database to connect to

    Returns:
        API response with status "Success" when the graph object is created and truthy,
        or status "Failed" (with the error text) when any exception is raised.
        Returns None if the graph object is created but evaluates as falsy.
    """
    try:
        connection = Neo4jGraph(username=userName, password=password, url=uri, database=db_name)
        if connection:
            return create_api_response("Success", message="Connection Successful")
    except Exception as exc:
        failure_detail = str(exc)
        logging.exception(f'Exception:{failure_detail}')
        return create_api_response("Failed", message="Connection Failed", error=failure_detail)
    return None
532576
533577def create_api_response (status ,success_count = None ,Failed_count = None , data = None , error = None ,message = None ,file_source = None ,file_name = None ):
534578 """
0 commit comments