from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders import WikipediaLoader
import warnings
+ from pytube import YouTube
+ from youtube_transcript_api import YouTubeTranscriptApi
+ import sys
+
warnings.filterwarnings("ignore")

load_dotenv()
@@ -215,9 +219,17 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
    # match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", source_url)
    match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*', source_url)
    logging.info(f"match value{match}")
-   youtube_id = match.group(1)
-   file_name = youtube_id.strip()
-   file_size = ''
+   file_name = YouTube(source_url).title
+   transcript = get_youtube_transcript(match.group(1))
+   if transcript is None or len(transcript) == 0:
+       file_size = ''
+       job_status = "Failed"
+       message = f"Youtube transcript is not available for : {file_name}"
+       error_message = message
+       logging.error(error_message)
+       return create_api_response(job_status, message=message, error=error_message, file_source=source_type)
+   else:
+       file_size = sys.getsizeof(transcript)
    file_type = 'text'
    aws_access_key_id = ''
    job_status = "Completed"
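
For orientation, here is a self-contained sketch of the new YouTube metadata path introduced above, using a hypothetical URL. It combines pytube's title lookup, the transcript fetch, and sys.getsizeof the same way the branch above does; the URL and video id are placeholders, not taken from this PR.

from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
import re
import sys

video_url = "https://www.youtube.com/watch?v=abcdefghijk"    # hypothetical 11-character video id
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*', video_url)

file_name = YouTube(video_url).title                         # video title becomes the source name
segments = YouTubeTranscriptApi.get_transcript(match.group(1))
transcript = ''.join(seg['text'] for seg in segments)        # same concatenation as get_youtube_transcript()
file_size = sys.getsizeof(transcript) if transcript else ''  # empty transcript is treated as a failure above
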
@@ -232,7 +244,15 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
        error_message = str(e)
        logging.exception(f'Exception Stack trace:')
        return create_api_response(job_status, message=message, error=error_message, file_source=source_type)
-
+
+ def get_youtube_transcript(youtube_id):
+     transcript_dict = YouTubeTranscriptApi.get_transcript(youtube_id)
+     transcript = ''
+     for td in transcript_dict:
+         transcript += td['text']
+     return transcript
+
+
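YouTubeTranscriptApi.get_transcript raises when captions are disabled or missing, while the caller above only checks for an empty or None result. A defensive variant of the helper, shown purely as a sketch and not part of this diff, would swallow that error and return None; it assumes the module-level logging and YouTubeTranscriptApi imports already present in this file.

# Sketch only: hypothetical variant that never raises, matching the None check in the caller.
def get_youtube_transcript_safe(youtube_id):
    try:
        segments = YouTubeTranscriptApi.get_transcript(youtube_id)
    except Exception:
        logging.exception('Transcript not available for video id %s', youtube_id)
        return None
    return ''.join(seg['text'] for seg in segments)
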
def file_into_chunks(pages: List[Document]):
    """
    Split a list of documents (file pages) into chunks of fixed size.
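
The body of file_into_chunks is cut off by this hunk; one common way to implement fixed-size chunking with LangChain's TokenTextSplitter is sketched below. The splitter choice and the chunk_size/chunk_overlap values are assumptions for illustration, not taken from this repository.

from typing import List
from langchain.docstore.document import Document
from langchain.text_splitter import TokenTextSplitter

def file_into_chunks_sketch(pages: List[Document]) -> List[Document]:
    # Illustrative only: split the loaded pages into fixed-size token chunks.
    splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)  # sizes are assumptions
    return splitter.split_documents(pages)
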
@@ -362,18 +382,19 @@ def extract_graph_from_file(uri, userName, password, model, db_name=None, file=N

    logging.info("Get graph document list from models")
    if model == 'Diffbot':
-       graph_documents = extract_graph_from_diffbot(graph, chunks, file_name, uri, userName, password)
+       graph_documents, cypher_list = extract_graph_from_diffbot(graph, chunks, file_name, uri, userName, password)

    elif model == 'OpenAI GPT 3.5':
        model_version = 'gpt-3.5-turbo-16k'
-       graph_documents = extract_graph_from_OpenAI(model_version, graph, chunks, file_name, uri, userName, password)
+       graph_documents, cypher_list = extract_graph_from_OpenAI(model_version, graph, chunks, file_name, uri, userName, password)

    elif model == 'OpenAI GPT 4':
        model_version = 'gpt-4-0125-preview'
-       graph_documents = extract_graph_from_OpenAI(model_version, graph, chunks, file_name, uri, userName, password)
+       graph_documents, cypher_list = extract_graph_from_OpenAI(model_version, graph, chunks, file_name, uri, userName, password)

-   # update_similarity_graph for the KNN Graph
-   update_graph(graph)
+   # create relation between chunks (FIRST_CHUNK and NEXT_CHUNK)
+   for query in cypher_list:
+       graph.query(query)

    distinct_nodes = set()
    relations = []
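
The statements collected in cypher_list are produced inside the extraction helpers and are not visible in this diff; the snippet below is a hypothetical example of the kind of FIRST_CHUNK/NEXT_CHUNK query the loop above might execute, with made-up labels, properties, and ids.

# Hypothetical shape of one cypher_list entry; the real queries are built elsewhere.
example_query = (
    "MATCH (d:Document {fileName: $file_name}), (c:Chunk {id: $chunk_id}) "
    "MERGE (d)-[:FIRST_CHUNK]->(c)"
)
graph.query(example_query, params={"file_name": file_name, "chunk_id": "chunk-0"})
# A NEXT_CHUNK query would similarly MERGE (:Chunk)-[:NEXT_CHUNK]->(:Chunk) between consecutive chunk ids.
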
@@ -455,9 +476,10 @@ def get_documents_from_youtube(url):
            translation="en",
            add_video_info=True)
        pages = youtube_loader.load()
-       match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
-       youtube_id = match.group(1)
-       file_name = youtube_id
+       # match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
+       # youtube_id = match.group(1)
+       # file_name = youtube_id
+       file_name = YouTube(url).title
        file_key = file_name
        return file_name, file_key, pages
    except Exception as e:
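
Only the tail of the loader call is visible above; for reference, here is a self-contained sketch of the presumed YoutubeLoader.from_youtube_url usage together with the new pytube title lookup. The URL and the language argument are assumptions.

from langchain_community.document_loaders import YoutubeLoader
from pytube import YouTube

url = "https://www.youtube.com/watch?v=abcdefghijk"   # hypothetical URL
youtube_loader = YoutubeLoader.from_youtube_url(
    url,
    language=["en"],        # assumed; only translation and add_video_info appear in the hunk
    translation="en",
    add_video_info=True)
pages = youtube_loader.load()          # Documents carrying the transcript text
file_name = YouTube(url).title         # title now used as the file name, per this change
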
@@ -498,15 +520,15 @@ def get_source_list_from_graph(uri,userName,password,db_name=None):
        logging.exception(f'Exception: {error_message}')
        return create_api_response(job_status, message=message, error=error_message)

- def update_graph(graph):
+ def update_graph(uri, userName, password, db_name):
    """
    Update the graph nodes with a SIMILAR relationship where the embedding score matches
504526 """
505527 try :
506528 knn_min_score = os .environ .get ('KNN_MIN_SCORE' )
507529
508530 query = "WHERE node <> c and score >= {} MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score"
509- # graph = Neo4jGraph()
531+ graph = Neo4jGraph (url = uri , database = db_name , username = userName , password = password )
510532 result = graph .query ("""MATCH (c:Chunk)
511533 WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
512534 CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score """ + query .format (knn_min_score ))
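
With the new signature, update_graph opens its own Neo4jGraph connection instead of receiving one; below is a minimal usage sketch with placeholder credentials, assuming KNN_MIN_SCORE is set in the environment.

import os

os.environ.setdefault('KNN_MIN_SCORE', '0.8')        # threshold value is an assumption

uri = "neo4j+s://<host>:7687"                        # placeholder connection details
userName = "neo4j"
password = "<password>"
db_name = "neo4j"

update_graph(uri, userName, password, db_name)       # adds SIMILAR relationships between similar chunks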