2020from langchain_community .document_loaders import YoutubeLoader
2121from langchain_community .document_loaders import WikipediaLoader
2222import warnings
23+ from pytube import YouTube
24+ from youtube_transcript_api import YouTubeTranscriptApi
25+ import sys
26+
2327warnings .filterwarnings ("ignore" )
2428
2529load_dotenv ()
@@ -215,9 +219,17 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
215219 # match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", source_url)
216220 match = re .search (r'(?:v=)([0-9A-Za-z_-]{11})\s*' ,source_url )
217221 logging .info (f"match value{ match } " )
218- youtube_id = match .group (1 )
219- file_name = youtube_id .strip ()
220- file_size = ''
222+ file_name = YouTube (source_url ).title
223+ transcript = get_youtube_transcript (match .group (1 ))
224+ if transcript == None or len (transcript )== 0 :
225+ file_size = ''
226+ job_status = "Failed"
227+ message = f"Youtube transcript is not available for : { file_name } "
228+ error_message = str (e )
229+ logging .exception (f'Exception Stack trace:' )
230+ return create_api_response (job_status ,message = message ,error = error_message ,file_source = source_type )
231+ else :
232+ file_size = sys .getsizeof (transcript )
221233 file_type = 'text'
222234 aws_access_key_id = ''
223235 job_status = "Completed"
@@ -232,7 +244,15 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
232244 error_message = str (e )
233245 logging .exception (f'Exception Stack trace:' )
234246 return create_api_response (job_status ,message = message ,error = error_message ,file_source = source_type )
235-
247+
def get_youtube_transcript(youtube_id):
    """Return the full transcript text of a YouTube video as one string.

    Args:
        youtube_id: The video identifier passed to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``'text'`` value of every transcript entry joined with single
        spaces, or an empty string when the API returns no entries.

    Any exception raised by ``YouTubeTranscriptApi.get_transcript`` is not
    caught here and propagates to the caller.
    """
    transcript_dict = YouTubeTranscriptApi.get_transcript(youtube_id)
    # Each entry is a separate caption snippet. Concatenating them with no
    # separator fused the last word of one snippet to the first word of the
    # next, so join on a space instead (a single join also avoids rebuilding
    # the string on every iteration).
    return ' '.join(td['text'] for td in transcript_dict)
254+
255+
236256def file_into_chunks (pages : List [Document ]):
237257 """
238258 Split a list of documents(file pages) into chunks of fixed size.
@@ -456,9 +476,10 @@ def get_documents_from_youtube(url):
456476 translation = "en" ,
457477 add_video_info = True )
458478 pages = youtube_loader .load ()
459- match = re .search (r"v=([a-zA-Z0-9_-]+)" , url )
460- youtube_id = match .group (1 )
461- file_name = youtube_id
479+ # match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
480+ # youtube_id=match.group(1)
481+ # file_name=youtube_id
482+ file_name = YouTube (url ).title
462483 file_key = file_name
463484 return file_name , file_key , pages
464485 except Exception as e :
0 commit comments