Skip to content

Commit b7ec837

Browse files
Merge pull request #124 from neo4j-labs/youtube_file_update
Updated file name and size for youtube url
2 parents 9ff034f + d929a73 commit b7ec837

File tree

1 file changed

+28
-7
lines changed

1 file changed

+28
-7
lines changed

backend/src/main.py

Lines changed: 28 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,10 @@
2020
from langchain_community.document_loaders import YoutubeLoader
2121
from langchain_community.document_loaders import WikipediaLoader
2222
import warnings
23+
from pytube import YouTube
24+
from youtube_transcript_api import YouTubeTranscriptApi
25+
import sys
26+
2327
warnings.filterwarnings("ignore")
2428

2529
load_dotenv()
@@ -215,9 +219,17 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
215219
# match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", source_url)
216220
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',source_url)
217221
logging.info(f"match value{match}")
218-
youtube_id=match.group(1)
219-
file_name=youtube_id.strip()
220-
file_size=''
222+
file_name = YouTube(source_url).title
223+
transcript= get_youtube_transcript(match.group(1))
224+
if transcript==None or len(transcript)==0:
225+
file_size=''
226+
job_status = "Failed"
227+
message = f"Youtube transcript is not available for : {file_name}"
228+
error_message = str(e)
229+
logging.exception(f'Exception Stack trace:')
230+
return create_api_response(job_status,message=message,error=error_message,file_source=source_type)
231+
else:
232+
file_size=sys.getsizeof(transcript)
221233
file_type='text'
222234
aws_access_key_id=''
223235
job_status = "Completed"
@@ -232,7 +244,15 @@ def create_source_node_graph_url(uri, userName, password, source_url ,model, db_
232244
error_message = str(e)
233245
logging.exception(f'Exception Stack trace:')
234246
return create_api_response(job_status,message=message,error=error_message,file_source=source_type)
235-
247+
248+
def get_youtube_transcript(youtube_id):
    """
    Fetch the transcript of a YouTube video and return it as a single string.

    Args:
        youtube_id: Video identifier passed straight through to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``'text'`` value of every transcript entry, concatenated in order
        with no separator between entries. An empty string is returned when
        the API yields no entries.
    """
    # Each entry is indexed by 'text' (presumably a dict per caption segment
    # -- confirm against the youtube_transcript_api docs).
    segments = YouTubeTranscriptApi.get_transcript(youtube_id)
    # The inner join mirrors the original per-entry handling; the outer join
    # glues the entries together back-to-back.
    return ''.join(''.join(segment['text']) for segment in segments)
254+
255+
236256
def file_into_chunks(pages: List[Document]):
237257
"""
238258
Split a list of documents(file pages) into chunks of fixed size.
@@ -456,9 +476,10 @@ def get_documents_from_youtube(url):
456476
translation = "en",
457477
add_video_info=True)
458478
pages = youtube_loader.load()
459-
match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
460-
youtube_id=match.group(1)
461-
file_name=youtube_id
479+
# match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
480+
# youtube_id=match.group(1)
481+
# file_name=youtube_id
482+
file_name = YouTube(url).title
462483
file_key=file_name
463484
return file_name, file_key, pages
464485
except Exception as e:

0 commit comments

Comments
 (0)