from sklearnex import unpatch_sklearn

# Undo any Intel Extension for Scikit-learn patching so the stock
# scikit-learn implementation of cosine_similarity is used
unpatch_sklearn()

from sklearn.metrics.pairwise import cosine_similarity
import requests
from youtube_transcript_api import YouTubeTranscriptApi
from sentence_transformers import SentenceTransformer
import time

# Load a pretrained SentenceTransformer model for text embeddings
model = SentenceTransformer('bert-base-nli-mean-tokens')
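# Note: 'bert-base-nli-mean-tokens' is an older checkpoint; any other
# SentenceTransformer model (e.g. 'all-MiniLM-L6-v2') could be swapped in here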

# YouTube Data API parameters
API_KEY = "YOUR_API_KEY"  # replace with your own YouTube Data API v3 key
MAX_RESULTS = 50  # Maximum number of search results to retrieve

# Search for videos using the YouTube Data API
def search_videos(query):
    url = "https://www.googleapis.com/youtube/v3/search"
    # Passing the query via params lets requests URL-encode it safely
    params = {"key": API_KEY, "part": "snippet", "type": "video",
              "maxResults": MAX_RESULTS, "q": query}
    response = requests.get(url, params=params)
    data = response.json()
    items = data.get("items", [])  # empty if the API returned an error
    video_ids = [item['id']['videoId'] for item in items]
    video_titles = [item['snippet']['title'] for item in items]
    return video_ids, video_titles

# Retrieve video transcripts using the YouTube Transcript API
def get_video_transcripts(video_ids):
    transcripts = []
    for video_id in video_ids:
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            text = ' '.join([line['text'] for line in transcript])
            transcripts.append(text)
        except Exception:
            # Transcript unavailable (disabled, private, or non-existent video)
            transcripts.append('')
    return transcripts

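# Videos without a retrievable transcript are kept as empty strings, which
# keeps the transcripts list index-aligned with video_ids; filtering them
# out before ranking is a possible refinement
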
def get_best_video(input_text: str):
    start_time = time.time()  # Start measuring time

    # Encode the input text (NumPy output, which cosine_similarity expects)
    input_embedding = model.encode([input_text])

    # Search for videos and retrieve their transcripts
    video_ids, video_titles = search_videos(input_text)
    if not video_ids:  # no results (or the API returned an error)
        return {"best_video_url": None, "elapsed_time": time.time() - start_time}
    video_transcripts = get_video_transcripts(video_ids)

    # Encode the video transcripts
    video_embeddings = model.encode(video_transcripts)

    # Cosine similarity between the input text and each transcript;
    # cosine_similarity returns a (1, n) matrix, so take row 0
    similarity_scores = cosine_similarity(input_embedding, video_embeddings)[0]

    # Rank the videos by similarity score, highest first
    ranked_videos = sorted(zip(video_ids, video_titles, similarity_scores),
                           key=lambda x: x[2], reverse=True)

    # Select the top-ranked video ID as the best match
    best_video_id = ranked_videos[0][0]

    # Construct the YouTube video URL
    best_video_url = f"https://www.youtube.com/watch?v={best_video_id}"

    end_time = time.time()  # Stop measuring time
    elapsed_time = end_time - start_time

    return {"best_video_url": best_video_url, "elapsed_time": elapsed_time}


# Example usage:
input_text = "machine learning tutorial"
best_video = get_best_video(input_text)
print(best_video)
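# The returned dict has the form (values vary per run):
# {'best_video_url': 'https://www.youtube.com/watch?v=...', 'elapsed_time': ...}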