@@ -989,162 +989,125 @@ def handle_chitchat(user_message: str, chat_history: str) -> str:
989989 print (f"[chitchat] Gemini LLM failed: { e } " )
990990 return "Whoa, let's keep it polite, please! 😊"
991991
992- # -------------------- VIDEO MATCHING SYSTEM --------------------
992+ # -------------------- VIDEO MATCHING SYSTEM (SIMPLIFIED BERT APPROACH) --------------------
class VideoMatchingSystem:
    """Match a generated answer to at most one video from a spreadsheet.

    Descriptions and URLs are read from the ``Description`` and ``URL``
    columns of an Excel file into two parallel lists. An answer is embedded
    with a sentence-transformer model and compared to every description by
    cosine similarity; the single best candidate is then confirmed by the
    Gemini LLM before its URL is returned.
    """

    # Characters an LLM commonly wraps around a one-word verdict
    # (trailing period, markdown emphasis, quotes).
    _VERDICT_NOISE = " \t\r\n.!*\"'`"

    def __init__(self,
                 video_file_path: str = "D:\\RHL-WH\\RHL-FASTAPI\\FILES\\video_link_topic.xlsx",
                 similarity_threshold: float = 0.3):
        """Initialize the simplified video matching system using BERT similarity.

        Args:
            video_file_path: Path of the Excel file holding the video catalogue.
            similarity_threshold: Minimum cosine similarity the best match
                must reach before the LLM verification step is attempted.
        """
        self.video_file_path = video_file_path
        self.similarity_threshold = similarity_threshold
        self.topic_list = []  # Description strings, parallel to url_list
        self.url_list = []  # URL for the description at the same index
        self._topic_embeddings = None  # Filled lazily, see _get_topic_embeddings

        # Load video data
        self._load_video_data()

        # Initialize BERT model for similarity
        print("[VIDEO_SYSTEM] Loading BERT model for semantic similarity...")
        self.similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        print("[VIDEO_SYSTEM] BERT model loaded successfully")

        # The descriptions never change between queries, so embed them once
        # here instead of on every find_relevant_video() call.
        if self.topic_list:
            self._get_topic_embeddings()

    @staticmethod
    def _clean_cell(value) -> str:
        """Return a spreadsheet cell as stripped text, or "" when it is empty."""
        if value is None:
            return ""
        if isinstance(value, str):
            return value.strip()
        # Empty Excel cells arrive as NaN/NaT, which have no .strip().
        if pd.isna(value):
            return ""
        return str(value).strip()

    @classmethod
    def _is_affirmative(cls, response) -> bool:
        """Return True only when the LLM reply is the single word YES."""
        verdict = str(response).strip().upper().strip(cls._VERDICT_NOISE)
        return verdict == "YES"

    def _load_video_data(self):
        """Load and preprocess video data into simple lists using Description column.

        Rows whose description or URL is empty are skipped. Any failure while
        reading the file leaves both lists empty rather than raising.
        """
        descriptions = []
        urls = []
        try:
            df = pd.read_excel(self.video_file_path)
            print(f"[VIDEO_SYSTEM] Loaded {len(df)} videos from {self.video_file_path}")

            # Create simple lists using Description column instead of video_topic
            for raw_description, raw_url in zip(df['Description'], df['URL']):
                description = self._clean_cell(raw_description)
                url = self._clean_cell(raw_url)
                if description and url:
                    descriptions.append(description)
                    urls.append(url)

            print(f"[VIDEO_SYSTEM] Created description_list with {len(descriptions)} descriptions")
            print(f"[VIDEO_SYSTEM] Sample descriptions:")
            for i, desc in enumerate(descriptions[:3]):
                print(f"  {i}: {desc[:100]}...")

        except Exception as e:
            # Best-effort load: the matcher simply returns no videos.
            print(f"[VIDEO_SYSTEM] Error loading video data: {e}")
            descriptions = []
            urls = []

        self.topic_list = descriptions
        self.url_list = urls
        # Any previously cached embeddings no longer match the lists.
        self._topic_embeddings = None

    def _get_topic_embeddings(self):
        """Return the description embeddings, computing them on first use."""
        cached = getattr(self, "_topic_embeddings", None)
        # Recompute when the cache is missing or the list length has changed.
        if cached is None or len(cached) != len(self.topic_list):
            cached = self.similarity_model.encode(self.topic_list)
            self._topic_embeddings = cached
        return cached

    def find_relevant_video(self, answer: str) -> Optional[str]:
        """Find relevant video using BERT similarity + LLM verification.

        Args:
            answer: The answer text to find a supporting video for.

        Returns:
            The URL of the best-matching video, or None when there are no
            videos, the answer is blank, the best similarity is below the
            threshold, or the LLM does not confirm the match.
        """
        if not self.topic_list:
            return None
        if not answer or not answer.strip():
            print("[VIDEO_SYSTEM] Empty answer - no video")
            return None

        threshold = self.similarity_threshold
        print(f"[VIDEO_SYSTEM] Searching for video for answer: {answer[:100]}...")

        # Step 1: BERT Semantic Similarity
        print("[VIDEO_SYSTEM] Step 1: Computing BERT semantic similarities...")
        bert_start = time.perf_counter()

        # Only the answer is encoded per call; descriptions come from the cache.
        answer_embedding = self.similarity_model.encode([answer])
        topic_embeddings = self._get_topic_embeddings()

        # Compute cosine similarities
        similarities = cosine_similarity(answer_embedding, topic_embeddings)[0]

        # Find best match
        best_idx = int(np.argmax(similarities))
        best_similarity = float(similarities[best_idx])

        bert_end = time.perf_counter()
        print(f"[VIDEO_SYSTEM] BERT similarity computation took {bert_end - bert_start:.3f} seconds")
        print(f"[VIDEO_SYSTEM] Best similarity score: {best_similarity:.3f}")
        print(f"[VIDEO_SYSTEM] Best description: {self.topic_list[best_idx][:100]}...")

        if best_similarity < threshold:
            print(f"[VIDEO_SYSTEM] Similarity score {best_similarity:.3f} below threshold {threshold} - no video")
            return None

        # Step 2: LLM Verification (only for top match)
        print("[VIDEO_SYSTEM] Step 2: LLM verification of top match...")
        llm_start = time.perf_counter()

        verification_result = self._verify_with_llm(answer, self.topic_list[best_idx])

        llm_end = time.perf_counter()
        print(f"[VIDEO_SYSTEM] LLM verification took {llm_end - llm_start:.3f} seconds")
        print(f"[VIDEO_SYSTEM] LLM verification result: {verification_result}")

        if not verification_result:
            print("[VIDEO_SYSTEM] LLM verification failed - no video")
            return None

        video_url = self.url_list[best_idx]
        print(f"[VIDEO_SYSTEM] Found relevant video: {video_url}")
        return video_url

    def _verify_with_llm(self, answer: str, description: str) -> bool:
        """Use Gemini to verify if the video description is contextually relevant to the answer.

        Returns:
            True when the model replies YES (ignoring case and surrounding
            punctuation); False for any other reply or when the call fails.
        """
        prompt = f"""Analyze if the video description is contextually relevant to the medical answer.

Medical Answer: {answer}

Video Description: {description}

Question: Is this video description DIRECTLY and STRONGLY related to the medical answer?

Rules:
- Return "YES" only if the video description directly addresses the same medical condition, procedure, or treatment mentioned in the answer
- Return "NO" if the description is related but not directly relevant (e.g., general care vs specific procedure)
- Return "NO" if the description is about a different medical condition entirely

Examples:
- Answer about "eye care for newborns" + Description "video about applying eye medication to prevent infections" → YES
- Answer about "eye care for newborns" + Description "video about umbilical cord care procedures" → NO
- Answer about "temperature measurement" + Description "video about using thermometer to check baby temperature" → YES

Response (YES/NO only):"""

        try:
            raw = gemini_llm.invoke([HumanMessage(content=prompt)]).content
            response = str(raw).strip().upper()
            print(f"[VIDEO_SYSTEM] LLM response: {response}")
            return self._is_affirmative(response)
        except Exception as e:
            # A failed verification must never surface a video.
            print(f"[VIDEO_SYSTEM] LLM verification failed: {e}")
            return False
11481111
# Global video matching system. Starts as None; presumably assigned a
# VideoMatchingSystem instance elsewhere at startup (verify against callers).
video_system: Optional[VideoMatchingSystem] = None
0 commit comments