Skip to content

Commit 6f31b36

Browse files
added final code for video link
1 parent b71a76a commit 6f31b36

File tree

9 files changed

+195
-124
lines changed

9 files changed

+195
-124
lines changed
-875 Bytes
Binary file not shown.

FASTAPI-DEPLOYMENT/rhl_fastapi_v2_modify.py

Lines changed: 87 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -989,162 +989,125 @@ def handle_chitchat(user_message: str, chat_history: str) -> str:
989989
print(f"[chitchat] Gemini LLM failed: {e}")
990990
return "Whoa, let's keep it polite, please! 😊"
991991

992-
# -------------------- VIDEO MATCHING SYSTEM --------------------
992+
# -------------------- VIDEO MATCHING SYSTEM (SIMPLIFIED BERT APPROACH) --------------------
993993
class VideoMatchingSystem:
    """Pick at most one video URL that is relevant to a generated answer.

    Matching is a two-step process:

    1. Sentence-embedding cosine similarity between the answer and every
       video description selects the single best candidate.
    2. If that candidate's similarity reaches ``similarity_threshold``, the
       Gemini LLM is asked for a YES/NO confirmation before the URL is
       returned.
    """

    def __init__(
        self,
        video_file_path: str = "D:\\RHL-WH\\RHL-FASTAPI\\FILES\\video_link_topic.xlsx",
        similarity_threshold: float = 0.3,
    ):
        """Load the video sheet and the sentence-embedding model.

        Args:
            video_file_path: Excel file with ``Description`` and ``URL`` columns.
            similarity_threshold: Minimum cosine similarity the best
                description must reach before the LLM is consulted.
        """
        self.video_file_path = video_file_path
        self.similarity_threshold = similarity_threshold
        self.topic_list = []  # Description strings, parallel to url_list
        self.url_list = []  # URL for the description at the same position
        self._topic_embeddings = None  # Filled lazily on the first search

        # Load video data
        self._load_video_data()

        # Initialize BERT model for similarity
        print("[VIDEO_SYSTEM] Loading BERT model for semantic similarity...")
        self.similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        print("[VIDEO_SYSTEM] BERT model loaded successfully")

    @staticmethod
    def _clean_cell(value) -> str:
        """Return a spreadsheet cell as stripped text, or "" when it is missing."""
        # Empty Excel cells arrive as NaN (a float), which has no .strip();
        # numeric cells are not strings either, so coerce before stripping.
        if pd.isna(value):
            return ""
        return str(value).strip()

    @staticmethod
    def _parse_yes_no(response) -> bool:
        """Return True only when the first word of the reply is YES."""
        if not isinstance(response, str):
            return False
        words = response.strip().upper().split()
        if not words:
            return False
        # Tolerate decoration such as "YES.", "**YES**" or '"Yes"'.
        return words[0].strip(".,!:;\"'*`") == "YES"

    def _load_video_data(self):
        """Populate ``topic_list`` / ``url_list`` from the Excel sheet.

        Rows whose ``Description`` or ``URL`` cell is missing or blank are
        skipped. Any failure (unreadable file, missing column) is logged and
        leaves both lists empty.
        """
        descriptions = []
        urls = []
        try:
            df = pd.read_excel(self.video_file_path)
            print(f"[VIDEO_SYSTEM] Loaded {len(df)} videos from {self.video_file_path}")

            for raw_description, raw_url in zip(df['Description'], df['URL']):
                description = self._clean_cell(raw_description)
                url = self._clean_cell(raw_url)
                if description and url:
                    descriptions.append(description)
                    urls.append(url)
        except Exception as e:
            print(f"[VIDEO_SYSTEM] Error loading video data: {e}")
            descriptions = []
            urls = []
        else:
            print(f"[VIDEO_SYSTEM] Created description_list with {len(descriptions)} descriptions")
            print(f"[VIDEO_SYSTEM] Sample descriptions:")
            for i, desc in enumerate(descriptions[:3]):
                print(f"  {i}: {desc[:100]}...")

        # Publish both lists together so they can never get out of step, and
        # drop any embeddings computed for a previous load.
        self.topic_list = descriptions
        self.url_list = urls
        self._topic_embeddings = None

    def _get_topic_embeddings(self):
        """Return embeddings for ``topic_list``, encoding them at most once."""
        cached = getattr(self, "_topic_embeddings", None)
        # The descriptions do not change between requests, so re-encoding
        # them per call is wasted work; recompute only if the list changed size.
        if cached is None or len(cached) != len(self.topic_list):
            cached = self.similarity_model.encode(self.topic_list)
            self._topic_embeddings = cached
        return cached

    def find_relevant_video(self, answer: str) -> Optional[str]:
        """Find relevant video using BERT similarity + LLM verification.

        Args:
            answer: The generated answer text to match against descriptions.

        Returns:
            The URL of the best-matching video when it passes both the
            similarity threshold and the LLM check, otherwise ``None``.
        """
        if not self.topic_list:
            return None
        if not answer or not answer.strip():
            # Nothing meaningful to embed.
            return None

        print(f"[VIDEO_SYSTEM] Searching for video for answer: {answer[:100]}...")

        # Step 1: BERT Semantic Similarity
        print("[VIDEO_SYSTEM] Step 1: Computing BERT semantic similarities...")
        bert_start = time.perf_counter()

        answer_embedding = self.similarity_model.encode([answer])
        topic_embeddings = self._get_topic_embeddings()

        # One row of similarities: the answer against every description.
        similarities = cosine_similarity(answer_embedding, topic_embeddings)[0]

        best_idx = int(np.argmax(similarities))
        best_similarity = float(similarities[best_idx])

        bert_end = time.perf_counter()
        print(f"[VIDEO_SYSTEM] BERT similarity computation took {bert_end - bert_start:.3f} seconds")
        print(f"[VIDEO_SYSTEM] Best similarity score: {best_similarity:.3f}")
        print(f"[VIDEO_SYSTEM] Best description: {self.topic_list[best_idx][:100]}...")

        if best_similarity < self.similarity_threshold:
            print(
                f"[VIDEO_SYSTEM] Similarity score {best_similarity:.3f} "
                f"below threshold {self.similarity_threshold} - no video"
            )
            return None

        # Step 2: LLM Verification (only for top match)
        print("[VIDEO_SYSTEM] Step 2: LLM verification of top match...")
        llm_start = time.perf_counter()

        verification_result = self._verify_with_llm(answer, self.topic_list[best_idx])

        llm_end = time.perf_counter()
        print(f"[VIDEO_SYSTEM] LLM verification took {llm_end - llm_start:.3f} seconds")
        print(f"[VIDEO_SYSTEM] LLM verification result: {verification_result}")

        if not verification_result:
            print("[VIDEO_SYSTEM] LLM verification failed - no video")
            return None

        video_url = self.url_list[best_idx]
        print(f"[VIDEO_SYSTEM] Found relevant video: {video_url}")
        return video_url

    def _verify_with_llm(self, answer: str, description: str) -> bool:
        """Use Gemini to verify if the video description is contextually relevant to the answer.

        Returns:
            True when the model's reply starts with YES; False for any other
            reply or when the LLM call raises.
        """
        # Assembled line by line so the method's indentation never leaks
        # into the text sent to the model.
        prompt = (
            "Analyze if the video description is contextually relevant to the medical answer.\n"
            "\n"
            f"Medical Answer: {answer}\n"
            "\n"
            f"Video Description: {description}\n"
            "\n"
            "Question: Is this video description DIRECTLY and STRONGLY related to the medical answer?\n"
            "\n"
            "Rules:\n"
            '- Return "YES" only if the video description directly addresses the same medical condition, procedure, or treatment mentioned in the answer\n'
            '- Return "NO" if the description is related but not directly relevant (e.g., general care vs specific procedure)\n'
            '- Return "NO" if the description is about a different medical condition entirely\n'
            "\n"
            "Examples:\n"
            '- Answer about "eye care for newborns" + Description "video about applying eye medication to prevent infections" → YES\n'
            '- Answer about "eye care for newborns" + Description "video about umbilical cord care procedures" → NO\n'
            '- Answer about "temperature measurement" + Description "video about using thermometer to check baby temperature" → YES\n'
            "\n"
            "Response (YES/NO only):"
        )

        try:
            response = gemini_llm.invoke([HumanMessage(content=prompt)]).content.strip().upper()
            print(f"[VIDEO_SYSTEM] LLM response: {response}")
            return self._parse_yes_no(response)
        except Exception as e:
            # Best effort: an unavailable LLM means "no video", not a failed request.
            print(f"[VIDEO_SYSTEM] LLM verification failed: {e}")
            return False
11481111

11491112
# Global video matching system
11501113
video_system: Optional[VideoMatchingSystem] = None  # starts as None; where it gets assigned is not visible in this hunk

FILES/cached_match.py

Whitespace-only changes.

FILES/video_link_topic.xlsx

0 Bytes
Binary file not shown.

FILES/~$video_link_topic.xlsx

165 Bytes
Binary file not shown.

chat_history.db

168 KB
Binary file not shown.

test_description_column.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

test_eye_care_video.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

test_simplified_video_matching.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Test script for the simplified BERT-based video matching system
4+
"""
5+
6+
import requests
7+
import json
8+
import time
9+
10+
def test_video_matching():
    """Manually smoke-test video matching through the local ``/chat`` endpoint.

    Each sample question is sent to ``http://localhost:8000/chat`` and the
    presence of ``video_url`` in the JSON reply is compared with the
    expectation recorded for that question. Outcomes are printed only;
    nothing is asserted or returned.
    """
    base_url = "http://localhost:8000"

    # Sample questions with the expected presence of a video
    test_cases = [
        {
            "question": "What is eye care for newborns?",
            "expected_keywords": ["eye", "care", "newborn", "medication", "infection"],
            "should_have_video": True,
            "description": "Eye care question - should match eye-related video",
        },
        {
            "question": "How to take baby temperature?",
            "expected_keywords": ["temperature", "baby", "thermometer"],
            "should_have_video": True,
            "description": "Temperature question - should match temperature video",
        },
        {
            "question": "What causes jaundice in babies?",
            "expected_keywords": ["jaundice", "baby", "causes"],
            "should_have_video": False,  # No jaundice-specific video in our dataset
            "description": "Jaundice question - may not have specific video",
        },
        {
            "question": "How to care for umbilical cord?",
            "expected_keywords": ["cord", "umbilical", "care"],
            "should_have_video": True,
            "description": "Cord care question - should match cord care video",
        },
    ]

    # Verdict line for every (video expected, video received) combination
    verdicts = {
        (True, True): "✅ PASS: Expected video and got video",
        (True, False): "❌ FAIL: Expected video but got None",
        (False, False): "✅ PASS: Expected no video and got None",
        (False, True): "❌ FAIL: Expected no video but got video",
    }

    banner = "=" * 80
    print(banner)
    print("TESTING SIMPLIFIED BERT-BASED VIDEO MATCHING SYSTEM")
    print(banner)

    for number, scenario in enumerate(test_cases, 1):
        print(f"\n--- TEST CASE {number}: {scenario['description']} ---")
        print(f"Question: {scenario['question']}")

        try:
            query = {
                "user_id": f"test_user_{number}",
                "message": scenario['question'],
            }
            started = time.time()
            response = requests.get(f"{base_url}/chat", params=query, timeout=30)
            finished = time.time()

            if response.status_code != 200:
                print(f"❌ API Error: {response.status_code} - {response.text}")
            else:
                data = response.json()

                print(f"Response time: {finished - started:.2f} seconds")
                print(f"Answer length: {len(data.get('answer', ''))}")
                print(f"Video URL: {data.get('video_url', 'None')}")

                # A video counts as present whenever the field is not null
                has_video = data.get('video_url') is not None
                print(f"Has video: {has_video}")

                expects_video = bool(scenario['should_have_video'])
                print(verdicts[(expects_video, has_video)])

                # First 200 characters of the answer, for context
                answer_preview = data.get('answer', '')[:200]
                print(f"Answer preview: {answer_preview}...")

        except requests.exceptions.RequestException as e:
            print(f"❌ Request failed: {e}")
        except Exception as e:
            print(f"❌ Unexpected error: {e}")

        print("-" * 60)

    print("\n" + banner)
    print("TESTING COMPLETE")
    print(banner)
104+
105+
if __name__ == "__main__":
    # Run the smoke test only when this file is executed as a script.
    test_video_matching()

0 commit comments

Comments
 (0)