Skip to content

Commit db1a62d

Browse files
opt: improve recommendations using larger candidate pool and better similarity methods
1 parent 6eb8479 commit db1a62d

File tree

4 files changed

+142
-55
lines changed

4 files changed

+142
-55
lines changed

app/core/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.0-rc.3"
1+
__version__ = "1.0.0-rc.4"

app/services/discovery.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ async def discover_recommendations(
3232
top_cast = profile.cast.get_top_features(limit=2)
3333
top_crew = profile.get_top_crew(limit=1) # e.g. [(555, 1.0)] - Director
3434

35+
top_countries = profile.get_top_countries(limit=2)
36+
3537
if not top_genres and not top_keywords and not top_cast:
3638
# Fallback if profile is empty
3739
return []
@@ -41,21 +43,44 @@ async def discover_recommendations(
4143
# Query 1: Top Genres Mix
4244
if top_genres:
4345
genre_ids = "|".join([str(g[0]) for g in top_genres])
44-
params_genres = {"with_genres": genre_ids, "sort_by": "popularity.desc", "vote_count.gte": 100}
45-
tasks.append(self._fetch_discovery(content_type, params_genres))
46+
params_popular = {"with_genres": genre_ids, "sort_by": "popularity.desc", "vote_count.gte": 100}
47+
tasks.append(self._fetch_discovery(content_type, params_popular))
48+
49+
# fetch at least two pages of results
50+
for i in range(2):
51+
params_rating = {
52+
"with_genres": genre_ids,
53+
"sort_by": "ratings.desc",
54+
"vote_count.gte": 300,
55+
"page": i + 1,
56+
}
57+
tasks.append(self._fetch_discovery(content_type, params_rating))
4658

4759
# Query 2: Top Keywords
4860
if top_keywords:
4961
keyword_ids = "|".join([str(k[0]) for k in top_keywords])
5062
params_keywords = {"with_keywords": keyword_ids, "sort_by": "popularity.desc"}
5163
tasks.append(self._fetch_discovery(content_type, params_keywords))
5264

65+
# fetch at least three pages of results
66+
for i in range(3):
67+
params_rating = {
68+
"with_keywords": keyword_ids,
69+
"sort_by": "ratings.desc",
70+
"vote_count.gte": 300,
71+
"page": i + 1,
72+
}
73+
tasks.append(self._fetch_discovery(content_type, params_rating))
74+
5375
# Query 3: Top Actors
5476
for actor in top_cast:
5577
actor_id = actor[0]
5678
params_actor = {"with_cast": str(actor_id), "sort_by": "popularity.desc"}
5779
tasks.append(self._fetch_discovery(content_type, params_actor))
5880

81+
params_rating = {"with_cast": str(actor_id), "sort_by": "ratings.desc", "vote_count.gte": 300}
82+
tasks.append(self._fetch_discovery(content_type, params_rating))
83+
5984
# Query 4: Top Director
6085
if top_crew:
6186
director_id = top_crew[0][0]
@@ -65,6 +90,18 @@ async def discover_recommendations(
6590
}
6691
tasks.append(self._fetch_discovery(content_type, params_director))
6792

93+
params_rating = {"with_crew": str(director_id), "sort_by": "ratings.desc", "vote_count.gte": 300}
94+
tasks.append(self._fetch_discovery(content_type, params_rating))
95+
96+
# Query 5: Top Countries
97+
if top_countries:
98+
country_ids = "|".join([str(c[0]) for c in top_countries])
99+
params_country = {"with_origin_country": country_ids, "sort_by": "popularity.desc", "vote_count.gte": 100}
100+
tasks.append(self._fetch_discovery(content_type, params_country))
101+
102+
params_rating = {"with_origin_country": country_ids, "sort_by": "ratings.desc", "vote_count.gte": 300}
103+
tasks.append(self._fetch_discovery(content_type, params_rating))
104+
68105
# 3. Execute Parallel Queries
69106
results_batches = await asyncio.gather(*tasks, return_exceptions=True)
70107

app/services/recommendation_service.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@
1212
from app.services.user_profile import UserProfileService
1313

1414

15+
def normalize(value, min_v=0, max_v=10):
    """Linearly rescale *value* from the range [min_v, max_v] into [0, 1].

    Used to put popularity and rating onto a comparable scale before
    blending them into a single ranking score.

    Args:
        value: The raw metric to rescale.
        min_v: Lower bound of the expected input range (default 0).
        max_v: Upper bound of the expected input range (default 10).

    Returns:
        The rescaled value. A degenerate range (min_v == max_v) maps
        everything to 0 rather than dividing by zero. Note that inputs
        outside [min_v, max_v] are NOT clamped and can yield results
        outside [0, 1].
    """
    span = max_v - min_v
    return (value - min_v) / span if span else 0
22+
23+
1524
def _parse_identifier(identifier: str) -> tuple[str | None, int | None]:
1625
"""Parse Stremio identifier to extract IMDB ID and TMDB ID."""
1726
if not identifier:
@@ -350,7 +359,7 @@ async def _fetch_recommendations_from_tmdb(self, item_id: str, media_type: str,
350359
recommended_items = recommendation_response.get("results", [])
351360
if not recommended_items:
352361
return []
353-
return recommended_items[:limit]
362+
return recommended_items
354363

355364
async def get_recommendations(
356365
self,
@@ -396,28 +405,29 @@ async def get_recommendations(
396405
tasks_a = []
397406
for source in top_source_items:
398407
tasks_a.append(self._fetch_recommendations_from_tmdb(source.get("_id"), source.get("type"), limit=10))
408+
similarity_candidates = []
409+
similarity_recommendations = await asyncio.gather(*tasks_a, return_exceptions=True)
410+
similarity_recommendations = [item for item in similarity_recommendations if not isinstance(item, Exception)]
411+
for item in similarity_recommendations:
412+
similarity_candidates.extend(item)
399413

400414
# --- Candidate Set B: Profile-based Discovery ---
401415
# Use typed profile based on content_type
402416
user_profile = await self.user_profile_service.build_user_profile(scored_objects, content_type=content_type)
403-
task_b = self.discovery_engine.discover_recommendations(user_profile, content_type, limit=20)
404-
405-
# Execute all fetches
406-
all_results = await asyncio.gather(task_b, *tasks_a, return_exceptions=True)
407-
408-
discovery_candidates = all_results[0] if isinstance(all_results[0], list) else []
409-
similarity_batches = all_results[1:]
417+
discovery_candidates = await self.discovery_engine.discover_recommendations(
418+
user_profile, content_type, limit=20
419+
)
410420

411421
# --- Combine & Deduplicate ---
412422
candidate_pool = {} # tmdb_id -> item_dict
413423

414424
for item in discovery_candidates:
415425
candidate_pool[item["id"]] = item
416426

417-
for batch in similarity_batches:
418-
if isinstance(batch, list):
419-
for item in batch:
420-
candidate_pool[item["id"]] = item
427+
for item in similarity_candidates:
428+
# Flag the item so the re-ranking step below can boost
# candidates that came from TMDB's collaborative recommendations.
429+
item["_ranked_candidate"] = True
430+
candidate_pool[item["id"]] = item
421431

422432
# --- Re-Ranking & Filtering ---
423433
ranked_candidates = []
@@ -430,11 +440,15 @@ async def get_recommendations(
430440
sim_score = self.user_profile_service.calculate_similarity(user_profile, item)
431441
vote_average = item.get("vote_average", 0)
432442
popularity = item.get("popularity", 0)
433-
import math
434443

435-
pop_score = math.log(popularity + 1) if popularity > 0 else 0
444+
pop_score = normalize(popularity, 0, 1000)
445+
vote_score = normalize(vote_average, 0, 10)
446+
447+
final_score = (sim_score * 0.6) + (vote_score * 0.3) + (pop_score * 0.1)
436448

437-
final_score = (sim_score * 0.7) + (vote_average * 0.2) + (pop_score * 0.1)
449+
# Boost candidate if it's from TMDB collaborative recommendations
450+
if item.get("_ranked_candidate"):
451+
final_score *= 1.25
438452
ranked_candidates.append((final_score, item))
439453

440454
# Sort by Final Score

app/services/user_profile.py

Lines changed: 73 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,24 @@
55
from app.services.tmdb_service import TMDBService
66

77
# TODO: Make these weights dynamic based on user's preferences.
8-
GENRES_WEIGHT = 1.0
9-
KEYWORDS_WEIGHT = 2.0
10-
CAST_WEIGHT = 1.2
11-
CREW_WEIGHT = 1.2
12-
YEAR_WEIGHT = 0.5
13-
COUNTRIES_WEIGHT = 0.5
8+
GENRES_WEIGHT = 0.3
9+
KEYWORDS_WEIGHT = 0.40
10+
CAST_WEIGHT = 0.1
11+
CREW_WEIGHT = 0.1
12+
YEAR_WEIGHT = 0.05
13+
COUNTRIES_WEIGHT = 0.05
14+
BASE_GENRE_WEIGHT = 0.15
15+
16+
17+
def emphasis(x: float) -> float:
    """Apply a super-linear curve so strong preferences stand out.

    Raising the preference weight to the 1.25 power widens the gap
    between weak and strong signals during similarity scoring.
    """
    return pow(x, 1.25)
22+
23+
24+
def safe_div(a, b):
    """Divide *a* by *b*, returning 0.0 instead of raising on a falsy divisor."""
    if not b:
        return 0.0
    return a / b
1426

1527

1628
class UserProfileService:
@@ -83,41 +95,65 @@ async def build_user_profile(
8395

8496
def calculate_similarity(self, profile: UserTasteProfile, item_meta: dict) -> float:
    """Score how closely a candidate item matches the user's taste profile.

    Performs sparse feature matching across genres, keywords, cast, crew,
    and countries. Each matched feature's preference weight is passed
    through a non-linear emphasis curve, divided by the number of features
    the item carries in that dimension (so feature-heavy items are not
    over-rewarded), then blended with the per-dimension weight constants.
    Genres additionally receive a soft prior bias from the profile's
    normalized top genres.

    NOTE(review): the YEAR dimension from the previous implementation is
    not scored here — confirm that dropping it was intentional.

    Args:
        profile: Aggregated user taste profile with per-feature weights.
        item_meta: Raw candidate item metadata (TMDB-style dict).

    Returns:
        A non-negative float; higher means a closer match.
    """
    item_vec = self._vectorize_item(item_meta)

    score = 0.0

    # (fix) Removed leftover debug `print(profile)`: it ran once per
    # candidate in the re-ranking hot loop and wrote profile data to stdout.

    # 1. GENRES
    # Normalize so movies with many genres don't get excessive score.
    for gid in item_vec["genres"]:
        pref = profile.genres.values.get(gid, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["genres"]))
            score += s * GENRES_WEIGHT

        # Soft prior bias (genre-only): rewards the user's favorite genres
        # even when the per-feature preference signal is weak.
        base_pref = profile.top_genres_normalized.get(gid, 0.0)
        score += base_pref * BASE_GENRE_WEIGHT

    # 2. KEYWORDS
    for kw in item_vec["keywords"]:
        pref = profile.keywords.values.get(kw, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["keywords"]))
            score += s * KEYWORDS_WEIGHT

    # 3. CAST
    for cid in item_vec["cast"]:
        pref = profile.cast.values.get(cid, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["cast"]))
            score += s * CAST_WEIGHT

    # 4. CREW
    for cr in item_vec["crew"]:
        pref = profile.crew.values.get(cr, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["crew"]))
            score += s * CREW_WEIGHT

    # 5. COUNTRIES
    for c in item_vec["countries"]:
        pref = profile.countries.values.get(c, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["countries"]))
            score += s * COUNTRIES_WEIGHT

    return score
123159

0 commit comments

Comments
 (0)