Skip to content

Commit db1a62d

Browse files
opt: improve recommendations using larger candidate pool and better similarity methods
1 parent 6eb8479 commit db1a62d

File tree

4 files changed

+142
-55
lines changed

4 files changed

+142
-55
lines changed

app/core/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.0-rc.3"
1+
__version__ = "1.0.0-rc.4"

app/services/discovery.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ async def discover_recommendations(
3232
top_cast = profile.cast.get_top_features(limit=2)
3333
top_crew = profile.get_top_crew(limit=1) # e.g. [(555, 1.0)] - Director
3434

35+
top_countries = profile.get_top_countries(limit=2)
36+
3537
if not top_genres and not top_keywords and not top_cast:
3638
# Fallback if profile is empty
3739
return []
@@ -41,21 +43,44 @@ async def discover_recommendations(
4143
# Query 1: Top Genres Mix
4244
if top_genres:
4345
genre_ids = "|".join([str(g[0]) for g in top_genres])
44-
params_genres = {"with_genres": genre_ids, "sort_by": "popularity.desc", "vote_count.gte": 100}
45-
tasks.append(self._fetch_discovery(content_type, params_genres))
46+
params_popular = {"with_genres": genre_ids, "sort_by": "popularity.desc", "vote_count.gte": 100}
47+
tasks.append(self._fetch_discovery(content_type, params_popular))
48+
49+
# fetch at least two pages of results
50+
for i in range(2):
51+
params_rating = {
52+
"with_genres": genre_ids,
53+
"sort_by": "ratings.desc",
54+
"vote_count.gte": 300,
55+
"page": i + 1,
56+
}
57+
tasks.append(self._fetch_discovery(content_type, params_rating))
4658

4759
# Query 2: Top Keywords
4860
if top_keywords:
4961
keyword_ids = "|".join([str(k[0]) for k in top_keywords])
5062
params_keywords = {"with_keywords": keyword_ids, "sort_by": "popularity.desc"}
5163
tasks.append(self._fetch_discovery(content_type, params_keywords))
5264

65+
# fetch at least three pages of results
66+
for i in range(3):
67+
params_rating = {
68+
"with_keywords": keyword_ids,
69+
"sort_by": "ratings.desc",
70+
"vote_count.gte": 300,
71+
"page": i + 1,
72+
}
73+
tasks.append(self._fetch_discovery(content_type, params_rating))
74+
5375
# Query 3: Top Actors
5476
for actor in top_cast:
5577
actor_id = actor[0]
5678
params_actor = {"with_cast": str(actor_id), "sort_by": "popularity.desc"}
5779
tasks.append(self._fetch_discovery(content_type, params_actor))
5880

81+
params_rating = {"with_cast": str(actor_id), "sort_by": "ratings.desc", "vote_count.gte": 300}
82+
tasks.append(self._fetch_discovery(content_type, params_rating))
83+
5984
# Query 4: Top Director
6085
if top_crew:
6186
director_id = top_crew[0][0]
@@ -65,6 +90,18 @@ async def discover_recommendations(
6590
}
6691
tasks.append(self._fetch_discovery(content_type, params_director))
6792

93+
params_rating = {"with_crew": str(director_id), "sort_by": "ratings.desc", "vote_count.gte": 300}
94+
tasks.append(self._fetch_discovery(content_type, params_rating))
95+
96+
# Query 5: Top Countries
97+
if top_countries:
98+
country_ids = "|".join([str(c[0]) for c in top_countries])
99+
params_country = {"with_origin_country": country_ids, "sort_by": "popularity.desc", "vote_count.gte": 100}
100+
tasks.append(self._fetch_discovery(content_type, params_country))
101+
102+
params_rating = {"with_origin_country": country_ids, "sort_by": "ratings.desc", "vote_count.gte": 300}
103+
tasks.append(self._fetch_discovery(content_type, params_rating))
104+
68105
# 3. Execute Parallel Queries
69106
results_batches = await asyncio.gather(*tasks, return_exceptions=True)
70107

app/services/recommendation_service.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@
1212
from app.services.user_profile import UserProfileService
1313

1414

15+
def normalize(value, min_v=0, max_v=10):
    """Linearly rescale *value* from the range [min_v, max_v] into [0, 1].

    Used to put popularity and rating onto a comparable scale before
    blending them into a single ranking score.

    Args:
        value: The raw metric to rescale.
        min_v: Lower bound of the expected input range (default 0).
        max_v: Upper bound of the expected input range (default 10).

    Returns:
        The rescaled value. A degenerate range (min_v == max_v) maps
        everything to 0 rather than dividing by zero. Note that inputs
        outside [min_v, max_v] are NOT clamped and can yield results
        outside [0, 1].
    """
    span = max_v - min_v
    return (value - min_v) / span if span else 0
22+
23+
1524
def _parse_identifier(identifier: str) -> tuple[str | None, int | None]:
1625
"""Parse Stremio identifier to extract IMDB ID and TMDB ID."""
1726
if not identifier:
@@ -350,7 +359,7 @@ async def _fetch_recommendations_from_tmdb(self, item_id: str, media_type: str,
350359
recommended_items = recommendation_response.get("results", [])
351360
if not recommended_items:
352361
return []
353-
return recommended_items[:limit]
362+
return recommended_items
354363

355364
async def get_recommendations(
356365
self,
@@ -396,28 +405,29 @@ async def get_recommendations(
396405
tasks_a = []
397406
for source in top_source_items:
398407
tasks_a.append(self._fetch_recommendations_from_tmdb(source.get("_id"), source.get("type"), limit=10))
408+
similarity_candidates = []
409+
similarity_recommendations = await asyncio.gather(*tasks_a, return_exceptions=True)
410+
similarity_recommendations = [item for item in similarity_recommendations if not isinstance(item, Exception)]
411+
for item in similarity_recommendations:
412+
similarity_candidates.extend(item)
399413

400414
# --- Candidate Set B: Profile-based Discovery ---
401415
# Use typed profile based on content_type
402416
user_profile = await self.user_profile_service.build_user_profile(scored_objects, content_type=content_type)
403-
task_b = self.discovery_engine.discover_recommendations(user_profile, content_type, limit=20)
404-
405-
# Execute all fetches
406-
all_results = await asyncio.gather(task_b, *tasks_a, return_exceptions=True)
407-
408-
discovery_candidates = all_results[0] if isinstance(all_results[0], list) else []
409-
similarity_batches = all_results[1:]
417+
discovery_candidates = await self.discovery_engine.discover_recommendations(
418+
user_profile, content_type, limit=20
419+
)
410420

411421
# --- Combine & Deduplicate ---
412422
candidate_pool = {} # tmdb_id -> item_dict
413423

414424
for item in discovery_candidates:
415425
candidate_pool[item["id"]] = item
416426

417-
for batch in similarity_batches:
418-
if isinstance(batch, list):
419-
for item in batch:
420-
candidate_pool[item["id"]] = item
427+
for item in similarity_candidates:
428+
# Flag the item so the re-ranking step below can boost
# candidates that came from TMDB's collaborative recommendations.
429+
item["_ranked_candidate"] = True
430+
candidate_pool[item["id"]] = item
421431

422432
# --- Re-Ranking & Filtering ---
423433
ranked_candidates = []
@@ -430,11 +440,15 @@ async def get_recommendations(
430440
sim_score = self.user_profile_service.calculate_similarity(user_profile, item)
431441
vote_average = item.get("vote_average", 0)
432442
popularity = item.get("popularity", 0)
433-
import math
434443

435-
pop_score = math.log(popularity + 1) if popularity > 0 else 0
444+
pop_score = normalize(popularity, 0, 1000)
445+
vote_score = normalize(vote_average, 0, 10)
446+
447+
final_score = (sim_score * 0.6) + (vote_score * 0.3) + (pop_score * 0.1)
436448

437-
final_score = (sim_score * 0.7) + (vote_average * 0.2) + (pop_score * 0.1)
449+
# Boost candidate if it's from TMDB collaborative recommendations
450+
if item.get("_ranked_candidate"):
451+
final_score *= 1.25
438452
ranked_candidates.append((final_score, item))
439453

440454
# Sort by Final Score

app/services/user_profile.py

Lines changed: 73 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,24 @@
55
from app.services.tmdb_service import TMDBService
66

77
# TODO: Make these weights dynamic based on user's preferences.
8-
GENRES_WEIGHT = 1.0
9-
KEYWORDS_WEIGHT = 2.0
10-
CAST_WEIGHT = 1.2
11-
CREW_WEIGHT = 1.2
12-
YEAR_WEIGHT = 0.5
13-
COUNTRIES_WEIGHT = 0.5
8+
GENRES_WEIGHT = 0.3
9+
KEYWORDS_WEIGHT = 0.40
10+
CAST_WEIGHT = 0.1
11+
CREW_WEIGHT = 0.1
12+
YEAR_WEIGHT = 0.05
13+
COUNTRIES_WEIGHT = 0.05
14+
BASE_GENRE_WEIGHT = 0.15
15+
16+
17+
def emphasis(x: float) -> float:
    """Apply a super-linear curve so strong preferences stand out.

    Raising the preference weight to the 1.25 power widens the gap
    between weak and strong signals during similarity scoring.
    """
    return pow(x, 1.25)
22+
23+
24+
def safe_div(a, b):
    """Divide *a* by *b*, returning 0.0 instead of raising on a falsy divisor."""
    if not b:
        return 0.0
    return a / b
1426

1527

1628
class UserProfileService:
@@ -83,41 +95,65 @@ async def build_user_profile(
8395

8496
def calculate_similarity(self, profile: UserTasteProfile, item_meta: dict) -> float:
    """Score how closely a candidate item matches the user's taste profile.

    Performs sparse feature matching across genres, keywords, cast, crew,
    and countries. Each matched feature's preference weight is passed
    through a non-linear emphasis curve, divided by the number of features
    the item carries in that dimension (so feature-heavy items are not
    over-rewarded), then blended with the per-dimension weight constants.
    Genres additionally receive a soft prior bias from the profile's
    normalized top genres.

    NOTE(review): the YEAR dimension from the previous implementation is
    not scored here — confirm that dropping it was intentional.

    Args:
        profile: Aggregated user taste profile with per-feature weights.
        item_meta: Raw candidate item metadata (TMDB-style dict).

    Returns:
        A non-negative float; higher means a closer match.
    """
    item_vec = self._vectorize_item(item_meta)

    score = 0.0

    # (fix) Removed leftover debug `print(profile)`: it ran once per
    # candidate in the re-ranking hot loop and wrote profile data to stdout.

    # 1. GENRES
    # Normalize so movies with many genres don't get excessive score.
    for gid in item_vec["genres"]:
        pref = profile.genres.values.get(gid, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["genres"]))
            score += s * GENRES_WEIGHT

        # Soft prior bias (genre-only): rewards the user's favorite genres
        # even when the per-feature preference signal is weak.
        base_pref = profile.top_genres_normalized.get(gid, 0.0)
        score += base_pref * BASE_GENRE_WEIGHT

    # 2. KEYWORDS
    for kw in item_vec["keywords"]:
        pref = profile.keywords.values.get(kw, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["keywords"]))
            score += s * KEYWORDS_WEIGHT

    # 3. CAST
    for cid in item_vec["cast"]:
        pref = profile.cast.values.get(cid, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["cast"]))
            score += s * CAST_WEIGHT

    # 4. CREW
    for cr in item_vec["crew"]:
        pref = profile.crew.values.get(cr, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["crew"]))
            score += s * CREW_WEIGHT

    # 5. COUNTRIES
    for c in item_vec["countries"]:
        pref = profile.countries.values.get(c, 0.0)

        if pref > 0:
            s = emphasis(pref)
            s = safe_div(s, len(item_vec["countries"]))
            score += s * COUNTRIES_WEIGHT

    return score
123159

0 commit comments

Comments
 (0)