refactor: update profile constants and enhance recommendation logic

TimilsinaBimal · TimilsinaBimal · commit ed126a7159dd · 2025-12-31T23:47:32.000+05:45
- Reduced SMART_SAMPLING_MAX_ITEMS from 50 to 30 for improved sampling efficiency.
- Increased TOP_PICKS_MIN_VOTE_COUNT from 100 to 300 to ensure higher quality recommendations.
- Modified SmartSampler to cap strong-signal items as a percentage of max_items (loved/liked: 45%, added: 20%) instead of always including all of them.
- Updated TopPicksService to utilize SmartSampler for item selection and added filtering by vote count and rating in discovery queries.
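- Adjusted TMDBService caching settings for better performance: reduced cache sizes for find_by_imdb_id, get_movie_details and get_tv_details, and added alru_cache (maxsize=500, ttl=86400) to get_recommendations and get_similar.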
diff --git a/app/services/profile/constants.py b/app/services/profile/constants.py
@@ -40,7 +40,7 @@
 RECENCY_DECAY_RATE: Final[float] = 0.98  # Daily decay multiplier (soft decay)
 
 # Smart Sampling
-SMART_SAMPLING_MAX_ITEMS: Final[int] = 50
+SMART_SAMPLING_MAX_ITEMS: Final[int] = 30
 
 # Frequency Multiplier (optional, subtle boost for repeated patterns)
 FREQUENCY_ENABLED: Final[bool] = True
@@ -52,7 +52,7 @@
 TOP_PICKS_GENRE_CAP: Final[float] = 0.30  # Max 30% per genre
 TOP_PICKS_CREATOR_CAP: Final[int] = 2  # Max 2 items per creator (director/actor)
 TOP_PICKS_ERA_CAP: Final[float] = 0.40  # Max 40% per era
-TOP_PICKS_MIN_VOTE_COUNT: Final[int] = 100  # Minimum vote count for quality
+TOP_PICKS_MIN_VOTE_COUNT: Final[int] = 300  # Minimum vote count for quality
 TOP_PICKS_MIN_RATING: Final[float] = 5.0  # Minimum weighted rating for quality
 
 # Genre whitelist limit (top N genres)
diff --git a/app/services/profile/sampling.py b/app/services/profile/sampling.py
@@ -69,8 +69,8 @@ def sample_items(
             if not (it.get("_is_loved") or it.get("_is_liked") or it.get("_id") in added_item_ids)
         ]
 
-        # Always include all strong signal items
-        strong_signal_items = loved_liked_items + added_items
+        # Always include strong signal items: Loved/Liked: 45%, Added: 20%
+        strong_signal_items = loved_liked_items[: int(max_items * 0.45)] + added_items[: int(max_items * 0.20)]
         strong_signal_scored = [self.scoring_service.process_item(it) for it in strong_signal_items]
 
         # Score watched items and sort by score
diff --git a/app/services/recommendation/item_based.py b/app/services/recommendation/item_based.py
@@ -11,6 +11,7 @@
     filter_watched_by_imdb,
     resolve_tmdb_id,
 )
+from app.services.tmdb.service import TMDBService
 
 
 class ItemBasedService:
@@ -19,7 +20,7 @@ class ItemBasedService:
     """
 
     def __init__(self, tmdb_service: Any, user_settings: Any = None):
-        self.tmdb_service = tmdb_service
+        self.tmdb_service: TMDBService = tmdb_service
         self.user_settings = user_settings
 
     async def get_recommendations_for_item(
@@ -77,7 +78,7 @@ async def get_recommendations_for_item(
         # Final filter (remove watched by IMDB ID)
         final = filter_watched_by_imdb(enriched, watched_imdb or set())
 
-        return final
+        return final[:limit]
 
     async def _fetch_candidates(self, tmdb_id: int, mtype: str) -> list[dict[str, Any]]:
         """
@@ -92,18 +93,18 @@ async def _fetch_candidates(self, tmdb_id: int, mtype: str) -> list[dict[str, An
         """
         combined = {}
 
-        # Fetch 2 pages each for recommendations and similar
-        for action in ["recommendations", "similar"]:
-            method = getattr(self.tmdb_service, f"get_{action}")
-            results = await asyncio.gather(*[method(tmdb_id, mtype, page=p) for p in [1, 2]], return_exceptions=True)
-
-            for res in results:
-                if isinstance(res, Exception):
-                    logger.debug(f"Error fetching {action} for {tmdb_id}: {res}")
-                    continue
-                for item in res.get("results", []):
-                    item_id = item.get("id")
-                    if item_id:
-                        combined[item_id] = item
+        results = await asyncio.gather(
+            *[self.tmdb_service.get_recommendations(tmdb_id, mtype, page=p) for p in [1, 2]],
+            return_exceptions=True,
+        )
+
+        for res in results:
+            if isinstance(res, Exception):
+                logger.warning(f"Error fetching recommendations for {tmdb_id}: {res}")
+                continue
+            for item in res.get("results", []):
+                item_id = item.get("id")
+                if item_id:
+                    combined[item_id] = item
 
         return list(combined.values())
diff --git a/app/services/recommendation/top_picks.py b/app/services/recommendation/top_picks.py
@@ -14,10 +14,12 @@
     TOP_PICKS_MIN_VOTE_COUNT,
     TOP_PICKS_RECENCY_CAP,
 )
+from app.services.profile.sampling import SmartSampler
 from app.services.profile.scorer import ProfileScorer
 from app.services.recommendation.metadata import RecommendationMetadata
 from app.services.recommendation.scoring import RecommendationScoring
 from app.services.recommendation.utils import content_type_to_mtype, filter_watched_by_imdb, resolve_tmdb_id
+from app.services.scoring import ScoringService
 from app.services.tmdb.service import TMDBService
 
 
@@ -30,6 +32,8 @@ def __init__(self, tmdb_service: TMDBService, user_settings: UserSettings | None
         self.tmdb_service: TMDBService = tmdb_service
         self.user_settings: UserSettings | None = user_settings
         self.scorer: ProfileScorer = ProfileScorer()
+        self.scoring_service = ScoringService()
+        self.smart_sampler = SmartSampler(self.scoring_service)
 
     async def get_top_picks(
         self,
@@ -142,16 +146,7 @@ async def _fetch_recommendations_from_top_items(
             List of candidate items
         """
         # Get top items (loved first, then liked, then added, then top watched)
-        all_items = (
-            library_items.get("loved", [])
-            + library_items.get("liked", [])
-            + library_items.get("added", [])
-            + library_items.get("watched", [])
-        )
-        typed_items = [it for it in all_items if it.get("type") == content_type]
-
-        # Limit to top 5 items (to avoid too many API calls)
-        top_items = typed_items[:5]
+        top_items = self.smart_sampler.sample_items(library_items, content_type, max_items=15)
 
         candidates = []
         tasks = []
@@ -168,7 +163,7 @@ async def _fetch_recommendations_from_top_items(
 
             # Fetch recommendations (1 page only)
             tasks.append(self.tmdb_service.get_recommendations(tmdb_id, mtype, page=1))
-            tasks.append(self.tmdb_service.get_similar(tmdb_id, mtype, page=1))
+            # tasks.append(self.tmdb_service.get_similar(tmdb_id, mtype, page=1))
 
         # Execute all in parallel
         results = await asyncio.gather(*tasks, return_exceptions=True)
@@ -196,10 +191,11 @@ async def _fetch_discover_with_profile(
         """
         # Get top features from profile
         top_genres = profile.get_top_genres(limit=2)
-        top_keywords = profile.get_top_keywords(limit=2)
+        top_keywords = profile.get_top_keywords(limit=3)
         top_directors = profile.get_top_directors(limit=2)
         top_cast = profile.get_top_cast(limit=2)
         top_eras = profile.get_top_eras(limit=1)
+        top_countries = profile.get_top_countries(limit=1)
 
         candidates = []
         tasks = []
@@ -209,7 +205,12 @@ async def _fetch_discover_with_profile(
             genre_ids = [g[0] for g in top_genres]
             tasks.append(
                 self.tmdb_service.get_discover(
-                    mtype, with_genres="|".join(str(g) for g in genre_ids), page=1, sort_by="popularity.desc"
+                    mtype,
+                    with_genres="|".join(str(g) for g in genre_ids),
+                    page=1,
+                    sort_by="popularity.desc",
+                    vote_count_gte=TOP_PICKS_MIN_VOTE_COUNT,
+                    vote_average_gte=TOP_PICKS_MIN_RATING,
                 )
             )
 
@@ -218,22 +219,41 @@ async def _fetch_discover_with_profile(
             keyword_ids = [k[0] for k in top_keywords]
             tasks.append(
                 self.tmdb_service.get_discover(
-                    mtype, with_keywords="|".join(str(k) for k in keyword_ids), page=1, sort_by="popularity.desc"
+                    mtype,
+                    with_keywords="|".join(str(k) for k in keyword_ids),
+                    page=1,
+                    sort_by="popularity.desc",
+                    vote_count_gte=TOP_PICKS_MIN_VOTE_COUNT,
+                    vote_average_gte=TOP_PICKS_MIN_RATING,
                 )
             )
 
         # Discover with directors
         if top_directors:
             director_id = top_directors[0][0]
             tasks.append(
-                self.tmdb_service.get_discover(mtype, with_crew=str(director_id), page=1, sort_by="popularity.desc")
+                self.tmdb_service.get_discover(
+                    mtype,
+                    with_crew=str(director_id),
+                    page=1,
+                    sort_by="popularity.desc",
+                    vote_count_gte=TOP_PICKS_MIN_VOTE_COUNT,
+                    vote_average_gte=TOP_PICKS_MIN_RATING,
+                )
             )
 
         # Discover with cast
         if top_cast:
             cast_id = top_cast[0][0]
             tasks.append(
-                self.tmdb_service.get_discover(mtype, with_cast=str(cast_id), page=1, sort_by="popularity.desc")
+                self.tmdb_service.get_discover(
+                    mtype,
+                    with_cast=str(cast_id),
+                    page=1,
+                    sort_by="popularity.desc",
+                    vote_count_gte=TOP_PICKS_MIN_VOTE_COUNT,
+                    vote_average_gte=TOP_PICKS_MIN_RATING,
+                )
             )
 
         # Discover with era (year range)
@@ -248,9 +268,25 @@ async def _fetch_discover_with_profile(
                         **{f"{prefix}.gte": f"{year_start}-01-01", f"{prefix}.lte": f"{year_start+9}-12-31"},
                         page=1,
                         sort_by="popularity.desc",
+                        vote_count_gte=TOP_PICKS_MIN_VOTE_COUNT,
+                        vote_average_gte=TOP_PICKS_MIN_RATING,
                     )
                 )
 
+        # Discover with countries
+        if top_countries:
+            country_codes = [c[0] for c in top_countries]
+            tasks.append(
+                self.tmdb_service.get_discover(
+                    mtype,
+                    with_origin_country="|".join(country_codes),
+                    page=1,
+                    sort_by="popularity.desc",
+                    vote_count_gte=TOP_PICKS_MIN_VOTE_COUNT,
+                    vote_average_gte=TOP_PICKS_MIN_RATING,
+                )
+            )
+
         # Execute all in parallel
         results = await asyncio.gather(*tasks, return_exceptions=True)
 
diff --git a/app/services/tmdb/service.py b/app/services/tmdb/service.py
@@ -6,6 +6,8 @@
 
 from app.services.tmdb.client import TMDBClient
 
+# from app.services.profile.constants import TOP_PICKS_MIN_VOTE_COUNT, TOP_PICKS_MIN_RATING
+
 
 class TMDBService:
     """
@@ -20,7 +22,7 @@ async def close(self):
         """Close the underlying HTTP client."""
         await self.client.close()
 
-    @alru_cache(maxsize=2000)
+    @alru_cache(maxsize=1000)
     async def find_by_imdb_id(self, imdb_id: str) -> tuple[int | None, str | None]:
         """Find TMDB ID and type by IMDB ID."""
         try:
@@ -49,23 +51,25 @@ async def find_by_imdb_id(self, imdb_id: str) -> tuple[int | None, str | None]:
             logger.exception(f"Error finding TMDB ID for IMDB {imdb_id}: {e}")
             return None, None
 
-    @alru_cache(maxsize=5000)
+    @alru_cache(maxsize=500)
     async def get_movie_details(self, movie_id: int) -> dict[str, Any]:
         """Get details of a specific movie with credits and keywords."""
         params = {"append_to_response": "credits,external_ids,keywords"}
         return await self.client.get(f"/movie/{movie_id}", params=params)
 
-    @alru_cache(maxsize=5000)
+    @alru_cache(maxsize=500)
     async def get_tv_details(self, tv_id: int) -> dict[str, Any]:
         """Get details of a specific TV series with credits and keywords."""
         params = {"append_to_response": "credits,external_ids,keywords"}
         return await self.client.get(f"/tv/{tv_id}", params=params)
 
+    @alru_cache(maxsize=500, ttl=86400)
     async def get_recommendations(self, tmdb_id: int, media_type: str, page: int = 1) -> dict[str, Any]:
         """Get recommendations based on TMDB ID and media type."""
         params = {"page": page}
         return await self.client.get(f"/{media_type}/{tmdb_id}/recommendations", params=params)
 
+    @alru_cache(maxsize=500, ttl=86400)
     async def get_similar(self, tmdb_id: int, media_type: str, page: int = 1) -> dict[str, Any]:
         """Get similar content based on TMDB ID and media type."""
         params = {"page": page}
@@ -84,6 +88,9 @@ async def get_discover(
         params = {"page": page, "sort_by": sort_by}
         if with_genres:
             params["with_genres"] = with_genres
+        # # always filter by vote count
+        # params["vote_count.gte"] = TOP_PICKS_MIN_VOTE_COUNT
+        # params["vote_average.gte"] = TOP_PICKS_MIN_RATING
         params.update(kwargs)
         return await self.client.get(f"/discover/{mt}", params=params)
 

Original file line number	Diff line number	Diff line change
`@@ -69,8 +69,8 @@ def sample_items(`
`69`	`69`	`if not (it.get("_is_loved") or it.get("_is_liked") or it.get("_id") in added_item_ids)`
`70`	`70`	`]`
`71`	`71`
`72`		`- # Always include all strong signal items`
`73`		`- strong_signal_items = loved_liked_items + added_items`
	`72`	`+ # Always include strong signal items: Loved/Liked: 45%, Added: 20%`
	`73`	`+ strong_signal_items = loved_liked_items[: int(max_items * 0.45)] + added_items[: int(max_items * 0.20)]`
`74`	`74`	`strong_signal_scored = [self.scoring_service.process_item(it) for it in strong_signal_items]`
`75`	`75`
`76`	`76`	`# Score watched items and sort by score`