Skip to content

Commit 1d25318

Browse files
fix: fetch more items if more genres are excluded (#79)
1 parent 722e6e7 commit 1d25318

File tree

2 files changed

+173
-69
lines changed

2 files changed

+173
-69
lines changed

app/services/discovery.py

Lines changed: 97 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,21 @@ async def discover_recommendations(
3434
"""
3535
Find content that matches the user's taste profile using multi-phase TMDB discovery.
3636
"""
37+
# Calculate pages to fetch per query based on excluded genres
38+
num_excluded = len(excluded_genres) if excluded_genres else 0
39+
if num_excluded > 10:
40+
pages_per_query = 5 # Fetch 5 pages when most genres are excluded
41+
elif num_excluded > 5:
42+
pages_per_query = 3 # Fetch 3 pages when many genres are excluded
43+
else:
44+
pages_per_query = 1 # Default: 1 page
45+
3746
# 1. Build Phase 1 Tasks
3847
tasks = self._build_discovery_tasks_phase1(
3948
profile,
4049
content_type,
4150
excluded_genres,
51+
pages_per_query=pages_per_query,
4252
use_genres=use_genres,
4353
use_keywords=use_keywords,
4454
use_cast=use_cast,
@@ -68,6 +78,7 @@ async def discover_recommendations(
6878
profile,
6979
content_type,
7080
excluded_genres,
81+
pages_per_query=pages_per_query,
7182
use_genres=use_genres,
7283
use_keywords=use_keywords,
7384
use_cast=use_cast,
@@ -88,6 +99,7 @@ def _build_discovery_tasks_phase1(
8899
profile: UserTasteProfile,
89100
content_type: str,
90101
excluded_genres: list[int] | None = None,
102+
pages_per_query: int = 1,
91103
**opts,
92104
) -> list[Any]:
93105
"""Construct the initial set of discovery tasks based on top profile features."""
@@ -106,32 +118,40 @@ def _build_discovery_tasks_phase1(
106118
if excluded_genres:
107119
base_params["without_genres"] = "|".join([str(g) for g in excluded_genres])
108120

109-
# Query 1: Top Genres
121+
# Query 1: Top Genres - fetch multiple pages
110122
if top_genres:
111123
genre_ids = "|".join([str(g[0]) for g in top_genres])
112-
tasks.append(
113-
self._fetch_discovery(
114-
content_type,
115-
{"with_genres": genre_ids, "sort_by": "popularity.desc", "vote_count.gte": 500, **base_params},
116-
)
117-
)
118-
tasks.append(
119-
self._fetch_discovery(
120-
content_type,
121-
{"with_genres": genre_ids, "sort_by": "vote_average.desc", "vote_count.gte": 500, **base_params},
122-
)
123-
)
124+
for page in range(1, pages_per_query + 1):
125+
for sort_by_option in ["popularity.desc", "vote_average.desc"]:
126+
tasks.append(
127+
self._fetch_discovery(
128+
content_type,
129+
{
130+
"with_genres": genre_ids,
131+
"sort_by": sort_by_option,
132+
"vote_count.gte": 500,
133+
"page": page,
134+
**base_params,
135+
},
136+
)
137+
)
124138

125-
# Query 2: Top Keywords
139+
# Query 2: Top Keywords - fetch multiple pages
126140
if top_keywords:
127141
keyword_ids = "|".join([str(k[0]) for k in top_keywords])
128-
tasks.append(
129-
self._fetch_discovery(
130-
content_type,
131-
{"with_keywords": keyword_ids, "sort_by": "popularity.desc", "vote_count.gte": 500, **base_params},
142+
for page in range(1, pages_per_query + 1):
143+
tasks.append(
144+
self._fetch_discovery(
145+
content_type,
146+
{
147+
"with_keywords": keyword_ids,
148+
"sort_by": "popularity.desc",
149+
"vote_count.gte": 500,
150+
"page": page,
151+
**base_params,
152+
},
153+
)
132154
)
133-
)
134-
for page in range(1, 3):
135155
tasks.append(
136156
self._fetch_discovery(
137157
content_type,
@@ -145,55 +165,62 @@ def _build_discovery_tasks_phase1(
145165
)
146166
)
147167

148-
# Query 3: Cast & Crew
168+
# Query 3: Cast & Crew - fetch multiple pages
149169
is_tv = content_type in ("tv", "series")
150170
for actor in top_cast:
151-
p = {"sort_by": "popularity.desc", "vote_count.gte": 500, **base_params}
152-
p["with_people" if is_tv else "with_cast"] = str(actor[0])
153-
tasks.append(self._fetch_discovery(content_type, p))
171+
for page in range(1, pages_per_query + 1):
172+
p = {"sort_by": "popularity.desc", "vote_count.gte": 500, "page": page, **base_params}
173+
p["with_people" if is_tv else "with_cast"] = str(actor[0])
174+
tasks.append(self._fetch_discovery(content_type, p))
154175

155176
if top_crew:
156-
p = {"sort_by": "vote_average.desc", "vote_count.gte": 500, **base_params}
157-
p["with_people" if is_tv else "with_crew"] = str(top_crew[0][0])
158-
tasks.append(self._fetch_discovery(content_type, p))
177+
for page in range(1, pages_per_query + 1):
178+
p = {"sort_by": "vote_average.desc", "vote_count.gte": 500, "page": page, **base_params}
179+
p["with_people" if is_tv else "with_crew"] = str(top_crew[0][0])
180+
tasks.append(self._fetch_discovery(content_type, p))
159181

160-
# Query 4: Countries & Year
182+
# Query 4: Countries & Year - fetch multiple pages
161183
if top_countries:
162184
country_ids = "|".join([str(c[0]) for c in top_countries])
163-
tasks.append(
164-
self._fetch_discovery(
165-
content_type,
166-
{
167-
"with_origin_country": country_ids,
168-
"sort_by": "popularity.desc",
169-
"vote_count.gte": 100,
170-
**base_params,
171-
},
185+
for page in range(1, pages_per_query + 1):
186+
tasks.append(
187+
self._fetch_discovery(
188+
content_type,
189+
{
190+
"with_origin_country": country_ids,
191+
"sort_by": "popularity.desc",
192+
"vote_count.gte": 100,
193+
"page": page,
194+
**base_params,
195+
},
196+
)
172197
)
173-
)
174198

175199
if top_year:
176200
year = top_year[0][0]
177201
prefix = "first_air_date" if is_tv else "primary_release_date"
178-
tasks.append(
179-
self._fetch_discovery(
180-
content_type,
181-
{
182-
"sort_by": "vote_average.desc",
183-
"vote_count.gte": 500,
184-
f"{prefix}.gte": f"{year}-01-01",
185-
f"{prefix}.lte": f"{int(year)+9}-12-31",
186-
**base_params,
187-
},
202+
for page in range(1, pages_per_query + 1):
203+
tasks.append(
204+
self._fetch_discovery(
205+
content_type,
206+
{
207+
"sort_by": "vote_average.desc",
208+
"vote_count.gte": 500,
209+
f"{prefix}.gte": f"{year}-01-01",
210+
f"{prefix}.lte": f"{int(year)+9}-12-31",
211+
"page": page,
212+
**base_params,
213+
},
214+
)
188215
)
189-
)
190216
return tasks
191217

192218
def _build_discovery_tasks_phase2(
193219
self,
194220
profile: UserTasteProfile,
195221
content_type: str,
196222
excluded_genres: list[int] | None = None,
223+
pages_per_query: int = 1,
197224
**opts,
198225
) -> list[Any]:
199226
"""Construct additional discovery tasks with lower thresholds to fill out candidate pool."""
@@ -202,32 +229,41 @@ def _build_discovery_tasks_phase2(
202229
top_cast = profile.cast.get_top_features(limit=1) if opts.get("use_cast") else []
203230

204231
tasks = []
205-
base_params = {"vote_count.gte": 400, "page": 2}
232+
base_params = {"vote_count.gte": 400}
206233
if excluded_genres:
207234
base_params["without_genres"] = "|".join([str(g) for g in excluded_genres])
208235

236+
# Start from page 2 for phase 2, but fetch multiple pages if needed
237+
start_page = 2
238+
end_page = start_page + pages_per_query
239+
209240
if top_genres:
210241
genre_ids = "|".join([str(g[0]) for g in top_genres])
211-
tasks.append(
212-
self._fetch_discovery(
213-
content_type, {"with_genres": genre_ids, "sort_by": "vote_average.desc", **base_params}
242+
for page in range(start_page, end_page):
243+
tasks.append(
244+
self._fetch_discovery(
245+
content_type,
246+
{"with_genres": genre_ids, "sort_by": "vote_average.desc", "page": page, **base_params},
247+
)
214248
)
215-
)
216249

217250
if top_keywords:
218251
keyword_ids = "|".join([str(k[0]) for k in top_keywords])
219-
tasks.append(
220-
self._fetch_discovery(
221-
content_type, {"with_keywords": keyword_ids, "sort_by": "vote_average.desc", **base_params}
252+
for page in range(start_page, end_page):
253+
tasks.append(
254+
self._fetch_discovery(
255+
content_type,
256+
{"with_keywords": keyword_ids, "sort_by": "vote_average.desc", "page": page, **base_params},
257+
)
222258
)
223-
)
224259

225260
if top_cast:
226261
actor_id = top_cast[0][0]
227262
is_tv = content_type in ("tv", "series")
228-
p = {"sort_by": "vote_average.desc", **base_params}
229-
p["with_people" if is_tv else "with_cast"] = str(actor_id)
230-
tasks.append(self._fetch_discovery(content_type, p))
263+
for page in range(start_page, end_page):
264+
p = {"sort_by": "vote_average.desc", "page": page, **base_params}
265+
p["with_people" if is_tv else "with_cast"] = str(actor_id)
266+
tasks.append(self._fetch_discovery(content_type, p))
231267

232268
return tasks
233269

app/services/recommendation/engine.py

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,37 @@ def _apply_diversification(self, pool: list, targets: dict, max_results: int) ->
594594

595595
return result
596596

597+
def _filter_candidates_by_watched_and_genres(
598+
self, candidates: list[dict], watched_tmdb: set[int], whitelist: set[int], existing_ids: set[int] | None = None
599+
) -> list[dict]:
600+
"""
601+
Filter candidates by watched items and genre whitelist.
602+
603+
Args:
604+
candidates: List of candidate items to filter
605+
watched_tmdb: Set of watched TMDB IDs to exclude
606+
whitelist: Set of preferred genre IDs
607+
existing_ids: Optional set of IDs to exclude (for deduplication)
608+
609+
Returns:
610+
Filtered list of candidates
611+
"""
612+
filtered = []
613+
existing = existing_ids or set()
614+
615+
for it in candidates:
616+
item_id = it.get("id")
617+
if not item_id or item_id in existing:
618+
continue
619+
if item_id in watched_tmdb:
620+
continue
621+
if not RecommendationFiltering.passes_top_genre_whitelist(it.get("genre_ids"), whitelist):
622+
continue
623+
filtered.append(it)
624+
existing.add(item_id)
625+
626+
return filtered
627+
597628
async def get_recommendations_for_theme(self, theme_id: str, content_type: str, limit: int = 20) -> list[dict]:
598629
"""Parse theme and fetch recommendations with strict filtering."""
599630
params = {}
@@ -636,8 +667,23 @@ async def get_recommendations_for_theme(self, theme_id: str, content_type: str,
636667

637668
whitelist = await self._get_genre_whitelist(content_type)
638669
candidates = []
670+
671+
# Calculate how many pages to fetch based on excluded genres
672+
# When many genres are excluded, we need to fetch more pages to get enough results
673+
num_excluded = len(excluded_ids) if excluded_ids else 0
674+
# Movies and Series both have ~20 genres, so if more than 10 are excluded, fetch more pages
675+
if num_excluded > 10:
676+
# Fetch 10 pages when most genres are excluded
677+
pages_to_fetch = list(range(1, 11))
678+
elif num_excluded > 5:
679+
# Fetch 5 pages when many genres are excluded
680+
pages_to_fetch = list(range(1, 6))
681+
else:
682+
# Default: 3 pages
683+
pages_to_fetch = [1, 2, 3]
684+
639685
try:
640-
discover_tasks = [self.tmdb_service.get_discover(content_type, page=p, **params) for p in [1, 2, 3]]
686+
discover_tasks = [self.tmdb_service.get_discover(content_type, page=p, **params) for p in pages_to_fetch]
641687
discover_results = await asyncio.gather(*discover_tasks, return_exceptions=True)
642688
for res in discover_results:
643689
if isinstance(res, Exception):
@@ -652,13 +698,35 @@ async def get_recommendations_for_theme(self, theme_id: str, content_type: str,
652698
)
653699

654700
# Initial filter
655-
filtered = []
656-
for it in candidates:
657-
if it.get("id") in watched_tmdb:
658-
continue
659-
if not RecommendationFiltering.passes_top_genre_whitelist(it.get("genre_ids"), whitelist):
660-
continue
661-
filtered.append(it)
701+
filtered = self._filter_candidates_by_watched_and_genres(candidates, watched_tmdb, whitelist)
702+
703+
# If we still don't have enough candidates, fetch more pages
704+
max_page_fetched = max(pages_to_fetch) if pages_to_fetch else 0
705+
if len(filtered) < limit * 2 and max_page_fetched < 15:
706+
try:
707+
# Fetch additional pages starting from where we left off
708+
next_page_start = max_page_fetched + 1
709+
additional_pages = list(range(next_page_start, min(next_page_start + 5, 20)))
710+
if additional_pages:
711+
logger.info(f"Fetching additional pages {additional_pages} due to insufficient candidates")
712+
additional_tasks = [
713+
self.tmdb_service.get_discover(content_type, page=p, **params) for p in additional_pages
714+
]
715+
additional_results = await asyncio.gather(*additional_tasks, return_exceptions=True)
716+
# Collect new candidates from additional pages
717+
new_candidates = []
718+
for res in additional_results:
719+
if isinstance(res, Exception):
720+
continue
721+
new_candidates.extend(res.get("results", []))
722+
# Filter new candidates, excluding already processed ones
723+
existing_ids = {it.get("id") for it in filtered}
724+
additional_filtered = self._filter_candidates_by_watched_and_genres(
725+
new_candidates, watched_tmdb, whitelist, existing_ids
726+
)
727+
filtered.extend(additional_filtered)
728+
except Exception as e:
729+
logger.warning(f"Failed to fetch additional pages: {e}")
662730

663731
if len(filtered) < limit * 2:
664732
tmp_pool = {it["id"]: it for it in filtered}

0 commit comments

Comments (0)