Skip to content

Commit 1d25318

Browse files
fix: fetch more items if more genres are excluded (#79)
1 parent 722e6e7 commit 1d25318

File tree

2 files changed

+173
-69
lines changed

2 files changed

+173
-69
lines changed

app/services/discovery.py

Lines changed: 97 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,21 @@ async def discover_recommendations(
3434
"""
3535
Find content that matches the user's taste profile using multi-phase TMDB discovery.
3636
"""
37+
# Calculate pages to fetch per query based on excluded genres
38+
num_excluded = len(excluded_genres) if excluded_genres else 0
39+
if num_excluded > 10:
40+
pages_per_query = 5 # Fetch 5 pages when most genres are excluded
41+
elif num_excluded > 5:
42+
pages_per_query = 3 # Fetch 3 pages when many genres are excluded
43+
else:
44+
pages_per_query = 1 # Default: 1 page
45+
3746
# 1. Build Phase 1 Tasks
3847
tasks = self._build_discovery_tasks_phase1(
3948
profile,
4049
content_type,
4150
excluded_genres,
51+
pages_per_query=pages_per_query,
4252
use_genres=use_genres,
4353
use_keywords=use_keywords,
4454
use_cast=use_cast,
@@ -68,6 +78,7 @@ async def discover_recommendations(
6878
profile,
6979
content_type,
7080
excluded_genres,
81+
pages_per_query=pages_per_query,
7182
use_genres=use_genres,
7283
use_keywords=use_keywords,
7384
use_cast=use_cast,
@@ -88,6 +99,7 @@ def _build_discovery_tasks_phase1(
8899
profile: UserTasteProfile,
89100
content_type: str,
90101
excluded_genres: list[int] | None = None,
102+
pages_per_query: int = 1,
91103
**opts,
92104
) -> list[Any]:
93105
"""Construct the initial set of discovery tasks based on top profile features."""
@@ -106,32 +118,40 @@ def _build_discovery_tasks_phase1(
106118
if excluded_genres:
107119
base_params["without_genres"] = "|".join([str(g) for g in excluded_genres])
108120

109-
# Query 1: Top Genres
121+
# Query 1: Top Genres - fetch multiple pages
110122
if top_genres:
111123
genre_ids = "|".join([str(g[0]) for g in top_genres])
112-
tasks.append(
113-
self._fetch_discovery(
114-
content_type,
115-
{"with_genres": genre_ids, "sort_by": "popularity.desc", "vote_count.gte": 500, **base_params},
116-
)
117-
)
118-
tasks.append(
119-
self._fetch_discovery(
120-
content_type,
121-
{"with_genres": genre_ids, "sort_by": "vote_average.desc", "vote_count.gte": 500, **base_params},
122-
)
123-
)
124+
for page in range(1, pages_per_query + 1):
125+
for sort_by_option in ["popularity.desc", "vote_average.desc"]:
126+
tasks.append(
127+
self._fetch_discovery(
128+
content_type,
129+
{
130+
"with_genres": genre_ids,
131+
"sort_by": sort_by_option,
132+
"vote_count.gte": 500,
133+
"page": page,
134+
**base_params,
135+
},
136+
)
137+
)
124138

125-
# Query 2: Top Keywords
139+
# Query 2: Top Keywords - fetch multiple pages
126140
if top_keywords:
127141
keyword_ids = "|".join([str(k[0]) for k in top_keywords])
128-
tasks.append(
129-
self._fetch_discovery(
130-
content_type,
131-
{"with_keywords": keyword_ids, "sort_by": "popularity.desc", "vote_count.gte": 500, **base_params},
142+
for page in range(1, pages_per_query + 1):
143+
tasks.append(
144+
self._fetch_discovery(
145+
content_type,
146+
{
147+
"with_keywords": keyword_ids,
148+
"sort_by": "popularity.desc",
149+
"vote_count.gte": 500,
150+
"page": page,
151+
**base_params,
152+
},
153+
)
132154
)
133-
)
134-
for page in range(1, 3):
135155
tasks.append(
136156
self._fetch_discovery(
137157
content_type,
@@ -145,55 +165,62 @@ def _build_discovery_tasks_phase1(
145165
)
146166
)
147167

148-
# Query 3: Cast & Crew
168+
# Query 3: Cast & Crew - fetch multiple pages
149169
is_tv = content_type in ("tv", "series")
150170
for actor in top_cast:
151-
p = {"sort_by": "popularity.desc", "vote_count.gte": 500, **base_params}
152-
p["with_people" if is_tv else "with_cast"] = str(actor[0])
153-
tasks.append(self._fetch_discovery(content_type, p))
171+
for page in range(1, pages_per_query + 1):
172+
p = {"sort_by": "popularity.desc", "vote_count.gte": 500, "page": page, **base_params}
173+
p["with_people" if is_tv else "with_cast"] = str(actor[0])
174+
tasks.append(self._fetch_discovery(content_type, p))
154175

155176
if top_crew:
156-
p = {"sort_by": "vote_average.desc", "vote_count.gte": 500, **base_params}
157-
p["with_people" if is_tv else "with_crew"] = str(top_crew[0][0])
158-
tasks.append(self._fetch_discovery(content_type, p))
177+
for page in range(1, pages_per_query + 1):
178+
p = {"sort_by": "vote_average.desc", "vote_count.gte": 500, "page": page, **base_params}
179+
p["with_people" if is_tv else "with_crew"] = str(top_crew[0][0])
180+
tasks.append(self._fetch_discovery(content_type, p))
159181

160-
# Query 4: Countries & Year
182+
# Query 4: Countries & Year - fetch multiple pages
161183
if top_countries:
162184
country_ids = "|".join([str(c[0]) for c in top_countries])
163-
tasks.append(
164-
self._fetch_discovery(
165-
content_type,
166-
{
167-
"with_origin_country": country_ids,
168-
"sort_by": "popularity.desc",
169-
"vote_count.gte": 100,
170-
**base_params,
171-
},
185+
for page in range(1, pages_per_query + 1):
186+
tasks.append(
187+
self._fetch_discovery(
188+
content_type,
189+
{
190+
"with_origin_country": country_ids,
191+
"sort_by": "popularity.desc",
192+
"vote_count.gte": 100,
193+
"page": page,
194+
**base_params,
195+
},
196+
)
172197
)
173-
)
174198

175199
if top_year:
176200
year = top_year[0][0]
177201
prefix = "first_air_date" if is_tv else "primary_release_date"
178-
tasks.append(
179-
self._fetch_discovery(
180-
content_type,
181-
{
182-
"sort_by": "vote_average.desc",
183-
"vote_count.gte": 500,
184-
f"{prefix}.gte": f"{year}-01-01",
185-
f"{prefix}.lte": f"{int(year)+9}-12-31",
186-
**base_params,
187-
},
202+
for page in range(1, pages_per_query + 1):
203+
tasks.append(
204+
self._fetch_discovery(
205+
content_type,
206+
{
207+
"sort_by": "vote_average.desc",
208+
"vote_count.gte": 500,
209+
f"{prefix}.gte": f"{year}-01-01",
210+
f"{prefix}.lte": f"{int(year)+9}-12-31",
211+
"page": page,
212+
**base_params,
213+
},
214+
)
188215
)
189-
)
190216
return tasks
191217

192218
def _build_discovery_tasks_phase2(
193219
self,
194220
profile: UserTasteProfile,
195221
content_type: str,
196222
excluded_genres: list[int] | None = None,
223+
pages_per_query: int = 1,
197224
**opts,
198225
) -> list[Any]:
199226
"""Construct additional discovery tasks with lower thresholds to fill out candidate pool."""
@@ -202,32 +229,41 @@ def _build_discovery_tasks_phase2(
202229
top_cast = profile.cast.get_top_features(limit=1) if opts.get("use_cast") else []
203230

204231
tasks = []
205-
base_params = {"vote_count.gte": 400, "page": 2}
232+
base_params = {"vote_count.gte": 400}
206233
if excluded_genres:
207234
base_params["without_genres"] = "|".join([str(g) for g in excluded_genres])
208235

236+
# Start from page 2 for phase 2, but fetch multiple pages if needed
237+
start_page = 2
238+
end_page = start_page + pages_per_query
239+
209240
if top_genres:
210241
genre_ids = "|".join([str(g[0]) for g in top_genres])
211-
tasks.append(
212-
self._fetch_discovery(
213-
content_type, {"with_genres": genre_ids, "sort_by": "vote_average.desc", **base_params}
242+
for page in range(start_page, end_page):
243+
tasks.append(
244+
self._fetch_discovery(
245+
content_type,
246+
{"with_genres": genre_ids, "sort_by": "vote_average.desc", "page": page, **base_params},
247+
)
214248
)
215-
)
216249

217250
if top_keywords:
218251
keyword_ids = "|".join([str(k[0]) for k in top_keywords])
219-
tasks.append(
220-
self._fetch_discovery(
221-
content_type, {"with_keywords": keyword_ids, "sort_by": "vote_average.desc", **base_params}
252+
for page in range(start_page, end_page):
253+
tasks.append(
254+
self._fetch_discovery(
255+
content_type,
256+
{"with_keywords": keyword_ids, "sort_by": "vote_average.desc", "page": page, **base_params},
257+
)
222258
)
223-
)
224259

225260
if top_cast:
226261
actor_id = top_cast[0][0]
227262
is_tv = content_type in ("tv", "series")
228-
p = {"sort_by": "vote_average.desc", **base_params}
229-
p["with_people" if is_tv else "with_cast"] = str(actor_id)
230-
tasks.append(self._fetch_discovery(content_type, p))
263+
for page in range(start_page, end_page):
264+
p = {"sort_by": "vote_average.desc", "page": page, **base_params}
265+
p["with_people" if is_tv else "with_cast"] = str(actor_id)
266+
tasks.append(self._fetch_discovery(content_type, p))
231267

232268
return tasks
233269

app/services/recommendation/engine.py

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,37 @@ def _apply_diversification(self, pool: list, targets: dict, max_results: int) ->
594594

595595
return result
596596

597+
def _filter_candidates_by_watched_and_genres(
598+
self, candidates: list[dict], watched_tmdb: set[int], whitelist: set[int], existing_ids: set[int] | None = None
599+
) -> list[dict]:
600+
"""
601+
Filter candidates by watched items and genre whitelist.
602+
603+
Args:
604+
candidates: List of candidate items to filter
605+
watched_tmdb: Set of watched TMDB IDs to exclude
606+
whitelist: Set of preferred genre IDs
607+
existing_ids: Optional set of IDs to exclude (for deduplication)
608+
609+
Returns:
610+
Filtered list of candidates
611+
"""
612+
filtered = []
613+
existing = existing_ids or set()
614+
615+
for it in candidates:
616+
item_id = it.get("id")
617+
if not item_id or item_id in existing:
618+
continue
619+
if item_id in watched_tmdb:
620+
continue
621+
if not RecommendationFiltering.passes_top_genre_whitelist(it.get("genre_ids"), whitelist):
622+
continue
623+
filtered.append(it)
624+
existing.add(item_id)
625+
626+
return filtered
627+
597628
async def get_recommendations_for_theme(self, theme_id: str, content_type: str, limit: int = 20) -> list[dict]:
598629
"""Parse theme and fetch recommendations with strict filtering."""
599630
params = {}
@@ -636,8 +667,23 @@ async def get_recommendations_for_theme(self, theme_id: str, content_type: str,
636667

637668
whitelist = await self._get_genre_whitelist(content_type)
638669
candidates = []
670+
671+
# Calculate how many pages to fetch based on excluded genres
672+
# When many genres are excluded, we need to fetch more pages to get enough results
673+
num_excluded = len(excluded_ids) if excluded_ids else 0
674+
# Movies and Series both have ~20 genres, so if more than 10 are excluded, fetch more pages
675+
if num_excluded > 10:
676+
# Fetch 10 pages when most genres are excluded
677+
pages_to_fetch = list(range(1, 11))
678+
elif num_excluded > 5:
679+
# Fetch 5 pages when many genres are excluded
680+
pages_to_fetch = list(range(1, 6))
681+
else:
682+
# Default: 3 pages
683+
pages_to_fetch = [1, 2, 3]
684+
639685
try:
640-
discover_tasks = [self.tmdb_service.get_discover(content_type, page=p, **params) for p in [1, 2, 3]]
686+
discover_tasks = [self.tmdb_service.get_discover(content_type, page=p, **params) for p in pages_to_fetch]
641687
discover_results = await asyncio.gather(*discover_tasks, return_exceptions=True)
642688
for res in discover_results:
643689
if isinstance(res, Exception):
@@ -652,13 +698,35 @@ async def get_recommendations_for_theme(self, theme_id: str, content_type: str,
652698
)
653699

654700
# Initial filter
655-
filtered = []
656-
for it in candidates:
657-
if it.get("id") in watched_tmdb:
658-
continue
659-
if not RecommendationFiltering.passes_top_genre_whitelist(it.get("genre_ids"), whitelist):
660-
continue
661-
filtered.append(it)
701+
filtered = self._filter_candidates_by_watched_and_genres(candidates, watched_tmdb, whitelist)
702+
703+
# If we still don't have enough candidates, fetch more pages
704+
max_page_fetched = max(pages_to_fetch) if pages_to_fetch else 0
705+
if len(filtered) < limit * 2 and max_page_fetched < 15:
706+
try:
707+
# Fetch additional pages starting from where we left off
708+
next_page_start = max_page_fetched + 1
709+
additional_pages = list(range(next_page_start, min(next_page_start + 5, 20)))
710+
if additional_pages:
711+
logger.info(f"Fetching additional pages {additional_pages} due to insufficient candidates")
712+
additional_tasks = [
713+
self.tmdb_service.get_discover(content_type, page=p, **params) for p in additional_pages
714+
]
715+
additional_results = await asyncio.gather(*additional_tasks, return_exceptions=True)
716+
# Collect new candidates from additional pages
717+
new_candidates = []
718+
for res in additional_results:
719+
if isinstance(res, Exception):
720+
continue
721+
new_candidates.extend(res.get("results", []))
722+
# Filter new candidates, excluding already processed ones
723+
existing_ids = {it.get("id") for it in filtered}
724+
additional_filtered = self._filter_candidates_by_watched_and_genres(
725+
new_candidates, watched_tmdb, whitelist, existing_ids
726+
)
727+
filtered.extend(additional_filtered)
728+
except Exception as e:
729+
logger.warning(f"Failed to fetch additional pages: {e}")
662730

663731
if len(filtered) < limit * 2:
664732
tmp_pool = {it["id"]: it for it in filtered}

0 commit comments

Comments (0)