Skip to content

Commit 390d69c

Browse files
fix: fetch more items if more genres are excluded
1 parent 722e6e7 commit 390d69c

File tree

2 files changed

+153
-60
lines changed

2 files changed

+153
-60
lines changed

app/services/discovery.py

Lines changed: 106 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,21 @@ async def discover_recommendations(
3434
"""
3535
Find content that matches the user's taste profile using multi-phase TMDB discovery.
3636
"""
37+
# Calculate pages to fetch per query based on excluded genres
38+
num_excluded = len(excluded_genres) if excluded_genres else 0
39+
if num_excluded > 10:
40+
pages_per_query = 5 # Fetch 5 pages when most genres are excluded
41+
elif num_excluded > 5:
42+
pages_per_query = 3 # Fetch 3 pages when many genres are excluded
43+
else:
44+
pages_per_query = 1 # Default: 1 page
45+
3746
# 1. Build Phase 1 Tasks
3847
tasks = self._build_discovery_tasks_phase1(
3948
profile,
4049
content_type,
4150
excluded_genres,
51+
pages_per_query=pages_per_query,
4252
use_genres=use_genres,
4353
use_keywords=use_keywords,
4454
use_cast=use_cast,
@@ -68,6 +78,7 @@ async def discover_recommendations(
6878
profile,
6979
content_type,
7080
excluded_genres,
81+
pages_per_query=pages_per_query,
7182
use_genres=use_genres,
7283
use_keywords=use_keywords,
7384
use_cast=use_cast,
@@ -88,6 +99,7 @@ def _build_discovery_tasks_phase1(
8899
profile: UserTasteProfile,
89100
content_type: str,
90101
excluded_genres: list[int] | None = None,
102+
pages_per_query: int = 1,
91103
**opts,
92104
) -> list[Any]:
93105
"""Construct the initial set of discovery tasks based on top profile features."""
@@ -106,32 +118,51 @@ def _build_discovery_tasks_phase1(
106118
if excluded_genres:
107119
base_params["without_genres"] = "|".join([str(g) for g in excluded_genres])
108120

109-
# Query 1: Top Genres
121+
# Query 1: Top Genres - fetch multiple pages
110122
if top_genres:
111123
genre_ids = "|".join([str(g[0]) for g in top_genres])
112-
tasks.append(
113-
self._fetch_discovery(
114-
content_type,
115-
{"with_genres": genre_ids, "sort_by": "popularity.desc", "vote_count.gte": 500, **base_params},
124+
for page in range(1, pages_per_query + 1):
125+
tasks.append(
126+
self._fetch_discovery(
127+
content_type,
128+
{
129+
"with_genres": genre_ids,
130+
"sort_by": "popularity.desc",
131+
"vote_count.gte": 500,
132+
"page": page,
133+
**base_params,
134+
},
135+
)
116136
)
117-
)
118-
tasks.append(
119-
self._fetch_discovery(
120-
content_type,
121-
{"with_genres": genre_ids, "sort_by": "vote_average.desc", "vote_count.gte": 500, **base_params},
137+
tasks.append(
138+
self._fetch_discovery(
139+
content_type,
140+
{
141+
"with_genres": genre_ids,
142+
"sort_by": "vote_average.desc",
143+
"vote_count.gte": 500,
144+
"page": page,
145+
**base_params,
146+
},
147+
)
122148
)
123-
)
124149

125-
# Query 2: Top Keywords
150+
# Query 2: Top Keywords - fetch multiple pages
126151
if top_keywords:
127152
keyword_ids = "|".join([str(k[0]) for k in top_keywords])
128-
tasks.append(
129-
self._fetch_discovery(
130-
content_type,
131-
{"with_keywords": keyword_ids, "sort_by": "popularity.desc", "vote_count.gte": 500, **base_params},
153+
for page in range(1, pages_per_query + 1):
154+
tasks.append(
155+
self._fetch_discovery(
156+
content_type,
157+
{
158+
"with_keywords": keyword_ids,
159+
"sort_by": "popularity.desc",
160+
"vote_count.gte": 500,
161+
"page": page,
162+
**base_params,
163+
},
164+
)
132165
)
133-
)
134-
for page in range(1, 3):
135166
tasks.append(
136167
self._fetch_discovery(
137168
content_type,
@@ -145,55 +176,62 @@ def _build_discovery_tasks_phase1(
145176
)
146177
)
147178

148-
# Query 3: Cast & Crew
179+
# Query 3: Cast & Crew - fetch multiple pages
149180
is_tv = content_type in ("tv", "series")
150181
for actor in top_cast:
151-
p = {"sort_by": "popularity.desc", "vote_count.gte": 500, **base_params}
152-
p["with_people" if is_tv else "with_cast"] = str(actor[0])
153-
tasks.append(self._fetch_discovery(content_type, p))
182+
for page in range(1, pages_per_query + 1):
183+
p = {"sort_by": "popularity.desc", "vote_count.gte": 500, "page": page, **base_params}
184+
p["with_people" if is_tv else "with_cast"] = str(actor[0])
185+
tasks.append(self._fetch_discovery(content_type, p))
154186

155187
if top_crew:
156-
p = {"sort_by": "vote_average.desc", "vote_count.gte": 500, **base_params}
157-
p["with_people" if is_tv else "with_crew"] = str(top_crew[0][0])
158-
tasks.append(self._fetch_discovery(content_type, p))
188+
for page in range(1, pages_per_query + 1):
189+
p = {"sort_by": "vote_average.desc", "vote_count.gte": 500, "page": page, **base_params}
190+
p["with_people" if is_tv else "with_crew"] = str(top_crew[0][0])
191+
tasks.append(self._fetch_discovery(content_type, p))
159192

160-
# Query 4: Countries & Year
193+
# Query 4: Countries & Year - fetch multiple pages
161194
if top_countries:
162195
country_ids = "|".join([str(c[0]) for c in top_countries])
163-
tasks.append(
164-
self._fetch_discovery(
165-
content_type,
166-
{
167-
"with_origin_country": country_ids,
168-
"sort_by": "popularity.desc",
169-
"vote_count.gte": 100,
170-
**base_params,
171-
},
196+
for page in range(1, pages_per_query + 1):
197+
tasks.append(
198+
self._fetch_discovery(
199+
content_type,
200+
{
201+
"with_origin_country": country_ids,
202+
"sort_by": "popularity.desc",
203+
"vote_count.gte": 100,
204+
"page": page,
205+
**base_params,
206+
},
207+
)
172208
)
173-
)
174209

175210
if top_year:
176211
year = top_year[0][0]
177212
prefix = "first_air_date" if is_tv else "primary_release_date"
178-
tasks.append(
179-
self._fetch_discovery(
180-
content_type,
181-
{
182-
"sort_by": "vote_average.desc",
183-
"vote_count.gte": 500,
184-
f"{prefix}.gte": f"{year}-01-01",
185-
f"{prefix}.lte": f"{int(year)+9}-12-31",
186-
**base_params,
187-
},
213+
for page in range(1, pages_per_query + 1):
214+
tasks.append(
215+
self._fetch_discovery(
216+
content_type,
217+
{
218+
"sort_by": "vote_average.desc",
219+
"vote_count.gte": 500,
220+
f"{prefix}.gte": f"{year}-01-01",
221+
f"{prefix}.lte": f"{int(year)+9}-12-31",
222+
"page": page,
223+
**base_params,
224+
},
225+
)
188226
)
189-
)
190227
return tasks
191228

192229
def _build_discovery_tasks_phase2(
193230
self,
194231
profile: UserTasteProfile,
195232
content_type: str,
196233
excluded_genres: list[int] | None = None,
234+
pages_per_query: int = 1,
197235
**opts,
198236
) -> list[Any]:
199237
"""Construct additional discovery tasks with lower thresholds to fill out candidate pool."""
@@ -202,32 +240,41 @@ def _build_discovery_tasks_phase2(
202240
top_cast = profile.cast.get_top_features(limit=1) if opts.get("use_cast") else []
203241

204242
tasks = []
205-
base_params = {"vote_count.gte": 400, "page": 2}
243+
base_params = {"vote_count.gte": 400}
206244
if excluded_genres:
207245
base_params["without_genres"] = "|".join([str(g) for g in excluded_genres])
208246

247+
# Start from page 2 for phase 2, but fetch multiple pages if needed
248+
start_page = 2
249+
end_page = start_page + pages_per_query
250+
209251
if top_genres:
210252
genre_ids = "|".join([str(g[0]) for g in top_genres])
211-
tasks.append(
212-
self._fetch_discovery(
213-
content_type, {"with_genres": genre_ids, "sort_by": "vote_average.desc", **base_params}
253+
for page in range(start_page, end_page):
254+
tasks.append(
255+
self._fetch_discovery(
256+
content_type,
257+
{"with_genres": genre_ids, "sort_by": "vote_average.desc", "page": page, **base_params},
258+
)
214259
)
215-
)
216260

217261
if top_keywords:
218262
keyword_ids = "|".join([str(k[0]) for k in top_keywords])
219-
tasks.append(
220-
self._fetch_discovery(
221-
content_type, {"with_keywords": keyword_ids, "sort_by": "vote_average.desc", **base_params}
263+
for page in range(start_page, end_page):
264+
tasks.append(
265+
self._fetch_discovery(
266+
content_type,
267+
{"with_keywords": keyword_ids, "sort_by": "vote_average.desc", "page": page, **base_params},
268+
)
222269
)
223-
)
224270

225271
if top_cast:
226272
actor_id = top_cast[0][0]
227273
is_tv = content_type in ("tv", "series")
228-
p = {"sort_by": "vote_average.desc", **base_params}
229-
p["with_people" if is_tv else "with_cast"] = str(actor_id)
230-
tasks.append(self._fetch_discovery(content_type, p))
274+
for page in range(start_page, end_page):
275+
p = {"sort_by": "vote_average.desc", "page": page, **base_params}
276+
p["with_people" if is_tv else "with_cast"] = str(actor_id)
277+
tasks.append(self._fetch_discovery(content_type, p))
231278

232279
return tasks
233280

app/services/recommendation/engine.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -636,8 +636,23 @@ async def get_recommendations_for_theme(self, theme_id: str, content_type: str,
636636

637637
whitelist = await self._get_genre_whitelist(content_type)
638638
candidates = []
639+
640+
# Calculate how many pages to fetch based on excluded genres
641+
# When many genres are excluded, we need to fetch more pages to get enough results
642+
num_excluded = len(excluded_ids) if excluded_ids else 0
643+
# Movies and Series both have ~20 genres, so if more than 10 are excluded, fetch more pages
644+
if num_excluded > 10:
645+
# Fetch 10 pages (1-10) when most genres are excluded
646+
pages_to_fetch = list(range(1, 11))
647+
elif num_excluded > 5:
648+
# Fetch 5 pages (1-5) when many genres are excluded
649+
pages_to_fetch = list(range(1, 6))
650+
else:
651+
# Default: 3 pages
652+
pages_to_fetch = [1, 2, 3]
653+
639654
try:
640-
discover_tasks = [self.tmdb_service.get_discover(content_type, page=p, **params) for p in [1, 2, 3]]
655+
discover_tasks = [self.tmdb_service.get_discover(content_type, page=p, **params) for p in pages_to_fetch]
641656
discover_results = await asyncio.gather(*discover_tasks, return_exceptions=True)
642657
for res in discover_results:
643658
if isinstance(res, Exception):
@@ -660,6 +675,37 @@ async def get_recommendations_for_theme(self, theme_id: str, content_type: str,
660675
continue
661676
filtered.append(it)
662677

678+
# If we still have fewer than limit * 2 candidates, fetch the next 5 pages
679+
max_page_fetched = max(pages_to_fetch) if pages_to_fetch else 0
680+
if len(filtered) < limit * 2 and max_page_fetched < 15:
681+
try:
682+
# Fetch additional pages starting from where we left off
683+
next_page_start = max_page_fetched + 1
684+
additional_pages = list(range(next_page_start, min(next_page_start + 5, 20)))
685+
if additional_pages:
686+
logger.info(f"Fetching additional pages {additional_pages} due to insufficient candidates")
687+
additional_tasks = [
688+
self.tmdb_service.get_discover(content_type, page=p, **params) for p in additional_pages
689+
]
690+
additional_results = await asyncio.gather(*additional_tasks, return_exceptions=True)
691+
# Track already processed IDs to avoid duplicates
692+
existing_ids = {it.get("id") for it in filtered}
693+
for res in additional_results:
694+
if isinstance(res, Exception):
695+
continue
696+
for it in res.get("results", []):
697+
item_id = it.get("id")
698+
if not item_id or item_id in existing_ids:
699+
continue
700+
if item_id in watched_tmdb:
701+
continue
702+
if not RecommendationFiltering.passes_top_genre_whitelist(it.get("genre_ids"), whitelist):
703+
continue
704+
filtered.append(it)
705+
existing_ids.add(item_id)
706+
except Exception as e:
707+
logger.warning(f"Failed to fetch additional pages: {e}")
708+
663709
if len(filtered) < limit * 2:
664710
tmp_pool = {it["id"]: it for it in filtered}
665711
await self._inject_freshness(tmp_pool, content_type, watched_tmdb, set(excluded_ids), whitelist, limit)

0 commit comments

Comments
 (0)