Skip to content

Commit 1418e3a

Browse files
mlplyler and rolf-moz
authored
[GENAI-234] Remove crawled references (#1174)
* removed crawl from sections filter * merged * cleaned * ok * some tests cleaned up * more crawl test cleanup * lint * integration test shenanigans * skip _crawl sections * Add backwards compatibility for _crawl issue --------- Co-authored-by: Rolf Rando <[email protected]>
1 parent c1d087d commit 1418e3a

File tree

13 files changed

+281
-768
lines changed

13 files changed

+281
-768
lines changed

merino/curated_recommendations/corpus_backends/sections_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ async def fetch(self, surface_id: SurfaceId) -> list[CorpusSection]:
126126
utm_source = get_utm_source(surface_id)
127127
sections_list = []
128128
for section in data["data"]["getSections"]:
129-
if not section.get("active"):
129+
if not section.get("active") or section.get("externalId", "").endswith("_crawl"):
130130
logger.info(f"Skipping inactive section {section['externalId']} for {surface_id}")
131131
continue
132132

merino/curated_recommendations/ml_backends/static_local_model.py

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
"travel",
5656
]
5757

58+
BASE_TOPICS_SET = set(BASE_TOPICS)
5859

5960
BASE_SECTIONS_FOR_LOCAL_MODEL = [
6061
"nfl",
@@ -66,22 +67,22 @@
6667
"movies",
6768
"music",
6869
"books",
69-
"business_crawl",
70-
"career_crawl",
71-
"arts_crawl",
72-
"food_crawl",
73-
"health_crawl",
74-
"home_crawl",
75-
"finance_crawl",
76-
"government_crawl",
77-
"sports_crawl",
78-
"tech_crawl",
79-
"travel_crawl",
80-
"education_crawl",
81-
"hobbies_crawl",
82-
"society-parenting_crawl",
83-
"education-science_crawl",
84-
"society_crawl",
70+
"business",
71+
"career",
72+
"arts",
73+
"food",
74+
"health",
75+
"home",
76+
"finance",
77+
"government",
78+
"sports",
79+
"tech",
80+
"travel",
81+
"education",
82+
"hobbies",
83+
"society-parenting",
84+
"education-science",
85+
"society",
8586
]
8687

8788

@@ -147,8 +148,6 @@ def get_topic(topic: str) -> InterestVectorConfig:
147148
THRESHOLDS_V1_A = [0.008, 0.016, 0.024]
148149
THRESHOLDS_V1_B = [0.005, 0.010, 0.015]
149150

150-
CRAWL_SUFFIX = "_crawl"
151-
152151

153152
# Creates a limited model based on topics. Topics features are stored with a t_
154153
# in telemetry.
@@ -172,10 +171,6 @@ class SuperInferredModel(LocalModelBackend):
172171

173172
default_model_id = DEFAULT_PRODUCTION_MODEL_ID
174173

175-
@staticmethod
176-
def _clean_section(section_name: str):
177-
return section_name.replace(CRAWL_SUFFIX, "")
178-
179174
@staticmethod
180175
def _get_topic(topic: str, thresholds: list[float]) -> InterestVectorConfig:
181176
return InterestVectorConfig(
@@ -187,8 +182,13 @@ def _get_topic(topic: str, thresholds: list[float]) -> InterestVectorConfig:
187182

188183
@staticmethod
189184
def _get_section(section_name: str, thresholds: list[float]) -> InterestVectorConfig:
185+
features = (
186+
{f"s_{section_name}": 1, f"s_{section_name}_crawl": 1}
187+
if section_name in BASE_TOPICS_SET
188+
else {f"s_{section_name}": 1}
189+
)
190190
return InterestVectorConfig(
191-
features={f"s_{section_name}": 1},
191+
features=features,
192192
thresholds=thresholds,
193193
diff_p=MODEL_P_VALUE_V1,
194194
diff_q=MODEL_Q_VALUE_V1,
@@ -245,8 +245,7 @@ def _build_local(self, model_id, surface_id) -> InferredLocalModel | None:
245245
else:
246246
return None
247247
category_fields = {
248-
self._clean_section(a): self._get_section(a, model_thresholds)
249-
for a in BASE_SECTIONS_FOR_LOCAL_MODEL
248+
a: self._get_section(a, model_thresholds) for a in BASE_SECTIONS_FOR_LOCAL_MODEL
250249
} ## all sections
251250
model_data: ModelData = ModelData(
252251
model_type=ModelType.CTR,

merino/curated_recommendations/prior_backends/experiment_rescaler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
PESSIMISTIC_PRIOR_ALPHA_SCALE_SUBTOPIC = 0.35
1616

1717

18-
class DefaultCrawlerRescaler(ExperimentRescaler):
18+
class DefaultRescaler(ExperimentRescaler):
1919
"""Scales based on overall percentage"""
2020

2121
def __init__(self, **data: Any):

merino/curated_recommendations/protocol.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,6 @@ class ExperimentName(str, Enum):
9191
RSS_VS_ZYTE_EXPERIMENT = "new-ranking-for-legacy-topics-in-new-tab-v1"
9292
# Experiment to display Daily Briefing section as the first section on New Tab
9393
DAILY_BRIEFING_EXPERIMENT = "daily-briefing-v1"
94-
# Experiment slug for crawling with identical behavior/branches as RSS_VS_ZYTE_EXPERIMENT
95-
NEW_TAB_CRAWLING_V2 = "new-tab-crawling-v2"
9694
# The following are 6 experiments to apply 1 row layout for Popular Today for contextual ads
9795
CONTEXTUAL_AD_NIGHTLY_EXPERIMENT = "new-tab-ad-updates-nightly"
9896
CONTEXTUAL_AD_V2_NIGHTLY_EXPERIMENT = "new-tab-contextual-ad-updates-v2-nightly"
@@ -106,15 +104,6 @@ class ExperimentName(str, Enum):
106104
INFERRED_LOCAL_EXPERIMENT_V2 = "new-tab-automated-personalization-local-ranking-2"
107105

108106

109-
@unique
110-
class CrawlExperimentBranchName(str, Enum):
111-
"""Branch names for the RSS vs. Zyte (crawl) experiment."""
112-
113-
CONTROL = "control"
114-
TREATMENT_CRAWL = "treatment-crawl"
115-
TREATMENT_CRAWL_PLUS_SUBTOPICS = "treatment-crawl-subtopics"
116-
117-
118107
# Maximum tileId that Firefox can support. Firefox uses Javascript to store this value. The max
119108
# value of a Javascript number can be found using `Number.MAX_SAFE_INTEGER`. which is 2^53 - 1
120109
# because it uses a 64-bit IEEE 754 float.

merino/curated_recommendations/sections.py

Lines changed: 9 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from merino.curated_recommendations.prior_backends.experiment_rescaler import (
2424
SchedulerHoldbackRescaler,
2525
SUBTOPIC_EXPERIMENT_CURATED_ITEM_FLAG,
26-
DefaultCrawlerRescaler,
2726
)
2827
from merino.curated_recommendations.prior_backends.protocol import PriorBackend, ExperimentRescaler
2928
from merino.curated_recommendations.protocol import (
@@ -33,7 +32,6 @@
3332
SectionConfiguration,
3433
ExperimentName,
3534
ProcessedInterests,
36-
CrawlExperimentBranchName,
3735
Layout,
3836
)
3937
from merino.curated_recommendations.rankers import (
@@ -56,7 +54,6 @@
5654
DOUBLE_ROW_TOP_STORIES_COUNT = 9
5755
TOP_STORIES_SECTION_EXTRA_COUNT = 5 # Extra top stories pulled from later sections
5856
HEADLINES_SECTION_KEY = "headlines_section"
59-
HEADLINES_CRAWL_SECTION_KEY = "headlines_crawl"
6057

6158

6259
def map_section_item_to_recommendation(
@@ -170,7 +167,6 @@ async def get_corpus_sections(
170167
sections_backend: SectionsProtocol,
171168
surface_id: SurfaceId,
172169
min_feed_rank: int,
173-
crawl_branch: str | None = None,
174170
include_subtopics: bool = False,
175171
scheduled_surface_backend: ScheduledSurfaceProtocol | None = None,
176172
is_custom_sections_experiment: bool = False,
@@ -181,7 +177,6 @@ async def get_corpus_sections(
181177
sections_backend: Backend interface to fetch corpus sections.
182178
surface_id: Identifier for which surface to fetch sections.
183179
min_feed_rank: Starting rank offset for assigning receivedFeedRank.
184-
crawl_branch: The crawl experiment branch name or None.
185180
include_subtopics: Whether to include subtopic sections.
186181
scheduled_surface_backend: Backend interface to fetch scheduled corpus items (temporary)
187182
is_custom_sections_experiment: Whether custom sections experiment is enabled.
@@ -210,7 +205,6 @@ async def get_corpus_sections(
210205
# Apply RSS vs. Zyte experiment filtering and custom sections filtering
211206
filtered_corpus_sections = filter_sections_by_experiment(
212207
remaining_raw_corpus_sections,
213-
crawl_branch,
214208
include_subtopics,
215209
is_custom_sections_experiment,
216210
)
@@ -228,11 +222,11 @@ async def get_corpus_sections(
228222
def split_headlines_section(
229223
corpus_sections: list[CorpusSection],
230224
) -> tuple[CorpusSection | None, list[CorpusSection]]:
231-
"""Return the headlines_crawl section separately from everything else."""
225+
"""Return the headlines section separately from everything else."""
232226
headlines_section: CorpusSection | None = None
233227
remaining_sections: list[CorpusSection] = []
234228
for cs in corpus_sections:
235-
if cs.externalId == HEADLINES_CRAWL_SECTION_KEY:
229+
if cs.externalId == HEADLINES_SECTION_KEY:
236230
headlines_section = cs
237231
else:
238232
remaining_sections.append(cs)
@@ -302,7 +296,6 @@ def is_subtopics_experiment(request: CuratedRecommendationsRequest) -> bool:
302296
303297
Include subtopics if:
304298
- ML sections experiment is enabled (treatment branch), OR
305-
- Crawl experiment is in the TREATMENT_CRAWL_PLUS_SUBTOPICS branch
306299
"""
307300
in_holdback = is_scheduler_holdback_experiment(request)
308301
# Subtopics only in the US
@@ -323,30 +316,14 @@ def is_custom_sections_experiment(request: CuratedRecommendationsRequest) -> boo
323316
)
324317

325318

326-
def get_crawl_experiment_branch(request: CuratedRecommendationsRequest) -> str | None:
327-
"""Return the branch name for the RSS vs. Zyte experiment
328-
329-
Branches:
330-
- control: Non-crawl legacy topics only
331-
- treatment-crawl: Crawl legacy topics only
332-
- treatment-crawl-subtopics: Crawl legacy topics + non-crawl subtopics
333-
334-
"""
335-
if is_scheduler_holdback_experiment(request) or request.region != "US":
336-
return CrawlExperimentBranchName.CONTROL.value
337-
338-
return CrawlExperimentBranchName.TREATMENT_CRAWL_PLUS_SUBTOPICS.value
339-
340-
341319
def get_ranking_rescaler_for_branch(
342320
request: CuratedRecommendationsRequest,
343321
) -> ExperimentRescaler | None:
344322
"""Get the correct interactions and prior rescaler for the current experiment"""
345-
if request.region != "US":
323+
if request.region != "US" or not is_scheduler_holdback_experiment(request):
346324
return None
347-
if is_scheduler_holdback_experiment(request):
325+
else:
348326
return SchedulerHoldbackRescaler()
349-
return DefaultCrawlerRescaler()
350327

351328

352329
def update_received_feed_rank(sections: dict[str, Section]):
@@ -369,43 +346,29 @@ def get_corpus_sections_for_legacy_topic(
369346
return {sid: section for sid, section in corpus_sections.items() if sid in legacy_topics}
370347

371348

372-
def is_crawl_section_id(section_id: str) -> bool:
373-
"""Check if a section ID represents a crawl section.
374-
375-
Args:
376-
section_id: The section external ID to check
377-
378-
Returns:
379-
True if the section ID ends with '_crawl', False otherwise
380-
"""
381-
return section_id.endswith("_crawl")
382-
383-
384349
def filter_sections_by_experiment(
385350
corpus_sections: list[CorpusSection],
386-
crawl_branch: str | None,
387351
include_subtopics: bool = False,
388352
is_custom_sections_experiment: bool = False,
389353
) -> dict[str, CorpusSection]:
390354
"""Filter sections based on RSS vs. Zyte experiment branch and custom sections experiment.
391355
392356
Args:
393357
corpus_sections: List of CorpusSection objects
394-
crawl_branch: The experiment branch name or None
395358
include_subtopics: Whether to include subtopic sections
396359
is_custom_sections_experiment: Whether custom sections experiment is enabled
397360
398361
Returns:
399-
Filtered sections with _crawl suffix removed from keys for crawl sections
362+
Filtered sections
400363
"""
401364
legacy_topics = get_legacy_topic_ids()
402365
result = {}
403366

404367
for section in corpus_sections:
405368
section_id = section.externalId
406-
is_crawl_section = is_crawl_section_id(section_id)
407-
base_id = section_id.replace("_crawl", "") if is_crawl_section else section_id
369+
base_id = section_id
408370
is_legacy = base_id in legacy_topics
371+
# is_legacy = base_id in legacy_topics
409372
is_manual_section = section.createSource == CreateSource.MANUAL
410373

411374
# Custom sections experiment: only include MANUAL sections in treatment, exclude them in control
@@ -418,28 +381,8 @@ def filter_sections_by_experiment(
418381
# Control/default: exclude MANUAL sections
419382
if is_manual_section:
420383
continue
421-
422-
# Determine if we should include this section based on the branch
423-
if crawl_branch in [
424-
CrawlExperimentBranchName.TREATMENT_CRAWL.value,
425-
CrawlExperimentBranchName.TREATMENT_CRAWL_PLUS_SUBTOPICS.value,
426-
]:
427-
# Treatment branches: use _crawl for legacy, regular for subtopics
428-
if is_legacy and is_crawl_section:
429-
result[base_id] = section
430-
elif (
431-
not is_legacy
432-
and not is_crawl_section
433-
and crawl_branch == CrawlExperimentBranchName.TREATMENT_CRAWL_PLUS_SUBTOPICS.value
434-
):
435-
# Include non-crawl subtopics only in crawl-plus-subtopics branch
436-
result[base_id] = section
437-
else:
438-
# Control branch or no experiment: use non-_crawl sections
439-
if not is_crawl_section:
440-
# Include based on whether subtopics are enabled
441-
if is_legacy or include_subtopics:
442-
result[base_id] = section
384+
if is_legacy or include_subtopics:
385+
result[base_id] = section
443386

444387
return result
445388

@@ -638,9 +581,6 @@ async def get_sections(
638581
Returns:
639582
A dict mapping section IDs to fully-configured Section models.
640583
"""
641-
# 1. Get corpus sections with RSS vs. Zyte experiment filtering
642-
crawl_branch = get_crawl_experiment_branch(request)
643-
644584
# Determine if we should include subtopics based on experiments
645585
include_subtopics = is_subtopics_experiment(request)
646586

@@ -653,7 +593,6 @@ async def get_sections(
653593
sections_backend=sections_backend,
654594
surface_id=surface_id,
655595
min_feed_rank=1,
656-
crawl_branch=crawl_branch,
657596
include_subtopics=include_subtopics,
658597
scheduled_surface_backend=scheduled_surface_backend,
659598
is_custom_sections_experiment=custom_sections_enabled,

0 commit comments

Comments
 (0)