Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,18 @@ def _normalize_interests(self, interests: str) -> str | None:
normalized_chunks.append(replacement_map.get(chunk, "0000"))
return "".join(normalized_chunks)

def _is_empty_cohort_for_no_clicks(self, normalized_interests: str) -> bool:
"""Determine if the normalized interests correspond to the empty cohort for no clicks."""
def _is_empty_cohort_for_no_clicks(
self, normalized_interests: str, skip_last_interest: bool = True
) -> bool:
"""Determine if the normalized interests correspond to the empty cohort for no clicks.
The last interest is reserved in the US for time zone offset, so it doesn't count as a click
"""
if not DO_EMPTY_COHORT_FOR_NO_CLICKS:
return False
for k in range(self._num_bits // 4):
num_interests = self._num_bits // 4
if skip_last_interest:
num_interests -= 1
for k in range(num_interests):
chunk = normalized_interests[k * 4 : (k + 1) * 4]
if (
chunk != "0000" and chunk != "1000"
Expand Down
10 changes: 10 additions & 0 deletions merino/curated_recommendations/ml_backends/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@ class ModelData(BaseModel):
private_features: list | None = None


class PrivacyOverrides(BaseModel):
    """Privacy overrides that can be applied automatically for Merino-based
    experiments to reduce the risk of privacy issues.

    Every field defaults to ``None``, meaning "no override"; only explicitly
    set values are intended to replace the client's defaults.
    """

    # Whether the interest vector ("iv") may be included in telemetry
    # (presumably; inferred from the name — TODO confirm against the client).
    iv_in_telemetry: bool | None = None
    # Epsilon for randomizing the content-click probability, expressed in
    # micro units (value / 1e6) — NOTE(review): looks like differential-privacy
    # noise scale; confirm against the consumer.
    random_content_click_probability_epsilon_micro: int | None = None
    # Maximum number of click events counted per day.
    daily_click_event_cap: int | None = None
    # Whether local "popular today" re-ranking is enabled.
    local_popular_today_rerank: bool | None = None


class InferredLocalModel(BaseModel):
"""Class that defines parameters on the local Firefox client for defining an interest vector from interaction
events
Expand All @@ -70,6 +79,7 @@ class InferredLocalModel(BaseModel):
surface_id: str

model_data: ModelData
privacy_overrides: PrivacyOverrides | None = None

def get_unary_encoded_index(self, encoded_string: str, support_two: bool = False) -> list[int]:
"""Decode a unary encoded string with differential privacy added.
Expand Down
165 changes: 129 additions & 36 deletions merino/curated_recommendations/ml_backends/static_local_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""Backup local model for testing and in case of GCS failure"""

from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

from merino.curated_recommendations.corpus_backends.protocol import Topic
from merino.curated_recommendations.ml_backends.protocol import (
InferredLocalModel,
Expand All @@ -8,6 +11,7 @@
InterestVectorConfig,
ModelType,
DayTimeWeightingConfig,
PrivacyOverrides,
)
from merino.curated_recommendations.protocol import ExperimentName

Expand All @@ -16,6 +20,8 @@
INFERRED_LOCAL_EXPERIMENT_NAME_V3 = ExperimentName.INFERRED_LOCAL_EXPERIMENT_V3.value
INFERRED_LOCAL_EXPERIMENT_NAME_V4 = ExperimentName.INFERRED_LOCAL_EXPERIMENT_V4.value

TEST_INFERRED_EXPERIMENT = "test-inferred-experiment"

LOCAL_AND_SERVER_V1_MODEL_ID = "local-and-server"
LOCAL_ONLY_V1_MODEL_ID = "local-only"
SERVER_V3_MODEL_ID = "inferred-v3-model"
Expand All @@ -36,6 +42,15 @@
SUPPORTED_LIVE_MODELS = {SERVER_V3_MODEL_ID}

DEFAULT_PRODUCTION_MODEL_ID = SERVER_V3_MODEL_ID
EXPERIMENT_PRODUCTION_MODEL_ID = SERVER_V3_MODEL_ID + "_exp"

# These cause interest vector to have no randomization and should only be used
# when thresholds force a constant output
FIXED_VALUE_P = 1.0
FIXED_VALUE_Q = 0.0

# Very high threshold to ensure that the 0 index is always returned
VERY_HIGH_THRESHOLD = 1000.0

# Features corresponding to a combination of remaining topics not specified in a feature model
DEFAULT_INTERESTS_KEY = "other"
Expand Down Expand Up @@ -161,22 +176,42 @@ def get_topic(topic: str) -> InterestVectorConfig:
)


MODEL_P_VALUE_V1 = 0.806
MODEL_Q_VALUE_V1 = 0.030
# See calculation https://colab.research.google.com/drive/1GlEr2TScikP8YLKpAL1sGTawnimD1IyV#scrollTo=KawDDJnjBwIM
# Section March 2026 rollout
MODEL_P_VALUE = 0.92
MODEL_Q_VALUE = 0.0288

MODEL_P_VALUE_V3 = 0.91
MODEL_Q_VALUE_V3 = 0.030

OFF_THRESH_VALUE = 100

THRESHOLDS_V3_NORMALIZED = [0.3, 0.5, 0.8]
THRESHOLDS_V3_NORMALIZED = [0.25, 0.46, 0.8]
THRESHOLDS_V3_NON_NORMALIZED = [0.002, 0.008, 0.017]
THRESHOLDS_V3_NON_NORMALIZED_ALL_TOPICS = [0.0001, 0.002, 0.004]

SUBTOPIC_TOPIC_BLEND_RATIO = 0.15

TIME_ZONE_OFFSET_INFERRED_KEY = "timeZoneOffset"

CLICK_RANDOMIZATION_EPSILON_MICRO_FOR_EXPERIMENT = 14700000

SPECIAL_ALL_TOPIC_KEYWOWRD = "all"


class PrivacyOverridesForFivePercentExperimentUS(PrivacyOverrides):
    """Privacy overrides preset for the 5% US experiment.

    Fixes the override fields to conservative defaults (telemetry off, a
    fixed click-randomization epsilon, and a daily click-event cap) unless
    the caller explicitly supplies different values.
    """

    def __init__(self, **data) -> None:
        # Keep the interest vector out of telemetry for this experiment.
        data.setdefault("iv_in_telemetry", False)
        data.setdefault(
            "random_content_click_probability_epsilon_micro",
            CLICK_RANDOMIZATION_EPSILON_MICRO_FOR_EXPERIMENT,
        )
        data.setdefault(
            "daily_click_event_cap", 2
        )  # Cap of 2 click events per day to reduce risk of outliers
        super().__init__(**data)


# Creates a limited model based on topics. Topics features are stored with a t_
# in telemetry.
class SuperInferredModel(LocalModelBackend):
Expand All @@ -198,24 +233,63 @@ class SuperInferredModel(LocalModelBackend):
Topic.FOOD.value,
Topic.TECHNOLOGY.value,
Topic.SCIENCE.value,
SPECIAL_ALL_TOPIC_KEYWOWRD,
# Time zone is added for 8th private feature
]

# These are the only features supported in a small experiment (in addition to time zone)
v3_small_experiment_topics = {
Topic.SPORTS.value,
Topic.PARENTING.value,
Topic.SCIENCE.value,
}

limited_topics_set = set(v3_limited_topics)

@staticmethod
def _get_topic(topic: str, thresholds: list[float]) -> InterestVectorConfig:
def _get_topic(
topic: str, thresholds: list[float], disable_feature=False
) -> InterestVectorConfig:
"""Return feature for a topic, with a disabled (constant 0 output) feature
if disabled_feature is True.

Sometimes for privacy purposes we want to keep the feature in the list for
interest vector consistency issues, but hard code to 0 for a particual privacy profile,
such as within an experiment
"""
if disable_feature:
return InterestVectorConfig(
features={f"t_{topic}": 1},
thresholds=[VERY_HIGH_THRESHOLD for _ in range(len(thresholds))],
diff_p=FIXED_VALUE_P,
diff_q=FIXED_VALUE_Q,
)
if topic == SPECIAL_ALL_TOPIC_KEYWOWRD:
return InterestVectorConfig(
features={f"t_{t}": 1 for t in BASE_TOPICS},
thresholds=THRESHOLDS_V3_NON_NORMALIZED_ALL_TOPICS,
diff_p=MODEL_P_VALUE_V3,
diff_q=MODEL_Q_VALUE_V3,
diff_p=MODEL_P_VALUE,
diff_q=MODEL_Q_VALUE,
)
return InterestVectorConfig(
features={f"t_{topic}": 1},
thresholds=thresholds,
diff_p=MODEL_P_VALUE_V3,
diff_q=MODEL_Q_VALUE_V3,
diff_p=MODEL_P_VALUE,
diff_q=MODEL_Q_VALUE,
)

@staticmethod
def _get_time_zone() -> InterestVectorConfig:
    """Build the interest-vector config for the time-zone feature.

    The time-zone key has special functionality in Firefox, but we must
    specify thresholds here based on UTC offset + 24 (positive values).
    The three thresholds cover the four continental US time zones.
    """
    pacific_now = datetime.now(ZoneInfo("America/Los_Angeles"))
    utc_offset = pacific_now.utcoffset()
    if not utc_offset:
        utc_offset = timedelta(0)
    bucket = (utc_offset.total_seconds() / 3600) % 24
    # Thresholds step one hour apart, starting just above the Pacific bucket.
    zone_thresholds = [bucket + shift for shift in (0.1, 1.1, 2.1)]
    return InterestVectorConfig(
        features={},
        thresholds=zone_thresholds,
        diff_p=MODEL_P_VALUE,
        diff_q=MODEL_Q_VALUE,
    )

@staticmethod
Expand All @@ -224,43 +298,61 @@ def _get_section(section_name: str, thresholds: list[float]) -> InterestVectorCo
return InterestVectorConfig(
features=features,
thresholds=thresholds,
diff_p=MODEL_P_VALUE_V3,
diff_q=MODEL_Q_VALUE_V3, # Note since these section features are non-private features, p/q are ignored
diff_p=MODEL_P_VALUE,
diff_q=MODEL_Q_VALUE, # Note since these section features are non-private features, p/q are ignored
)

def _build_local(self, model_id, surface_id) -> InferredLocalModel | None:
model_thresholds = THRESHOLDS_V3_NON_NORMALIZED
def _build_local(
self, model_id, surface_id, small_experiment=False
) -> InferredLocalModel | None:
model_thresholds = THRESHOLDS_V3_NORMALIZED
private_features: list[str] | None = None

if model_id == SERVER_V3_MODEL_ID:
## private features are sent to merino, "private" from differentially private
private_features = self.v3_limited_topics
"""
Section features are disabled but will be returned soon when we have the ability to scale their influence
locally via the server_score parameter
_section_features = {
a: self._get_section(a, model_thresholds)
for a in BASE_SECTIONS_FOR_LOCAL_MODEL
if a not in self.limited_topics_set
section_features = {
a: self._get_section(a, model_thresholds)
for a in BASE_SECTIONS_FOR_LOCAL_MODEL
if a not in self.limited_topics_set
}

private_features = self.v3_limited_topics + [TIME_ZONE_OFFSET_INFERRED_KEY]

if small_experiment:
topic_features = {
a: self._get_topic(
a, model_thresholds, disable_feature=a not in self.v3_small_experiment_topics
)
for a in self.v3_limited_topics
}
"""
topic_features = {a: self._get_topic(a, model_thresholds) for a in self.v3_limited_topics}
else:
topic_features = {
a: self._get_topic(a, model_thresholds) for a in self.v3_limited_topics
}

model_data: ModelData = ModelData(
model_type=ModelType.CTR,
rescale=False,
rescale=True,
noise_scale=0.0,
day_time_weighting=DayTimeWeightingConfig(
days=[30],
relative_weight=[1],
),
interest_vector={**topic_features}, # **_section_features},
interest_vector={
**topic_features,
TIME_ZONE_OFFSET_INFERRED_KEY: self._get_time_zone(),
**section_features,
},
private_features=private_features,
)
# No privacy overrides until this is implemented in Merino
privacy_overrides: PrivacyOverrides | None = (
PrivacyOverridesForFivePercentExperimentUS() if small_experiment else None
)
return InferredLocalModel(
model_id=model_id,
surface_id=surface_id,
model_data=model_data,
model_version=0,
privacy_overrides=privacy_overrides,
)

def get(
Expand All @@ -285,18 +377,19 @@ def get(
## there will be another call to "get" with model_id=None
## where the next model is built+returned
return None
supported_model = self._build_local(SERVER_V3_MODEL_ID, surface_id)

if model_id is None: ## this is the "get" call for building the model sent in the response
## switch on experiment name, not using util because we have string name instead of request object
if (
experiment_name == INFERRED_LOCAL_EXPERIMENT_NAME_V4
experiment_name is None # Default
or experiment_name == INFERRED_LOCAL_EXPERIMENT_NAME_V4
or experiment_name == f"optin-{INFERRED_LOCAL_EXPERIMENT_NAME_V4}"
or experiment_name == INFERRED_LOCAL_EXPERIMENT_NAME_V3
or experiment_name == f"optin-{INFERRED_LOCAL_EXPERIMENT_NAME_V3}"
):
# We don't have to check for branch here as control won't call inferred code
return supported_model
return self._build_local(SERVER_V3_MODEL_ID, surface_id)
else:
return supported_model # this is the default model
return self._build_local(
EXPERIMENT_PRODUCTION_MODEL_ID, surface_id, small_experiment=True
)
# Normally we would pick the model based on model_id here, but we are supporting only one right now
return supported_model
return self._build_local(SERVER_V3_MODEL_ID, surface_id)
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
# This was a 50% experiment but overall users have declined over time
INFERRED_EXPERIMENT_PERCENTAGE = 0.25

LOCAL_RERANK_WEGHT = (
30.0 # Gives items a slight boost. Ave ctr 0.002, and this number is multipled, then
)

FIXED_ITEM_TARGET_ARTICLE_IMPRESSIONS = 12000

EST_DAILY_IMPRESSIONS_TOP_STORY_TILE = (
Expand Down Expand Up @@ -72,6 +76,7 @@ def __init__(self, **data: Any):
data.setdefault("fresh_items_top_stories_max_percentage", 0.15)
data.setdefault("fresh_items_section_ranking_max_percentage", 0.15)
data.setdefault("fresh_items_limit_prior_threshold_multiplier", 1)
data.setdefault("local_rerank_scalar", LOCAL_RERANK_WEGHT)
super().__init__(**data)

@classmethod
Expand Down
2 changes: 2 additions & 0 deletions merino/curated_recommendations/prior_backends/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class EngagementRescaler(BaseModel):
float # Max number of fresh items to use when considering section rank
) = 0

local_rerank_scalar: float = 0

fresh_items_top_stories_fixed_position: (
int | None # Fixed position to host bulk of fresh stories
) = None
Expand Down
1 change: 1 addition & 0 deletions merino/curated_recommendations/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ class CuratedRecommendation(CorpusItem):
ranking_data: Annotated[RankingData | None, Field(exclude=True)] = None
tileId: Annotated[int | None, Field(strict=True, ge=MIN_TILE_ID, le=MAX_TILE_ID)] = None
receivedRank: int
serverScore: float | None = None
features: dict[str, float] = Field(
default_factory=dict,
description="Maps feature names to weights, which the client "
Expand Down
8 changes: 1 addition & 7 deletions merino/curated_recommendations/rankers/contextual_ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,16 @@
# contexual is rolled out we can use a dynamic computed value based on daily impressions.

CONTEXUAL_AVG_BETA_VALUE = 4000
CONTEXTAL_LIMIT_PERCENTAGE_ADJUSTMENT = (
0.5 # Underscored items tend to scale higher, leading to too much fresh content
)

logger = logging.getLogger(__name__)

# These topics are in the current interest vector but not being used to determine the
# cohort selection.
CONTEXUAL_INFERRED_PER_TOPIC_WEIGHTING = {
Topic.TECHNOLOGY: 1.0,
Topic.POLITICS: 0.3,
Topic.ARTS: 0.3,
}

CONTEXUAL_INFERRED_SINGLE_TOPIC_BOOST_WEIGHT = 0.0003
CONTEXUAL_INFERRED_SINGLE_TOPIC_BOOST_WEIGHT = 0.0001
CONTEXUAL_INFERRED_SINGLE_TOPIC_BOOST_OFFSET = 0.2


Expand Down Expand Up @@ -171,7 +166,6 @@ def sample_score(sec: Section) -> float:
"""Create score based on top items in section"""
fresh_retain_likelyhood = (
rescaler.fresh_items_section_ranking_max_percentage
* CONTEXTAL_LIMIT_PERCENTAGE_ADJUSTMENT
if rescaler is not None
else 0.0
)
Expand Down
Loading
Loading