Skip to content

Commit 31448f3

Browse files
committed
DISCO-3202 - Enrich the curated-recommendations endpoint with icon-urls
1 parent 2c2e9b1 commit 31448f3

File tree

16 files changed

+392
-69
lines changed

16 files changed

+392
-69
lines changed

merino/curated_recommendations/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
from merino.utils.http_client import create_http_client
3232
from merino.utils.synced_gcs_blob import SyncedGcsBlob
3333

34+
from merino.providers.manifest import get_provider as get_manifest_provider
35+
3436
logger = logging.getLogger(__name__)
3537

3638
_provider: CuratedRecommendationsProvider
@@ -121,6 +123,7 @@ def init_provider() -> None:
121123
http_client=create_http_client(base_url=""),
122124
graph_config=CorpusApiGraphConfig(),
123125
metrics_client=get_metrics_client(),
126+
manifest_provider=get_manifest_provider(),
124127
)
125128

126129
extended_expiration_corpus_backend = ExtendedExpirationCorpusBackend(

merino/curated_recommendations/corpus_backends/corpus_api_backend.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
ScheduledSurfaceId,
2727
)
2828
from merino.exceptions import BackendError
29+
from merino.providers.manifest import Provider as ManifestProvider
2930
from merino.utils.version import fetch_app_version_from_file
3031

3132
logger = logging.getLogger(__name__)
@@ -123,6 +124,7 @@ class CorpusApiBackend(CorpusBackend):
123124
http_client: AsyncClient
124125
graph_config: CorpusApiGraphConfig
125126
metrics_client: aiodogstatsd.Client
127+
manifest_provider: ManifestProvider
126128

127129
# time-to-live was chosen because 1 minute (+/- 10 s) is short enough that updates by curators
128130
# such as breaking news or editorial corrections propagate fast enough, and that the request
@@ -137,10 +139,12 @@ def __init__(
137139
http_client: AsyncClient,
138140
graph_config: CorpusApiGraphConfig,
139141
metrics_client: aiodogstatsd.Client,
142+
manifest_provider: ManifestProvider,
140143
):
141144
self.http_client = http_client
142145
self.graph_config = graph_config
143146
self.metrics_client = metrics_client
147+
self.manifest_provider = manifest_provider
144148
self._cache = {}
145149
self._background_tasks = set()
146150

@@ -257,7 +261,9 @@ async def fetch(
257261
before_sleep=before_sleep_log(logger, logging.WARNING),
258262
)
259263
async def _fetch_from_backend(
260-
self, surface_id: ScheduledSurfaceId, days_offset: int
264+
self,
265+
surface_id: ScheduledSurfaceId,
266+
days_offset: int,
261267
) -> list[CorpusItem]:
262268
"""Issue a scheduledSurface query"""
263269
query = """
@@ -321,6 +327,9 @@ async def _fetch_from_backend(
321327
item["corpusItem"]["url"] = self.update_url_utm_source(
322328
item["corpusItem"]["url"], str(utm_source)
323329
)
330+
# Add icon URL if available
331+
if icon_url := self.manifest_provider.get_icon_url(item["corpusItem"]["url"]):
332+
item["corpusItem"]["iconUrl"] = icon_url
324333

325334
curated_recommendations = [
326335
CorpusItem(

merino/curated_recommendations/corpus_backends/protocol.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class CorpusItem(BaseModel):
6161
publisher: str
6262
isTimeSensitive: bool
6363
imageUrl: HttpUrl
64+
iconUrl: HttpUrl | None = None
6465

6566

6667
class CorpusBackend(Protocol):

merino/providers/manifest/provider.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import asyncio
44
import time
55
import logging
6+
from urllib.parse import urlparse
7+
8+
from pydantic import HttpUrl
69

710
from merino.providers.manifest.backends.filemanager import GetManifestResultCode
811
from merino.utils import cron
@@ -20,6 +23,7 @@ class Provider:
2023
"""Provide access to in-memory manifest data fetched from GCS."""
2124

2225
manifest_data: ManifestData | None
26+
domain_lookup_table: dict[str, int]
2327
cron_task: asyncio.Task
2428
resync_interval_sec: int
2529
cron_interval_sec: int
@@ -39,6 +43,7 @@ def __init__(
3943
self.cron_interval_sec = cron_interval_sec
4044
self.last_fetch_at = 0.0
4145
self.manifest_data = ManifestData(domains=[])
46+
self.domain_lookup_table = {}
4247
self.data_fetched_event = asyncio.Event()
4348

4449
super().__init__()
@@ -63,8 +68,11 @@ async def _fetch_data(self) -> None:
6368
result_code, data = await self.backend.fetch()
6469

6570
match GetManifestResultCode(result_code):
66-
case GetManifestResultCode.SUCCESS:
71+
case GetManifestResultCode.SUCCESS if data is not None:
6772
self.manifest_data = data
73+
self.domain_lookup_table = {
74+
domain.domain: idx for idx, domain in enumerate(data.domains)
75+
}
6876
self.last_fetch_at = time.time()
6977

7078
case GetManifestResultCode.SKIP:
@@ -86,3 +94,27 @@ def _should_fetch(self) -> bool:
8694
def get_manifest_data(self) -> ManifestData | None:
8795
"""Return manifest data"""
8896
return self.manifest_data
97+
98+
def get_icon_url(self, url: str | HttpUrl) -> str | None:
99+
"""Get icon URL for a domain.
100+
101+
Args:
102+
url: Full URL to extract domain from (string or HttpUrl)
103+
104+
Returns:
105+
Icon URL if found, None otherwise
106+
"""
107+
try:
108+
url_str = str(url)
109+
# Remove www. and get the domain
110+
domain = urlparse(url_str).netloc.replace("www.", "")
111+
# Strip TLD by taking first part of domain
112+
base_domain = domain.split(".")[0] if "." in domain else domain
113+
114+
idx = self.domain_lookup_table.get(base_domain, -1)
115+
if idx >= 0 and self.manifest_data is not None:
116+
return self.manifest_data.domains[idx].icon
117+
118+
except Exception as e:
119+
logger.warning(f"Error getting icon for URL {url}: {e}")
120+
return None

tests/integration/api/conftest.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,16 @@
66

77
from logging import LogRecord
88
from typing import Iterator, Generator
9+
from unittest.mock import AsyncMock
910

1011
import pytest
1112
import orjson
1213
from starlette.testclient import TestClient
1314
from aiodogstatsd import Client as AioDogstatsdClient
15+
16+
from merino.providers.manifest import Provider
17+
from merino.providers.manifest.backends.manifest import ManifestBackend
18+
from merino.providers.manifest.backends.protocol import GetManifestResultCode, ManifestData
1419
from merino.utils.gcs.gcs_uploader import GcsUploader
1520
from contextlib import nullcontext
1621
from merino.curated_recommendations.fakespot_backend.protocol import (
@@ -134,3 +139,42 @@ def fakespot_feed() -> FakespotFeed:
134139
footerCopy=FAKESPOT_FOOTER_COPY,
135140
cta=FakespotCTA(ctaCopy=FAKESPOT_CTA_COPY, url=FAKESPOT_CTA_URL),
136141
)
142+
143+
144+
@pytest.fixture
145+
def mock_manifest():
146+
"""Mock manifest data with a known icon URL."""
147+
return {
148+
"domains": [
149+
{
150+
"rank": 1,
151+
"domain": "spotify",
152+
"categories": ["Entertainment"],
153+
"serp_categories": [0],
154+
"url": "https://www.spotify.com",
155+
"title": "Spotify",
156+
"icon": "https://test.com/spotify-favicon.ico",
157+
}
158+
]
159+
}
160+
161+
162+
@pytest.fixture
163+
def mock_manifest_backend(mock_manifest):
164+
"""Mock ManifestBackend that returns our test data."""
165+
backend = ManifestBackend()
166+
backend.fetch = AsyncMock(
167+
return_value=(GetManifestResultCode.SUCCESS, ManifestData(**mock_manifest))
168+
)
169+
return backend
170+
171+
172+
@pytest.fixture
173+
def manifest_provider(mock_manifest_backend):
174+
"""Override the manifest provider fixture with our mocked data."""
175+
provider = Provider(
176+
backend=mock_manifest_backend,
177+
resync_interval_sec=86400,
178+
cron_interval_sec=3600,
179+
)
180+
return provider

tests/integration/api/v1/curated_recommendations/corpus_backends/fixtures.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,12 @@ def corpus_http_client(fixture_response_data, fixture_request_data) -> AsyncMock
6565

6666

6767
@pytest.fixture()
68-
def corpus_backend(corpus_http_client: AsyncMock) -> CorpusApiBackend:
68+
def corpus_backend(corpus_http_client: AsyncMock, manifest_provider) -> CorpusApiBackend:
6969
"""Mock corpus api backend."""
7070
# Initialize the backend with the mock HTTP client
7171
return CorpusApiBackend(
7272
http_client=corpus_http_client,
7373
graph_config=CorpusApiGraphConfig(),
7474
metrics_client=get_metrics_client(),
75+
manifest_provider=manifest_provider,
7576
)

tests/integration/api/v1/curated_recommendations/corpus_backends/test_corpus_api_backend.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ async def test_fetch(corpus_backend: CorpusApiBackend, fixture_response_data):
4242
),
4343
scheduledCorpusItemId="de614b6b-6df6-470a-97f2-30344c56c1b3",
4444
corpusItemId="4095b364-02ff-402c-b58a-792a067fccf2",
45+
iconUrl=None,
4546
)
4647

4748

tests/integration/api/v1/curated_recommendations/test_curated_recommendations.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
get_provider,
2222
ConstantPrior,
2323
ExtendedExpirationCorpusBackend,
24+
CorpusApiGraphConfig,
2425
)
2526
from merino.curated_recommendations.corpus_backends.protocol import Topic, ScheduledSurfaceId
2627
from merino.curated_recommendations.engagement_backends.protocol import (
@@ -46,6 +47,9 @@
4647
)
4748
from merino.curated_recommendations.protocol import CuratedRecommendation
4849
from merino.main import app
50+
from merino.providers.manifest import get_provider as get_manifest_provider
51+
from merino.providers.manifest.backends.protocol import Domain
52+
from merino.utils.metrics import get_metrics_client
4953
from tests.integration.api.conftest import fakespot_feed
5054

5155

@@ -121,6 +125,16 @@ def extended_expiration_corpus_backend(
121125
)
122126

123127

128+
@pytest.fixture(autouse=True)
129+
def setup_manifest_provider(manifest_provider):
130+
"""Set up the manifest provider dependency"""
131+
app.dependency_overrides[get_manifest_provider] = lambda: manifest_provider
132+
yield
133+
# Clean up after test
134+
if get_manifest_provider in app.dependency_overrides:
135+
del app.dependency_overrides[get_manifest_provider]
136+
137+
124138
@pytest.fixture(name="corpus_provider")
125139
def provider(
126140
corpus_backend: CorpusApiBackend,
@@ -1529,3 +1543,94 @@ def mock_post_by_days_ago(*args, **kwargs):
15291543
else:
15301544
# Check that only today's items are returned if in control or not in the experiment.
15311545
assert set(days_ago_counter.keys()) == {0}
1546+
1547+
1548+
@pytest.mark.asyncio
1549+
async def test_curated_recommendations_enriched_with_icons(
1550+
manifest_provider,
1551+
corpus_http_client,
1552+
fixture_request_data,
1553+
):
1554+
"""Test the enrichment of a curated recommendation with an added icon-url."""
1555+
# Set up the manifest data first
1556+
manifest_provider.manifest_data.domains = [
1557+
Domain(
1558+
rank=2,
1559+
title="Microsoft – AI, Cloud, Productivity, Computing, Gaming & Apps",
1560+
url="https://www.microsoft.com",
1561+
domain="microsoft",
1562+
icon="https://merino-images.services.mozilla.com/favicons/microsoft-icon.png",
1563+
categories=["Business", "Information Technology"],
1564+
serp_categories=[0],
1565+
)
1566+
]
1567+
manifest_provider.domain_lookup_table = {"microsoft": 0}
1568+
1569+
backend = CorpusApiBackend(
1570+
http_client=corpus_http_client,
1571+
graph_config=CorpusApiGraphConfig(),
1572+
metrics_client=get_metrics_client(),
1573+
manifest_provider=manifest_provider,
1574+
)
1575+
1576+
provider_instance = CuratedRecommendationsProvider(
1577+
corpus_backend=backend,
1578+
extended_expiration_corpus_backend=ExtendedExpirationCorpusBackend(
1579+
backend=backend, engagement_backend=MockEngagementBackend()
1580+
),
1581+
engagement_backend=MockEngagementBackend(),
1582+
prior_backend=ConstantPrior(),
1583+
fakespot_backend=MockFakespotBackend(),
1584+
)
1585+
1586+
app.dependency_overrides[get_provider] = lambda: provider_instance
1587+
1588+
mocked_response = {
1589+
"data": {
1590+
"scheduledSurface": {
1591+
"items": [
1592+
{
1593+
"id": "scheduledSurfaceItemId-ABC",
1594+
"corpusItem": {
1595+
"id": "corpusItemId-XYZ",
1596+
"url": "https://www.microsoft.com/some-article?utm_source=firefox-newtab-en-us",
1597+
"title": "Some MS Article",
1598+
"excerpt": "All about Microsoft something",
1599+
"topic": "tech",
1600+
"publisher": "ExamplePublisher",
1601+
"isTimeSensitive": False,
1602+
"imageUrl": "https://somewhere.com/test.jpg",
1603+
},
1604+
}
1605+
]
1606+
}
1607+
}
1608+
}
1609+
corpus_http_client.post.return_value = Response(
1610+
status_code=200,
1611+
json=mocked_response,
1612+
request=fixture_request_data,
1613+
)
1614+
1615+
async with AsyncClient(app=app, base_url="http://test") as ac:
1616+
response = await ac.post(
1617+
"/api/v1/curated-recommendations",
1618+
json={"locale": "en-US"},
1619+
)
1620+
assert response.status_code == 200
1621+
1622+
data = response.json()
1623+
items = data["data"]
1624+
assert len(items) == 1
1625+
1626+
item = items[0]
1627+
assert item["url"] == "https://www.microsoft.com/some-article?utm_source=firefox-newtab-en-us"
1628+
1629+
assert "iconUrl" in item
1630+
assert (
1631+
item["iconUrl"] == "https://merino-images.services.mozilla.com/favicons/microsoft-icon.png"
1632+
)
1633+
1634+
# Clean up
1635+
if get_provider in app.dependency_overrides:
1636+
del app.dependency_overrides[get_provider]

tests/integration/api/v1/manifest/test_manifest.py

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,6 @@
1111
from merino.web.api_v1 import router
1212

1313

14-
@pytest.fixture(scope="module")
15-
def mock_manifest_2025() -> dict:
16-
"""Mock manifest json for the year 2025"""
17-
return {
18-
"domains": [
19-
{
20-
"rank": 1,
21-
"domain": "spotify",
22-
"categories": ["Entertainment"],
23-
"serp_categories": [0],
24-
"url": "https://www.spotify.com",
25-
"title": "Spotify",
26-
"icon": "",
27-
}
28-
]
29-
}
30-
31-
3214
@pytest.fixture(autouse=True, name="cleanup")
3315
def cleanup_tasks_fixture():
3416
"""Return a method that cleans up existing cron tasks after initialization"""
@@ -46,9 +28,7 @@ async def cleanup_tasks(provider: Provider):
4628

4729

4830
@pytest.mark.asyncio
49-
async def test_get_manifest_success(
50-
client_with_metrics, gcp_uploader, mock_manifest_2025, cleanup
51-
):
31+
async def test_get_manifest_success(client_with_metrics, gcp_uploader, mock_manifest, cleanup):
5232
"""Uploads a manifest to the gcs bucket and verifies that the endpoint returns the uploaded file."""
5333
# initialize provider on startup
5434
await init_provider()
@@ -57,7 +37,7 @@ async def test_get_manifest_success(
5737
app.include_router(router, prefix="/api/v1")
5838

5939
# upload a manifest file to GCS test container
60-
gcp_uploader.upload_content(orjson.dumps(mock_manifest_2025), "top_picks_latest.json")
40+
gcp_uploader.upload_content(orjson.dumps(mock_manifest), "top_picks_latest.json")
6141

6242
provider = get_provider()
6343
await provider.data_fetched_event.wait()

0 commit comments

Comments
 (0)