Skip to content

Commit c1d087d

Browse files
Add automated saving for custom favicons (#1135)
1 parent d8c91e2 commit c1d087d

File tree

3 files changed

+126
-33
lines changed

3 files changed

+126
-33
lines changed

merino/jobs/navigational_suggestions/domain_metadata_extractor.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -334,8 +334,9 @@ async def _extract_favicons(
334334

335335
return favicons
336336

337-
def _is_better_favicon(
338-
self, favicon: dict[str, Any], width: int, best_width: int, best_source: str
337+
@staticmethod
338+
def is_better_favicon(
339+
favicon: dict[str, Any], width: int, best_width: int, best_source: str
339340
) -> bool:
340341
"""Check if this favicon is better than the current best using Firefox prioritization"""
341342
source = favicon.get("_source", "default")
@@ -481,7 +482,7 @@ async def _upload_best_favicon(
481482
continue
482483

483484
# Check if this is a better favicon than what we've seen so far
484-
if self._is_better_favicon(
485+
if self.is_better_favicon(
485486
favicons[original_idx],
486487
width_val,
487488
best_favicon_width,
@@ -502,7 +503,7 @@ async def _upload_best_favicon(
502503
except Exception as e:
503504
logger.warning(f"Failed to upload bitmap favicon: {e}")
504505
# Fallback to original URL if upload fails
505-
if self._is_better_favicon(
506+
if self.is_better_favicon(
506507
favicons[original_idx],
507508
width_val,
508509
best_favicon_width,

merino/jobs/utils/domain_tester.py

Lines changed: 103 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,15 @@
88
from rich.table import Table
99
from pydantic import BaseModel
1010
from google.cloud.storage import Blob
11-
12-
from merino.jobs.navigational_suggestions.domain_metadata_extractor import (
13-
DomainMetadataExtractor,
14-
current_scraper,
15-
Scraper,
16-
)
17-
from merino.jobs.navigational_suggestions.utils import AsyncFaviconDownloader
18-
from merino.jobs.navigational_suggestions.domain_metadata_uploader import (
19-
DomainMetadataUploader,
20-
)
21-
from merino.utils.gcs.models import BaseContentUploader
11+
import ast
12+
import re
13+
from pprint import pformat
14+
import tldextract
15+
from pathlib import Path
2216

2317
cli = typer.Typer(no_args_is_help=True)
2418
console = Console()
19+
FAVICON_PATH = "merino/jobs/navigational_suggestions/custom_favicons.py"
2520

2621

2722
class DomainTestResult(BaseModel):
@@ -38,6 +33,17 @@ class DomainTestResult(BaseModel):
3833

3934
async def async_test_domain(domain: str, min_width: int) -> DomainTestResult:
4035
"""Test metadata extraction for a single domain asynchronously"""
36+
from merino.jobs.navigational_suggestions.domain_metadata_extractor import (
37+
DomainMetadataExtractor,
38+
current_scraper,
39+
Scraper,
40+
)
41+
from merino.jobs.navigational_suggestions.utils import AsyncFaviconDownloader
42+
from merino.jobs.navigational_suggestions.domain_metadata_uploader import (
43+
DomainMetadataUploader,
44+
)
45+
from merino.utils.gcs.models import BaseContentUploader
46+
4147
timestamp = datetime.now().isoformat()
4248

4349
try:
@@ -206,12 +212,67 @@ async def probe_domains(domains: list[str], min_width: int) -> list[DomainTestRe
206212
return results
207213

208214

215+
def update_custom_favicons(title: str, url: str, table: Table) -> None:
216+
"""Update the custom favicons dictionary with a given title and url."""
217+
dic = {title.lower(): url}
218+
with open(FAVICON_PATH, "r") as f:
219+
content = f.read()
220+
pattern = r"(\s*CUSTOM_FAVICONS\s*:\s*dict\[\s*str\s*,\s*str\s*\]\s*=\s*\{.*?\})"
221+
match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
222+
if not match:
223+
table.add_row("Error", "Cannot find CUSTOM_FAVICONS dictionary")
224+
return
225+
dict_str = match.group(1)
226+
try:
227+
dict_part = dict_str.split("=", 1)[1].strip()
228+
parsed_dict = ast.literal_eval(dict_part)
229+
except Exception as e:
230+
table.add_row("Error", f"Unable to parse CUSTOM_FAVICONS dictionary: {e}")
231+
return
232+
parsed_dict.update(dic)
233+
updated_dict_str = "\nCUSTOM_FAVICONS: dict[str, str] = {\n "
234+
updated_dict_str += (
235+
pformat(parsed_dict, indent=4).replace("{", "").replace("}", "").replace("'", '"')
236+
)
237+
updated_dict_str += "\n}"
238+
updated_content = content.replace(dict_str, updated_dict_str)
239+
try:
240+
# Abstract Syntax Tree parsing suceeds only if the target is valid python code
241+
ast.parse(updated_content)
242+
with open(FAVICON_PATH, "w") as f:
243+
f.write(updated_content)
244+
table.add_row("Saved Domain", title)
245+
table.add_row("Saved URL", url)
246+
table.add_row("Save PATH", FAVICON_PATH)
247+
except Exception:
248+
table.add_row("Error", "Result is an invalid file")
249+
250+
251+
def favicon_width_convertor(width: str) -> int:
252+
"""Convert the width of a favicon to an integer."""
253+
size = width.split("x")
254+
if len(size) < 2:
255+
best_width = 1
256+
else:
257+
best_width = int(max(size))
258+
return best_width
259+
260+
209261
@cli.command()
210262
def test_domains(
211263
domains: list[str] = typer.Argument(..., help="List of domains to test"),
212264
min_width: int = typer.Option(32, help="Minimum favicon width", show_default=True),
265+
save_favicon: bool = typer.Option(False, "--save", help="Save custom favicon", is_flag=True),
213266
):
214267
"""Test domain metadata extraction for multiple domains"""
268+
if not Path("pyproject.toml").exists():
269+
print("The probe-images command must be run from the root directory.")
270+
return
271+
272+
from merino.jobs.navigational_suggestions.domain_metadata_extractor import (
273+
DomainMetadataExtractor,
274+
)
275+
215276
with console.status("Testing domains concurrently..."):
216277
results = asyncio.run(probe_domains(domains, min_width))
217278

@@ -227,6 +288,32 @@ def test_domains(
227288
console.print("✅ Success!")
228289
console.print(table)
229290

291+
save_table = Table(show_header=False, box=None)
292+
293+
if save_favicon and result.favicon_data:
294+
title = tldextract.extract(result.domain).domain
295+
best_icon = result.favicon_data["links"][0]
296+
best_width = favicon_width_convertor(
297+
result.favicon_data["links"][0].get("sizes", "1x1")
298+
)
299+
for icon in result.favicon_data["links"]:
300+
if not best_icon:
301+
best_icon = icon
302+
best_width = favicon_width_convertor(icon.get("sizes", "1x1"))
303+
continue
304+
width = favicon_width_convertor(icon.get("sizes", "1x1"))
305+
if DomainMetadataExtractor.is_better_favicon(
306+
icon, width, best_width, best_icon["_source"]
307+
):
308+
best_icon = icon
309+
best_width = width
310+
if title and best_icon:
311+
update_custom_favicons(title, best_icon["href"], save_table)
312+
elif not title:
313+
save_table.add_row("Error", "Unable to extract domain")
314+
else:
315+
save_table.add_row("Error", "Unable to find any favicons")
316+
230317
if result.favicon_data:
231318
console.print("\nAll favicons found:")
232319
for link in result.favicon_data["links"]:
@@ -239,6 +326,11 @@ def test_domains(
239326
if "type" in link:
240327
desc.append(f"type={link['type']}")
241328
console.print(f"- {link['href']} ({' '.join(desc)})")
329+
330+
if save_favicon:
331+
console.print("\nSave Results:")
332+
console.print(save_table)
333+
242334
else:
243335
console.print(f"\n❌ Failed testing domain: {result.domain}")
244336
if result.error:

tests/integration/jobs/navigational_suggestions/test_domain_metadata_extractor.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -612,7 +612,7 @@ async def test_upload_best_favicon_source_prioritization(
612612
):
613613
"""Test the Firefox-style favicon prioritization with _source tracking.
614614
615-
Firefox prioritization rules (implemented in _is_better_favicon):
615+
Firefox prioritization rules (implemented in is_better_favicon):
616616
1. Source priority: link > meta > manifest > default
617617
2. Within same source: larger size wins
618618
3. SVGs are handled during the early SVG processing phase
@@ -1179,7 +1179,7 @@ def test_process_domain_metadata_multiple_tlds(self, mock_uploader, mocker):
11791179
assert res["url"].startswith("https://espn")
11801180

11811181
def test_is_better_favicon_prioritization_rules(self):
1182-
"""Test the _is_better_favicon method that implements Firefox prioritization logic.
1182+
"""Test the is_better_favicon method that implements Firefox prioritization logic.
11831183
11841184
This tests the core prioritization algorithm:
11851185
1. Source priority (most important): link(1) > meta(2) > manifest(3) > default(4)
@@ -1195,55 +1195,55 @@ def test_is_better_favicon_prioritization_rules(self):
11951195

11961196
# Link source should win even with smaller size
11971197
assert (
1198-
extractor._is_better_favicon(link_small, width=16, best_width=512, best_source="meta")
1198+
extractor.is_better_favicon(link_small, width=16, best_width=512, best_source="meta")
11991199
is True
12001200
)
12011201
assert (
1202-
extractor._is_better_favicon(
1202+
extractor.is_better_favicon(
12031203
link_small, width=16, best_width=512, best_source="manifest"
12041204
)
12051205
is True
12061206
)
12071207
assert (
1208-
extractor._is_better_favicon(
1208+
extractor.is_better_favicon(
12091209
link_small, width=16, best_width=512, best_source="default"
12101210
)
12111211
is True
12121212
)
12131213

12141214
# Test 2: Meta beats manifest and default, loses to link
12151215
assert (
1216-
extractor._is_better_favicon(
1216+
extractor.is_better_favicon(
12171217
meta_large, width=16, best_width=512, best_source="manifest"
12181218
)
12191219
is True
12201220
)
12211221
assert (
1222-
extractor._is_better_favicon(
1222+
extractor.is_better_favicon(
12231223
meta_large, width=16, best_width=512, best_source="default"
12241224
)
12251225
is True
12261226
)
12271227
assert (
1228-
extractor._is_better_favicon(meta_large, width=512, best_width=16, best_source="link")
1228+
extractor.is_better_favicon(meta_large, width=512, best_width=16, best_source="link")
12291229
is False
12301230
)
12311231

12321232
# Test 3: Manifest beats default, loses to meta and link
12331233
assert (
1234-
extractor._is_better_favicon(
1234+
extractor.is_better_favicon(
12351235
manifest_huge, width=512, best_width=16, best_source="default"
12361236
)
12371237
is True
12381238
)
12391239
assert (
1240-
extractor._is_better_favicon(
1240+
extractor.is_better_favicon(
12411241
manifest_huge, width=512, best_width=16, best_source="meta"
12421242
)
12431243
is False
12441244
)
12451245
assert (
1246-
extractor._is_better_favicon(
1246+
extractor.is_better_favicon(
12471247
manifest_huge, width=512, best_width=16, best_source="link"
12481248
)
12491249
is False
@@ -1252,36 +1252,36 @@ def test_is_better_favicon_prioritization_rules(self):
12521252
# Test 4: Within same source, size matters
12531253
link_large = {"_source": "link"}
12541254
assert (
1255-
extractor._is_better_favicon(link_large, width=64, best_width=32, best_source="link")
1255+
extractor.is_better_favicon(link_large, width=64, best_width=32, best_source="link")
12561256
is True
12571257
)
12581258
assert (
1259-
extractor._is_better_favicon(link_large, width=32, best_width=64, best_source="link")
1259+
extractor.is_better_favicon(link_large, width=32, best_width=64, best_source="link")
12601260
is False
12611261
)
12621262
assert (
1263-
extractor._is_better_favicon(link_large, width=32, best_width=32, best_source="link")
1263+
extractor.is_better_favicon(link_large, width=32, best_width=32, best_source="link")
12641264
is False
12651265
)
12661266

12671267
# Test 5: Missing _source defaults to "default" priority (lowest)
12681268
no_source = {} # No _source field
12691269
assert (
1270-
extractor._is_better_favicon(no_source, width=512, best_width=16, best_source="link")
1270+
extractor.is_better_favicon(no_source, width=512, best_width=16, best_source="link")
12711271
is False
12721272
)
12731273
assert (
1274-
extractor._is_better_favicon(no_source, width=512, best_width=16, best_source="meta")
1274+
extractor.is_better_favicon(no_source, width=512, best_width=16, best_source="meta")
12751275
is False
12761276
)
12771277
assert (
1278-
extractor._is_better_favicon(
1278+
extractor.is_better_favicon(
12791279
no_source, width=512, best_width=16, best_source="manifest"
12801280
)
12811281
is False
12821282
)
12831283
# But should win against another default with larger size
12841284
assert (
1285-
extractor._is_better_favicon(no_source, width=64, best_width=32, best_source="default")
1285+
extractor.is_better_favicon(no_source, width=64, best_width=32, best_source="default")
12861286
is True
12871287
)

0 commit comments

Comments
 (0)