88from rich .table import Table
99from pydantic import BaseModel
1010from google .cloud .storage import Blob
11-
12- from merino .jobs .navigational_suggestions .domain_metadata_extractor import (
13- DomainMetadataExtractor ,
14- current_scraper ,
15- Scraper ,
16- )
17- from merino .jobs .navigational_suggestions .utils import AsyncFaviconDownloader
18- from merino .jobs .navigational_suggestions .domain_metadata_uploader import (
19- DomainMetadataUploader ,
20- )
21- from merino .utils .gcs .models import BaseContentUploader
11+ import ast
12+ import re
13+ from pprint import pformat
14+ import tldextract
15+ from pathlib import Path
2216
2317cli = typer .Typer (no_args_is_help = True )
2418console = Console ()
19+ FAVICON_PATH = "merino/jobs/navigational_suggestions/custom_favicons.py"
2520
2621
2722class DomainTestResult (BaseModel ):
@@ -38,6 +33,17 @@ class DomainTestResult(BaseModel):
3833
3934async def async_test_domain (domain : str , min_width : int ) -> DomainTestResult :
4035 """Test metadata extraction for a single domain asynchronously"""
36+ from merino .jobs .navigational_suggestions .domain_metadata_extractor import (
37+ DomainMetadataExtractor ,
38+ current_scraper ,
39+ Scraper ,
40+ )
41+ from merino .jobs .navigational_suggestions .utils import AsyncFaviconDownloader
42+ from merino .jobs .navigational_suggestions .domain_metadata_uploader import (
43+ DomainMetadataUploader ,
44+ )
45+ from merino .utils .gcs .models import BaseContentUploader
46+
4147 timestamp = datetime .now ().isoformat ()
4248
4349 try :
@@ -206,12 +212,67 @@ async def probe_domains(domains: list[str], min_width: int) -> list[DomainTestRe
206212 return results
207213
208214
def update_custom_favicons(title: str, url: str, table: Table) -> None:
    """Add or replace one entry in the ``CUSTOM_FAVICONS`` dict stored in ``FAVICON_PATH``.

    The file at ``FAVICON_PATH`` is read, the module-level ``CUSTOM_FAVICONS``
    assignment is located through the syntax tree, its literal value is
    evaluated, the ``title.lower() -> url`` pair is merged in, and the whole
    assignment is rewritten with one entry per line (sorted by key). The file
    is written back only if the regenerated source still parses as Python.

    Outcomes are reported as rows on ``table`` rather than by raising:
    ``"Saved Domain"`` / ``"Saved URL"`` / ``"Save PATH"`` on success, or a
    single ``"Error"`` row on failure.

    Args:
        title: Domain name used as the dictionary key; it is lower-cased.
        url: Favicon URL stored as the value.
        table: Table that receives the result rows (``add_row`` is called on it).

    Raises:
        OSError: If ``FAVICON_PATH`` cannot be opened for reading.

    Note:
        The lines occupied by the existing assignment are replaced wholesale,
        so comments inside the dictionary literal are not preserved.
    """
    import json  # local import: used only to emit correctly escaped string literals

    # Python source files are UTF-8 by default; do not depend on the locale encoding.
    with open(FAVICON_PATH, "r", encoding="utf-8") as f:
        content = f.read()

    try:
        module = ast.parse(content)
    except (SyntaxError, ValueError) as e:
        table.add_row("Error", f"Unable to parse CUSTOM_FAVICONS dictionary: {e}")
        return

    # Locate the assignment via the AST instead of a regex: a pattern such as
    # r"\{.*?\}" stops at the first "}" and breaks on values containing braces.
    target_node = None
    for node in module.body:
        if isinstance(node, ast.AnnAssign):
            targets = [node.target]
        elif isinstance(node, ast.Assign):
            targets = node.targets
        else:
            continue
        if any(isinstance(t, ast.Name) and t.id == "CUSTOM_FAVICONS" for t in targets):
            target_node = node
            break

    if target_node is None or target_node.value is None:
        table.add_row("Error", "Cannot find CUSTOM_FAVICONS dictionary")
        return

    try:
        parsed_dict = ast.literal_eval(target_node.value)
    except (ValueError, TypeError, SyntaxError) as e:
        table.add_row("Error", f"Unable to parse CUSTOM_FAVICONS dictionary: {e}")
        return

    is_str_mapping = isinstance(parsed_dict, dict) and all(
        isinstance(key, str) and isinstance(value, str) for key, value in parsed_dict.items()
    )
    if not is_str_mapping:
        table.add_row(
            "Error",
            "Unable to parse CUSTOM_FAVICONS dictionary: expected a dict[str, str] literal",
        )
        return

    parsed_dict[title.lower()] = url

    # Render each entry as a properly escaped, double-quoted literal. Editing the
    # repr text with str.replace would corrupt URLs containing quotes or braces.
    # ensure_ascii=False keeps non-ASCII characters verbatim so they round-trip
    # unchanged through the Python parser.
    entries = [
        f"    {json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)},"
        for key, value in sorted(parsed_dict.items())
    ]
    rendered = ["CUSTOM_FAVICONS: dict[str, str] = {", *entries, "}"]

    # Text-mode reading normalises line endings to "\n", so splitting on "\n"
    # lines up with the 1-based line numbers reported by the AST.
    lines = content.split("\n")
    lines[target_node.lineno - 1 : target_node.end_lineno] = rendered
    updated_content = "\n".join(lines)

    # Safety net: never overwrite the module with source that does not parse.
    try:
        ast.parse(updated_content)
    except (SyntaxError, ValueError):
        table.add_row("Error", "Result is an invalid file")
        return

    try:
        with open(FAVICON_PATH, "w", encoding="utf-8") as f:
            f.write(updated_content)
    except OSError as e:
        # Report I/O failures for what they are instead of as "invalid file".
        table.add_row("Error", f"Unable to write {FAVICON_PATH}: {e}")
        return

    table.add_row("Saved Domain", title)
    table.add_row("Saved URL", url)
    table.add_row("Save PATH", FAVICON_PATH)
249+
250+
def favicon_width_convertor(width: str) -> int:
    """Convert a favicon ``sizes`` value into a single integer size.

    The value is scanned for ``<width>x<height>`` pairs (``x`` or ``X``) and
    the largest dimension found across all pairs is returned. Dimensions are
    compared numerically; comparing the raw strings would rank ``"16"`` above
    ``"128"``.

    Args:
        width: A ``sizes`` string such as ``"32x32"``, ``"16x16 32x32"`` or
            ``"any"``.

    Returns:
        The largest dimension found, or ``1`` when the value contains no
        ``<width>x<height>`` pair (for example ``"any"``, ``""`` or a bare
        number).
    """
    if not isinstance(width, str):
        # A missing or malformed attribute is treated like an unknown size.
        return 1
    dimensions = [
        int(value)
        for pair in re.findall(r"(\d+)\s*[xX]\s*(\d+)", width)
        for value in pair
    ]
    return max(dimensions, default=1)
259+
260+
209261@cli .command ()
210262def test_domains (
211263 domains : list [str ] = typer .Argument (..., help = "List of domains to test" ),
212264 min_width : int = typer .Option (32 , help = "Minimum favicon width" , show_default = True ),
265+ save_favicon : bool = typer .Option (False , "--save" , help = "Save custom favicon" , is_flag = True ),
213266):
214267 """Test domain metadata extraction for multiple domains"""
268+ if not Path ("pyproject.toml" ).exists ():
269+ print ("The probe-images command must be run from the root directory." )
270+ return
271+
272+ from merino .jobs .navigational_suggestions .domain_metadata_extractor import (
273+ DomainMetadataExtractor ,
274+ )
275+
215276 with console .status ("Testing domains concurrently..." ):
216277 results = asyncio .run (probe_domains (domains , min_width ))
217278
@@ -227,6 +288,32 @@ def test_domains(
227288 console .print ("✅ Success!" )
228289 console .print (table )
229290
291+ save_table = Table (show_header = False , box = None )
292+
293+ if save_favicon and result .favicon_data :
294+ title = tldextract .extract (result .domain ).domain
295+ best_icon = result .favicon_data ["links" ][0 ]
296+ best_width = favicon_width_convertor (
297+ result .favicon_data ["links" ][0 ].get ("sizes" , "1x1" )
298+ )
299+ for icon in result .favicon_data ["links" ]:
300+ if not best_icon :
301+ best_icon = icon
302+ best_width = favicon_width_convertor (icon .get ("sizes" , "1x1" ))
303+ continue
304+ width = favicon_width_convertor (icon .get ("sizes" , "1x1" ))
305+ if DomainMetadataExtractor .is_better_favicon (
306+ icon , width , best_width , best_icon ["_source" ]
307+ ):
308+ best_icon = icon
309+ best_width = width
310+ if title and best_icon :
311+ update_custom_favicons (title , best_icon ["href" ], save_table )
312+ elif not title :
313+ save_table .add_row ("Error" , "Unable to extract domain" )
314+ else :
315+ save_table .add_row ("Error" , "Unable to find any favicons" )
316+
230317 if result .favicon_data :
231318 console .print ("\n All favicons found:" )
232319 for link in result .favicon_data ["links" ]:
@@ -239,6 +326,11 @@ def test_domains(
239326 if "type" in link :
240327 desc .append (f"type={ link ['type' ]} " )
241328 console .print (f"- { link ['href' ]} ({ ' ' .join (desc )} )" )
329+
330+ if save_favicon :
331+ console .print ("\n Save Results:" )
332+ console .print (save_table )
333+
242334 else :
243335 console .print (f"\n ❌ Failed testing domain: { result .domain } " )
244336 if result .error :
0 commit comments