 
 DEFAULT_FAVICON = "PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9ImN1cnJlbnRDb2xvciIgc3Ryb2tlLXdpZHRoPSIyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiIGNsYXNzPSJsdWNpZGUgbHVjaWRlLWxpbmstaWNvbiBsdWNpZGUtbGluayI+PHBhdGggZD0iTTEwIDEzYTUgNSAwIDAgMCA3LjU0LjU0bDMtM2E1IDUgMCAwIDAtNy4wNy03LjA3bC0xLjcyIDEuNzEiLz48cGF0aCBkPSJNMTQgMTFhNSA1IDAgMCAwLTcuNTQtLjU0bC0zIDNhNSA1IDAgMCAwIDcuMDcgNy4wN2wxLjcxLTEuNzEiLz48L3N2Zz4="  # noqa: E501
 
-
-@shared_task
-def crawl_work_item_link_title(id: str, url: str) -> None:
-    meta_data = crawl_work_item_link_title_and_favicon(url)
-    issue_link = IssueLink.objects.get(id=id)
-
-    issue_link.metadata = meta_data
-
-    issue_link.save()
-
-
 def crawl_work_item_link_title_and_favicon(url: str) -> Dict[str, Any]:
     """
     Crawls a URL to extract the title and favicon.
@@ -57,17 +46,18 @@ def crawl_work_item_link_title_and_favicon(url: str) -> Dict[str, Any]:
5746 "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501
5847 }
5948
60- # Fetch the main page
61- response = requests . get ( url , headers = headers , timeout = 2 )
49+ soup = None
50+ title = None
6251
63- response .raise_for_status ()
52+ try :
53+ response = requests .get (url , headers = headers , timeout = 1 )
6454
65- # Parse HTML
66- soup = BeautifulSoup (response .content , "html.parser" )
55+ soup = BeautifulSoup (response .content , "html.parser" )
56+ title_tag = soup .find ("title" )
57+ title = title_tag .get_text ().strip () if title_tag else None
6758
68- # Extract title
69- title_tag = soup .find ("title" )
70- title = title_tag .get_text ().strip () if title_tag else None
59+ except requests .RequestException as e :
60+ logger .warning (f"Failed to fetch HTML for title: { str (e )} " )
7161
7262 # Fetch and encode favicon
7363 favicon_base64 = fetch_and_encode_favicon (headers , soup , url )
@@ -82,14 +72,6 @@ def crawl_work_item_link_title_and_favicon(url: str) -> Dict[str, Any]:
 
         return result
 
-    except requests.RequestException as e:
-        log_exception(e)
-        return {
-            "error": f"Request failed: {str(e)}",
-            "title": None,
-            "favicon": None,
-            "url": url,
-        }
     except Exception as e:
         log_exception(e)
         return {
@@ -100,7 +82,7 @@ def crawl_work_item_link_title_and_favicon(url: str) -> Dict[str, Any]:
         }
 
 
-def find_favicon_url(soup: BeautifulSoup, base_url: str) -> Optional[str]:
+def find_favicon_url(soup: Optional[BeautifulSoup], base_url: str) -> Optional[str]:
     """
     Find the favicon URL from HTML soup.
 
@@ -111,18 +93,20 @@ def find_favicon_url(soup: BeautifulSoup, base_url: str) -> Optional[str]:
     Returns:
         str: Absolute URL to favicon or None
     """
-    # Look for various favicon link tags
-    favicon_selectors = [
-        'link[rel="icon"]',
-        'link[rel="shortcut icon"]',
-        'link[rel="apple-touch-icon"]',
-        'link[rel="apple-touch-icon-precomposed"]',
-    ]
-
-    for selector in favicon_selectors:
-        favicon_tag = soup.select_one(selector)
-        if favicon_tag and favicon_tag.get("href"):
-            return urljoin(base_url, favicon_tag["href"])
+
+    if soup is not None:
+        # Look for various favicon link tags
+        favicon_selectors = [
+            'link[rel="icon"]',
+            'link[rel="shortcut icon"]',
+            'link[rel="apple-touch-icon"]',
+            'link[rel="apple-touch-icon-precomposed"]',
+        ]
+
+        for selector in favicon_selectors:
+            favicon_tag = soup.select_one(selector)
+            if favicon_tag and favicon_tag.get("href"):
+                return urljoin(base_url, favicon_tag["href"])
 
     # Fallback to /favicon.ico
     parsed_url = urlparse(base_url)
@@ -131,7 +115,6 @@ def find_favicon_url(soup: BeautifulSoup, base_url: str) -> Optional[str]:
     # Check if fallback exists
     try:
         response = requests.head(fallback_url, timeout=2)
-        response.raise_for_status()
         if response.status_code == 200:
             return fallback_url
     except requests.RequestException as e:
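For context, the lookup in this hunk resolves whatever `href` the page declares against the page URL before falling back to the conventional `/favicon.ico` path. A minimal standalone sketch of that resolution, using only the standard library and hypothetical example URLs (not taken from the diff):

from urllib.parse import urljoin, urlparse

# Hypothetical page URL and a relative href as it might appear in a <link rel="icon"> tag.
base_url = "https://example.com/projects/plane/issues/42"
declared_href = "/static/favicon-32x32.png"

# urljoin resolves the declared href against the page URL.
print(urljoin(base_url, declared_href))
# -> https://example.com/static/favicon-32x32.png

# When no <link> tag is found, the fallback is built from the site root.
parsed = urlparse(base_url)
print(f"{parsed.scheme}://{parsed.netloc}/favicon.ico")
# -> https://example.com/favicon.ico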
@@ -142,8 +125,8 @@ def find_favicon_url(soup: BeautifulSoup, base_url: str) -> Optional[str]:
 
 
 def fetch_and_encode_favicon(
-    headers: Dict[str, str], soup: BeautifulSoup, url: str
-) -> Optional[Dict[str, str]]:
+    headers: Dict[str, str], soup: Optional[BeautifulSoup], url: str
+) -> Dict[str, Optional[str]]:
     """
     Fetch favicon and encode it as base64.
 
@@ -162,8 +145,7 @@ def fetch_and_encode_favicon(
162145 "favicon_base64" : f"data:image/svg+xml;base64,{ DEFAULT_FAVICON } " ,
163146 }
164147
165- response = requests .get (favicon_url , headers = headers , timeout = 2 )
166- response .raise_for_status ()
148+ response = requests .get (favicon_url , headers = headers , timeout = 1 )
167149
168150 # Get content type
169151 content_type = response .headers .get ("content-type" , "image/x-icon" )
@@ -183,3 +165,13 @@ def fetch_and_encode_favicon(
183165 "favicon_url" : None ,
184166 "favicon_base64" : f"data:image/svg+xml;base64,{ DEFAULT_FAVICON } " ,
185167 }
168+
169+
170+ @shared_task
171+ def crawl_work_item_link_title (id : str , url : str ) -> None :
172+ meta_data = crawl_work_item_link_title_and_favicon (url )
173+ issue_link = IssueLink .objects .get (id = id )
174+
175+ issue_link .metadata = meta_data
176+
177+ issue_link .save ()
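For reference, a small usage sketch of the relocated task and the helper it wraps. This is not part of the diff: the call site, the argument values, and the success-path result keys are assumptions inferred from the error-path return and from fetch_and_encode_favicon.

# Hypothetical call site: enqueue the Celery task for a saved link.
# Both argument values are illustrative only.
crawl_work_item_link_title.delay(id="<issue-link-uuid>", url="https://example.com")

# The synchronous helper can also be exercised directly. With this change, a
# failed page fetch no longer returns an "error" payload; the title stays None,
# the warning is logged, and the favicon falls back to DEFAULT_FAVICON.
meta = crawl_work_item_link_title_and_favicon("https://example.com")
print(meta.get("title"))    # e.g. "Example Domain", or None if the fetch failed
print(meta.get("favicon"))  # assumed key for the fetch_and_encode_favicon result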