11# Python imports
22import logging
33
4-
54# Third party imports
65from celery import shared_task
76import requests
2019DEFAULT_FAVICON = "PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9ImN1cnJlbnRDb2xvciIgc3Ryb2tlLXdpZHRoPSIyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiIGNsYXNzPSJsdWNpZGUgbHVjaWRlLWxpbmstaWNvbiBsdWNpZGUtbGluayI+PHBhdGggZD0iTTEwIDEzYTUgNSAwIDAgMCA3LjU0LjU0bDMtM2E1IDUgMCAwIDAtNy4wNy03LjA3bC0xLjcyIDEuNzEiLz48cGF0aCBkPSJNMTQgMTFhNSA1IDAgMCAwLTcuNTQtLjU0bC0zIDNhNSA1IDAgMCAwIDcuMDcgNy4wN2wxLjcxLTEuNzEiLz48L3N2Zz4=" # noqa: E501
2120
2221
def validate_url_ip(url: str) -> None:
    """
    Validate that a URL doesn't point to a private/internal IP address.

    Only checks if the hostname is a direct IP address; domain names are
    NOT resolved here, so DNS-based SSRF (a hostname resolving to an
    internal address) is out of scope for this check.

    Args:
        url: The URL to validate

    Raises:
        ValueError: If the URL points to a private/internal IP
    """
    parsed = urlparse(url)
    hostname = parsed.hostname

    # Nothing to validate for URLs without a hostname (relative URLs etc.)
    if not hostname:
        return

    try:
        ip = ipaddress.ip_address(hostname)
    except ValueError:
        # Not an IP address (it's a domain name), nothing to check here
        return

    # Unwrap IPv4-mapped IPv6 addresses (e.g. ::ffff:127.0.0.1) so the
    # checks below see the embedded IPv4 address. Before Python 3.13,
    # is_private/is_loopback on the mapped IPv6 form return False, which
    # would let e.g. http://[::ffff:127.0.0.1]/ bypass this filter.
    mapped = getattr(ip, "ipv4_mapped", None)
    if mapped is not None:
        ip = mapped

    # It IS an IP address - check if it's private/internal. is_link_local
    # (e.g. 169.254.169.254, the cloud metadata endpoint) and
    # is_unspecified (0.0.0.0 / ::) are checked explicitly for
    # defense-in-depth on interpreters where is_private misses them.
    if (
        ip.is_private
        or ip.is_loopback
        or ip.is_reserved
        or ip.is_link_local
        or ip.is_unspecified
    ):
        raise ValueError("Access to private/internal networks is not allowed")
49+
2350def crawl_work_item_link_title_and_favicon (url : str ) -> Dict [str , Any ]:
2451 """
2552 Crawls a URL to extract the title and favicon.
@@ -31,27 +58,23 @@ def crawl_work_item_link_title_and_favicon(url: str) -> Dict[str, Any]:
3158 str: JSON string containing title and base64-encoded favicon
3259 """
3360 try :
34- # Prevent access to private IP ranges
35- parsed = urlparse (url )
36-
37- try :
38- ip = ipaddress .ip_address (parsed .hostname )
39- if ip .is_private or ip .is_loopback or ip .is_reserved :
40- raise ValueError ("Access to private/internal networks is not allowed" )
41- except ValueError :
42- # Not an IP address, continue with domain validation
43- pass
44-
4561 # Set up headers to mimic a real browser
4662 headers = {
4763 "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501
4864 }
4965
5066 soup = None
5167 title = None
68+ final_url = url
69+
70+ validate_url_ip (final_url )
5271
5372 try :
54- response = requests .get (url , headers = headers , timeout = 1 )
73+ response = requests .get (final_url , headers = headers , timeout = 1 )
74+ final_url = response .url # Get the final URL after any redirects
75+
76+ # check for redirected url also
77+ validate_url_ip (final_url )
5578
5679 soup = BeautifulSoup (response .content , "html.parser" )
5780 title_tag = soup .find ("title" )
@@ -60,8 +83,8 @@ def crawl_work_item_link_title_and_favicon(url: str) -> Dict[str, Any]:
6083 except requests .RequestException as e :
6184 logger .warning (f"Failed to fetch HTML for title: { str (e )} " )
6285
63- # Fetch and encode favicon
64- favicon_base64 = fetch_and_encode_favicon (headers , soup , url )
86+ # Fetch and encode favicon using final URL (after redirects)
87+ favicon_base64 = fetch_and_encode_favicon (headers , soup , final_url )
6588
6689 # Prepare result
6790 result = {
0 commit comments