1+ from urllib .parse import urlparse
2+ import socket
3+ import requests
4+ from scrapegraph_py .logger import sgai_logger as logger
5+
def validate_website_url(url: str) -> None:
    """Validate that ``url`` is well-formed and the website is reachable.

    Performs three checks, in order:
      1. URL format — scheme and network location must both be present.
      2. DNS resolution of the host.
      3. HTTP HEAD request (redirects followed, 5 s timeout) to confirm
         the server responds with a success status.

    Args:
        url: The website URL to validate.

    Raises:
        ValueError: If the URL is malformed, the domain cannot be
            resolved, or the website does not respond successfully.
    """
    logger.info(f"Validating website URL: {url}")

    # Format check first, before any network activity, so a malformed
    # URL raises its own ValueError rather than being re-wrapped below.
    parsed = urlparse(url)
    if not all([parsed.scheme, parsed.netloc]):
        logger.error(f"Invalid URL format: {url}")
        raise ValueError("Invalid URL format")
    logger.info("URL format is valid")

    # DNS resolution — confirms the domain exists before issuing HTTP.
    logger.info(f"Checking domain accessibility: {parsed.netloc}")
    try:
        socket.gethostbyname(parsed.netloc)
    except socket.gaierror as e:
        error_msg = f"Could not resolve domain: {url}"
        logger.error(error_msg)
        raise ValueError(error_msg) from e

    # HEAD request verifies the server actually responds; any HTTP-level
    # failure (timeout, connection error, 4xx/5xx) is surfaced as ValueError.
    logger.info("Verifying website response...")
    try:
        response = requests.head(url, timeout=5, allow_redirects=True)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        error_msg = f"Website not reachable: {url} - {e}"
        logger.error(error_msg)
        raise ValueError(error_msg) from e
    logger.info("Website is accessible and responding")