Skip to content

Commit 521de75

Browse files
committed
feat: add validating website function
1 parent cbf2da4 commit 521de75

File tree

2 files changed

+42
-2
lines changed

2 files changed

+42
-2
lines changed

scrapegraph-py/examples/smartscraper_example.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
from scrapegraph_py import Client
22
from scrapegraph_py.logger import sgai_logger
33

4-
sgai_logger.set_logging(level="INFO")
4+
# Set logging to DEBUG level to see all logs
5+
sgai_logger.set_logging(level="DEBUG")
56

67
# Initialize the client with explicit API key
7-
sgai_client = Client(api_key="your-api-key-here")
8+
# Placeholder only: never commit a real API key to an example file.
sgai_client = Client(api_key="your-api-key-here")
89

910
# SmartScraper request
1011
response = sgai_client.smartscraper(
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from urllib.parse import urlparse
2+
import socket
3+
import requests
4+
from scrapegraph_py.logger import sgai_logger as logger
5+
6+
def validate_website_url(url: str) -> None:
    """Validate that a website URL is well-formed and reachable.

    Three checks run in order: the URL has a scheme and a host, the host
    resolves, and a HEAD request (redirects followed, 5-second timeout)
    returns a non-error status.

    Args:
        url: Absolute URL to check, e.g. ``"https://example.com"``.

    Raises:
        ValueError: If the URL is malformed, its host cannot be resolved,
            or the HTTP request fails or returns an error status. When an
            underlying exception exists it is chained as ``__cause__``.
    """
    logger.info(f"🔍 Validating website URL: {url}")

    try:
        parsed = urlparse(url)
    except Exception as e:
        # urlparse itself can reject input (e.g. an unterminated IPv6
        # literal); report that as an invalid URL like any other failure.
        error_msg = f"Invalid URL: {e}"
        logger.error(f"❌ {error_msg}")
        raise ValueError(error_msg) from e

    # netloc may carry credentials and a port ("user:pw@host:8080"), which a
    # name lookup rejects; hostname is the bare host with both stripped.
    hostname = parsed.hostname

    # Checked outside the network try-block so this error is logged once and
    # raised as-is instead of being caught and re-wrapped further down.
    if not (parsed.scheme and parsed.netloc and hostname):
        logger.error(f"❌ Invalid URL format: {url}")
        raise ValueError("Invalid URL format")
    logger.info("✅ URL format is valid")

    try:
        # getaddrinfo resolves IPv4 and IPv6 hosts; gethostbyname is
        # IPv4-only. Both raise socket.gaierror on failure.
        logger.info(f"🔍 Checking domain accessibility: {hostname}")
        socket.getaddrinfo(hostname, None)

        # HEAD avoids downloading the body while still proving the site
        # answers with a success status.
        logger.info("🔍 Verifying website response...")
        response = requests.head(url, timeout=5, allow_redirects=True)
        response.raise_for_status()
        logger.info("✅ Website is accessible and responding")
    except socket.gaierror as e:
        error_msg = f"Could not resolve domain: {url}"
        logger.error(f"❌ {error_msg}")
        raise ValueError(error_msg) from e
    except requests.exceptions.RequestException as e:
        error_msg = f"Website not reachable: {url} - {e}"
        logger.error(f"❌ {error_msg}")
        raise ValueError(error_msg) from e
    except Exception as e:
        # Boundary handler: callers are promised ValueError for any failure.
        error_msg = f"Invalid URL: {e}"
        logger.error(f"❌ {error_msg}")
        raise ValueError(error_msg) from e

0 commit comments

Comments
(0)