@@ -13,6 +13,9 @@ The WholeSiteReader is a sophisticated web scraping tool that employs a breadth-
- **URL Prefix Focus:** Targets scraping efforts to specific subsections of a site based on URL prefixes.
- **Selenium-Based:** Leverages Selenium for dynamic interaction with web pages, supporting JavaScript-rendered content.
- **Bring Your Own ChromeDriver:** Optionally pass a pre-configured Chrome WebDriver instance with your own options.
- **Configurable Rate Limiting:** Adjust the delay between page requests (default: 1.0 second).
- **robots.txt Support:** Respects website robots.txt policies for ethical scraping (default: enabled).
- **Progress Tracking:** Optional callback to monitor scraping progress in real-time.

```python
from llama_index.readers.web import WholeSiteReader
```
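
A minimal usage sketch of the new options, assuming a placeholder documentation site; `delay` and `respect_robots_txt` correspond to the constructor arguments added in this PR. The progress callback added to `load_data` is sketched after the reader changes below.

```python
from llama_index.readers.web import WholeSiteReader

# Placeholder site; scope the crawl to the URL prefix you actually want.
scraper = WholeSiteReader(
    prefix="https://docs.example.com/guide/",  # only links under this prefix are queued
    max_depth=3,
    delay=2.0,  # seconds to wait between page requests (default 1.0)
    respect_robots_txt=True,  # skip URLs disallowed by robots.txt (default True)
)

documents = scraper.load_data(base_url="https://docs.example.com/guide/")
print(f"Scraped {len(documents)} pages")
```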
@@ -1,6 +1,9 @@
import logging
import time
import warnings
from typing import List, Optional
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
@@ -10,6 +13,8 @@
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

logger = logging.getLogger(__name__)


class WholeSiteReader(BaseReader):
"""
@@ -21,11 +26,16 @@ class WholeSiteReader(BaseReader):
Attributes:
prefix (str): URL prefix to focus the scraping.
max_depth (int): Maximum depth for BFS algorithm.
delay (float): Delay in seconds between page requests.
respect_robots_txt (bool): Whether to respect robots.txt rules.

Args:
prefix (str): URL prefix for scraping.
max_depth (int, optional): Maximum depth for BFS. Defaults to 10.
uri_as_id (bool, optional): Whether to use the URI as the document ID. Defaults to False.
driver (Optional[webdriver.Chrome], optional): Custom Chrome WebDriver instance. Defaults to None.
delay (float, optional): Delay in seconds between page requests. Defaults to 1.0.
respect_robots_txt (bool, optional): Whether to respect robots.txt rules. Defaults to True.

"""

@@ -35,14 +45,77 @@ def __init__(
max_depth: int = 10,
uri_as_id: bool = False,
driver: Optional[webdriver.Chrome] = None,
delay: float = 1.0,
respect_robots_txt: bool = True,
) -> None:
"""
Initialize the WholeSiteReader with the provided prefix and maximum depth.

Args:
prefix (str): URL prefix for scraping.
max_depth (int): Maximum depth for BFS algorithm.
uri_as_id (bool): Whether to use the URI as the document ID.
driver (Optional[webdriver.Chrome]): Custom Chrome WebDriver instance.
delay (float): Delay in seconds between page requests.
respect_robots_txt (bool): Whether to respect robots.txt rules.

"""
self.prefix = prefix
self.max_depth = max_depth
self.uri_as_id = uri_as_id
self.driver = driver if driver else self.setup_driver()
self.delay = delay
self.respect_robots_txt = respect_robots_txt
self._robot_parser: Optional[RobotFileParser] = None

# Initialize robots.txt parser if enabled
if self.respect_robots_txt:
self._init_robot_parser()

def _init_robot_parser(self) -> None:
"""
Initialize the robots.txt parser for the base URL.

This method fetches and parses the robots.txt file from the base URL
to determine which URLs are allowed to be crawled.

"""
try:
parsed_url = urlparse(self.prefix)
robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

self._robot_parser = RobotFileParser()
self._robot_parser.set_url(robots_url)
self._robot_parser.read()
logger.info(f"Successfully loaded robots.txt from {robots_url}")
except Exception as e:
logger.warning(
f"Failed to load robots.txt: {e}. Proceeding without robots.txt restrictions."
)
self._robot_parser = None

def _can_fetch(self, url: str) -> bool:
"""
Check if the given URL can be fetched according to robots.txt rules.

Args:
url (str): The URL to check.

Returns:
bool: True if the URL can be fetched, False otherwise.

"""
if not self.respect_robots_txt or self._robot_parser is None:
return True

try:
can_fetch = self._robot_parser.can_fetch("*", url)
if not can_fetch:
logger.info(f"URL disallowed by robots.txt: {url}")
return can_fetch
except Exception as e:
logger.warning(f"Error checking robots.txt for {url}: {e}. Allowing fetch.")
return True

def setup_driver(self):
"""
@@ -90,13 +163,19 @@ def extract_links(self):
"""
return self.driver.execute_script(js_script)

def load_data(self, base_url: str) -> List[Document]:
def load_data(
self,
base_url: str,
progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
) -> List[Document]:
"""
Load data from the base URL using BFS algorithm.

Args:
base_url (str): Base URL to start scraping.

progress_callback (Optional[Callable[[Dict[str, Any]], None]]): Optional callback
function to track progress. The callback receives a dictionary with keys:
'current_url', 'depth', 'pages_visited', 'pages_remaining', 'total_pages_found'.

Returns:
List[Document]: List of scraped documents.
@@ -105,15 +184,36 @@ def load_data(self, base_url: str) -> List[Document]:
added_urls = set()
urls_to_visit = [(base_url, 0)]
documents = []
pages_visited = 0

while urls_to_visit:
current_url, depth = urls_to_visit.pop(0)
print(f"Visiting: {current_url}, {len(urls_to_visit)} left")
logger.info(
f"Visiting: {current_url}, {len(urls_to_visit)} pages remaining"
)

# Call progress callback if provided
if progress_callback:
progress_callback(
{
"current_url": current_url,
"depth": depth,
"pages_visited": pages_visited,
"pages_remaining": len(urls_to_visit),
"total_pages_found": len(added_urls),
}
)

# Check robots.txt before visiting
if not self._can_fetch(current_url):
logger.info(f"Skipping URL disallowed by robots.txt: {current_url}")
continue

try:
self.driver.get(current_url)
page_content = self.extract_content()
added_urls.add(current_url)
pages_visited += 1

next_depth = depth + 1
if next_depth <= self.max_depth:
@@ -123,14 +223,17 @@ def load_data(self, base_url: str) -> List[Document]:
links = [self.clean_url(link) for link in links]
# extract new links
links = [link for link in links if link not in added_urls]
print(f"Found {len(links)} new potential links")
logger.info(
f"Found {len(links)} new potential links at depth {depth}"
)

for href in links:
try:
if href.startswith(self.prefix) and href not in added_urls:
urls_to_visit.append((href, next_depth))
added_urls.add(href)
except Exception:
except Exception as e:
logger.debug(f"Error processing link {href}: {e}")
continue

doc = Document(text=page_content, extra_info={"URL": current_url})
@@ -140,14 +243,22 @@
)
doc.id_ = current_url
documents.append(doc)
time.sleep(1)
logger.debug(f"Successfully scraped {current_url}")
time.sleep(self.delay)

except WebDriverException:
print("WebDriverException encountered, restarting driver...")
except WebDriverException as e:
logger.error(
f"WebDriverException encountered: {e}. Restarting driver..."
)
self.restart_driver()
except Exception as e:
print(f"An unexpected exception occurred: {e}, skipping URL...")
logger.error(
f"An unexpected exception occurred: {e}. Skipping URL: {current_url}"
)
continue

self.driver.quit()
logger.info(
f"Scraping complete. Visited {pages_visited} pages, collected {len(documents)} documents."
)
return documents
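
For reference, a hedged sketch of the progress-tracking hook, wired to the dictionary keys documented in the `load_data` docstring above (`current_url`, `depth`, `pages_visited`, `pages_remaining`, `total_pages_found`); the site URL is again a placeholder.

```python
from typing import Any, Dict

from llama_index.readers.web import WholeSiteReader


def log_progress(info: Dict[str, Any]) -> None:
    # Keys come from the load_data docstring above.
    print(
        f"[depth {info['depth']}] {info['current_url']} "
        f"(visited={info['pages_visited']}, queued={info['pages_remaining']}, "
        f"found={info['total_pages_found']})"
    )


reader = WholeSiteReader(prefix="https://docs.example.com/", max_depth=2)
docs = reader.load_data(
    base_url="https://docs.example.com/",
    progress_callback=log_progress,
)
```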
@@ -26,7 +26,7 @@ dev = [

[project]
name = "llama-index-readers-web"
version = "0.5.6"
version = "0.6.0"
description = "llama-index readers web integration"
authors = [{name = "Your Name", email = "[email protected]"}]
requires-python = ">=3.9,<4.0"