@@ -13,6 +13,9 @@ The WholeSiteReader is a sophisticated web scraping tool that employs a breadth-
- **URL Prefix Focus:** Targets scraping efforts to specific subsections of a site based on URL prefixes.
- **Selenium-Based:** Leverages Selenium for dynamic interaction with web pages, supporting JavaScript-rendered content.
- **Bring Your Own ChromeDriver:** Optionally pass a pre-configured Chrome WebDriver instance with your own options.
- **Configurable Rate Limiting:** Adjust the delay between page requests (default: 1.0 second).
- **robots.txt Support:** Respects website robots.txt policies for ethical scraping (default: enabled).
- **Progress Tracking:** Optional callback to monitor scraping progress in real-time.

```python
from llama_index.readers.web import WholeSiteReader
```
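
A minimal usage sketch of the new options, assuming a placeholder documentation site; `delay` and `respect_robots_txt` correspond to the constructor arguments added in this PR. The progress callback added to `load_data` is sketched after the reader changes below.

```python
from llama_index.readers.web import WholeSiteReader

# Placeholder site; scope the crawl to the URL prefix you actually want.
scraper = WholeSiteReader(
    prefix="https://docs.example.com/guide/",  # only links under this prefix are queued
    max_depth=3,
    delay=2.0,  # seconds to wait between page requests (default 1.0)
    respect_robots_txt=True,  # skip URLs disallowed by robots.txt (default True)
)

documents = scraper.load_data(base_url="https://docs.example.com/guide/")
print(f"Scraped {len(documents)} pages")
```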
@@ -1,6 +1,9 @@
import logging
import time
import warnings
from typing import List, Optional
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
@@ -10,6 +13,8 @@
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

logger = logging.getLogger(__name__)


class WholeSiteReader(BaseReader):
"""
@@ -21,11 +26,16 @@ class WholeSiteReader(BaseReader):
Attributes:
prefix (str): URL prefix to focus the scraping.
max_depth (int): Maximum depth for BFS algorithm.
delay (float): Delay in seconds between page requests.
respect_robots_txt (bool): Whether to respect robots.txt rules.

Args:
prefix (str): URL prefix for scraping.
max_depth (int, optional): Maximum depth for BFS. Defaults to 10.
uri_as_id (bool, optional): Whether to use the URI as the document ID. Defaults to False.
driver (Optional[webdriver.Chrome], optional): Custom Chrome WebDriver instance. Defaults to None.
delay (float, optional): Delay in seconds between page requests. Defaults to 1.0.
respect_robots_txt (bool, optional): Whether to respect robots.txt rules. Defaults to True.

"""

@@ -35,14 +45,77 @@ def __init__(
max_depth: int = 10,
uri_as_id: bool = False,
driver: Optional[webdriver.Chrome] = None,
delay: float = 1.0,
respect_robots_txt: bool = True,
) -> None:
"""
Initialize the WholeSiteReader with the provided prefix and maximum depth.

Args:
prefix (str): URL prefix for scraping.
max_depth (int): Maximum depth for BFS algorithm.
uri_as_id (bool): Whether to use the URI as the document ID.
driver (Optional[webdriver.Chrome]): Custom Chrome WebDriver instance.
delay (float): Delay in seconds between page requests.
respect_robots_txt (bool): Whether to respect robots.txt rules.

"""
self.prefix = prefix
self.max_depth = max_depth
self.uri_as_id = uri_as_id
self.driver = driver if driver else self.setup_driver()
self.delay = delay
self.respect_robots_txt = respect_robots_txt
self._robot_parser: Optional[RobotFileParser] = None

# Initialize robots.txt parser if enabled
if self.respect_robots_txt:
self._init_robot_parser()

def _init_robot_parser(self) -> None:
"""
Initialize the robots.txt parser for the base URL.

This method fetches and parses the robots.txt file from the base URL
to determine which URLs are allowed to be crawled.

"""
try:
parsed_url = urlparse(self.prefix)
robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

self._robot_parser = RobotFileParser()
self._robot_parser.set_url(robots_url)
self._robot_parser.read()
logger.info(f"Successfully loaded robots.txt from {robots_url}")
except Exception as e:
logger.warning(
f"Failed to load robots.txt: {e}. Proceeding without robots.txt restrictions."
)
self._robot_parser = None

def _can_fetch(self, url: str) -> bool:
"""
Check if the given URL can be fetched according to robots.txt rules.

Args:
url (str): The URL to check.

Returns:
bool: True if the URL can be fetched, False otherwise.

"""
if not self.respect_robots_txt or self._robot_parser is None:
return True

try:
can_fetch = self._robot_parser.can_fetch("*", url)
if not can_fetch:
logger.info(f"URL disallowed by robots.txt: {url}")
return can_fetch
except Exception as e:
logger.warning(f"Error checking robots.txt for {url}: {e}. Allowing fetch.")
return True

def setup_driver(self):
"""
@@ -90,13 +163,19 @@ def extract_links(self):
"""
return self.driver.execute_script(js_script)

def load_data(self, base_url: str) -> List[Document]:
def load_data(
self,
base_url: str,
progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
) -> List[Document]:
"""
Load data from the base URL using BFS algorithm.

Args:
base_url (str): Base URL to start scraping.

progress_callback (Optional[Callable[[Dict[str, Any]], None]]): Optional callback
function to track progress. The callback receives a dictionary with keys:
'current_url', 'depth', 'pages_visited', 'pages_remaining', 'total_pages_found'.

Returns:
List[Document]: List of scraped documents.
@@ -105,15 +184,36 @@ def load_data(self, base_url: str) -> List[Document]:
added_urls = set()
urls_to_visit = [(base_url, 0)]
documents = []
pages_visited = 0

while urls_to_visit:
current_url, depth = urls_to_visit.pop(0)
print(f"Visiting: {current_url}, {len(urls_to_visit)} left")
logger.info(
f"Visiting: {current_url}, {len(urls_to_visit)} pages remaining"
)

# Call progress callback if provided
if progress_callback:
progress_callback(
{
"current_url": current_url,
"depth": depth,
"pages_visited": pages_visited,
"pages_remaining": len(urls_to_visit),
"total_pages_found": len(added_urls),
}
)

# Check robots.txt before visiting
if not self._can_fetch(current_url):
logger.info(f"Skipping URL disallowed by robots.txt: {current_url}")
continue

try:
self.driver.get(current_url)
page_content = self.extract_content()
added_urls.add(current_url)
pages_visited += 1

next_depth = depth + 1
if next_depth <= self.max_depth:
@@ -123,14 +223,17 @@ def load_data(self, base_url: str) -> List[Document]:
links = [self.clean_url(link) for link in links]
# extract new links
links = [link for link in links if link not in added_urls]
print(f"Found {len(links)} new potential links")
logger.info(
f"Found {len(links)} new potential links at depth {depth}"
)

for href in links:
try:
if href.startswith(self.prefix) and href not in added_urls:
urls_to_visit.append((href, next_depth))
added_urls.add(href)
except Exception:
except Exception as e:
logger.debug(f"Error processing link {href}: {e}")
continue

doc = Document(text=page_content, extra_info={"URL": current_url})
@@ -140,14 +243,22 @@
)
doc.id_ = current_url
documents.append(doc)
time.sleep(1)
logger.debug(f"Successfully scraped {current_url}")
time.sleep(self.delay)

except WebDriverException:
print("WebDriverException encountered, restarting driver...")
except WebDriverException as e:
logger.error(
f"WebDriverException encountered: {e}. Restarting driver..."
)
self.restart_driver()
except Exception as e:
print(f"An unexpected exception occurred: {e}, skipping URL...")
logger.error(
f"An unexpected exception occurred: {e}. Skipping URL: {current_url}"
)
continue

self.driver.quit()
logger.info(
f"Scraping complete. Visited {pages_visited} pages, collected {len(documents)} documents."
)
return documents
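
For reference, a hedged sketch of the progress-tracking hook, wired to the dictionary keys documented in the `load_data` docstring above (`current_url`, `depth`, `pages_visited`, `pages_remaining`, `total_pages_found`); the site URL is again a placeholder.

```python
from typing import Any, Dict

from llama_index.readers.web import WholeSiteReader


def log_progress(info: Dict[str, Any]) -> None:
    # Keys come from the load_data docstring above.
    print(
        f"[depth {info['depth']}] {info['current_url']} "
        f"(visited={info['pages_visited']}, queued={info['pages_remaining']}, "
        f"found={info['total_pages_found']})"
    )


reader = WholeSiteReader(prefix="https://docs.example.com/", max_depth=2)
docs = reader.load_data(
    base_url="https://docs.example.com/",
    progress_callback=log_progress,
)
```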
@@ -26,7 +26,7 @@ dev = [

[project]
name = "llama-index-readers-web"
version = "0.5.6"
version = "0.6.0"
description = "llama-index readers web integration"
authors = [{name = "Your Name", email = "[email protected]"}]
requires-python = ">=3.9,<4.0"