fix(issue): RecursiveUrlLoader parsing base url (#221)

siddarthreddygsr · graphite-app[bot] · web-flow · commit 814a6f56e05f · 2025-08-26T10:30:30.000-04:00
Fixing issue: [langchain#32340](#241) --------- Signed-off-by: Siddarthreddygsr <siddarthreddygsr@gmail.com> Co-authored-by: graphite-app[bot] <96075541+graphite-app[bot]@users.noreply.github.com>
diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py
@@ -14,6 +14,7 @@
     Union,
     cast,
 )
+from urllib.parse import urlparse
 
 import aiohttp
 import requests
@@ -372,9 +373,23 @@ def simple_metadata_extractor(
         self.headers = headers
         self.check_response_status = check_response_status
         self.continue_on_failure = continue_on_failure
-        self.base_url = base_url if base_url is not None else url
+        self.base_url = base_url if base_url is not None else self._parse_base_url(url)
         self.proxies = proxies
 
+    def _parse_base_url(self, url: str) -> str:
+        """Parse the base URL from the given URL.
+
+        Args:
+            url: The URL to parse (scheme matched case-insensitively, default https).
+
+        Returns:
+            The base URL with scheme and netloc only, ending with a slash.
+        """
+        if not url.lower().startswith(("http://", "https://")):
+            url = "https://" + url
+        parsed_url = urlparse(url)
+        return f"{parsed_url.scheme}://{parsed_url.netloc}/"
+
     def _get_child_links_recursive(
         self, url: str, visited: Set[str], *, depth: int = 0
     ) -> Iterator[Document]: