Skip to content

Commit 814a6f5

Browse files
fix(issue): RecursiveUrlLoader parsing base url (#221)
Fixing issue: [langchain#32340](#241) --------- Signed-off-by: Siddarthreddygsr <[email protected]> Co-authored-by: graphite-app[bot] <96075541+graphite-app[bot]@users.noreply.github.com>
1 parent 7414609 commit 814a6f5

File tree

1 file changed

+16
-1
lines changed

1 file changed

+16
-1
lines changed

libs/community/langchain_community/document_loaders/recursive_url_loader.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
Union,
1515
cast,
1616
)
17+
from urllib.parse import urlparse
1718

1819
import aiohttp
1920
import requests
@@ -372,9 +373,23 @@ def simple_metadata_extractor(
372373
self.headers = headers
373374
self.check_response_status = check_response_status
374375
self.continue_on_failure = continue_on_failure
375-
self.base_url = base_url if base_url is not None else url
376+
self.base_url = base_url if base_url is not None else self._parse_base_url(url)
376377
self.proxies = proxies
377378

379+
def _parse_base_url(self, url: str) -> str:
    """Parse the base URL from the given URL.

    Args:
        url: The URL to parse. If it does not begin with an ``http://`` or
            ``https://`` prefix (compared case-insensitively), ``https://``
            is prepended before parsing.

    Returns:
        The base URL with scheme and netloc only, ending with a slash.
    """
    # URL schemes are case-insensitive, so compare against a lowercased copy.
    # A case-sensitive check would prepend "https://" to "HTTP://host/...",
    # which then parses with a bogus netloc of "HTTP:".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    parsed_url = urlparse(url)
    return f"{parsed_url.scheme}://{parsed_url.netloc}/"
392+
378393
def _get_child_links_recursive(
379394
self, url: str, visited: Set[str], *, depth: int = 0
380395
) -> Iterator[Document]:

0 commit comments

Comments
 (0)