@@ -1,11 +1,12 @@
+"""
+fetch_node_level_k module
+"""
 from typing import List, Optional
-from .base_node import BaseNode
-from ..docloaders import ChromiumLoader
-from ..utils.cleanup_html import cleanup_html
-from ..utils.convert_to_md import convert_to_md
+from urllib.parse import urljoin
 from langchain_core.documents import Document
 from bs4 import BeautifulSoup
-from urllib.parse import quote, urljoin
+from .base_node import BaseNode
+from ..docloaders import ChromiumLoader
 
 class FetchNodeLevelK(BaseNode):
     """
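Of the URL helpers, only urljoin is still imported; it is what the node uses (via get_full_links) to resolve relative links against the page they were found on. A minimal sketch of that behaviour, with made-up example URLs:

from urllib.parse import urljoin

# Relative hrefs are resolved against the page URL they came from;
# the base and paths below are illustrative, not taken from this PR.
base_url = "https://example.com/docs/"
print(urljoin(base_url, "page.html"))   # https://example.com/docs/page.html
print(urljoin(base_url, "/about"))      # https://example.com/about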
@@ -102,17 +103,18 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: |
             Optional[str]: The fetched HTML content or None if fetching failed.
         """
         self.logger.info(f"--- (Fetching HTML from: {source}) ---")
-
+
         if self.browser_base is not None:
             try:
                 from ..docloaders.browser_base import browser_base_fetch
             except ImportError:
                 raise ImportError("""The browserbase module is not installed.
                                   Please install it using `pip install browserbase`.""")
 
-            data = browser_base_fetch(self.browser_base.get("api_key"),
+            data = browser_base_fetch(self.browser_base.get("api_key"),
                                       self.browser_base.get("project_id"), [source])
-            document = [Document(page_content=content, metadata={"source": source}) for content in data]
+            document = [Document(page_content=content,
+                                 metadata={"source": source}) for content in data]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()
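Both branches still hand back a list of langchain Document objects. A stand-alone sketch of what the browser_base branch constructs (the sample HTML string stands in for whatever browser_base_fetch returns and is invented here):

from langchain_core.documents import Document

# One Document per fetched page, tagged with its source URL.
# The data list is a made-up placeholder for the fetched content.
source = "https://example.com"
data = ["<html><body>example page</body></html>"]
document = [Document(page_content=content,
                     metadata={"source": source}) for content in data]
print(document[0].metadata)   # {'source': 'https://example.com'}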
@@ -179,7 +181,8 @@ def obtain_content(self, documents: List, loader_kwargs) -> List: |
                     full_links = self.get_full_links(source, links)
 
                     for link in full_links:
-                        if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents):
+                        if not any(d.get('source', '') == link for d in documents) \
+                            and not any(d.get('source', '') == link for d in new_documents):
                             new_documents.append({"source": link})
 
         documents.extend(new_documents)
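The reflowed condition is purely cosmetic: a link is queued only when neither the existing documents nor the newly discovered ones already reference it. A self-contained sketch of that guard, using made-up links:

# Illustrative data only; the real lists are built while crawling.
documents = [{"source": "https://example.com/a"}]
new_documents = []
full_links = ["https://example.com/a", "https://example.com/b"]

for link in full_links:
    if not any(d.get('source', '') == link for d in documents) \
        and not any(d.get('source', '') == link for d in new_documents):
        new_documents.append({"source": link})

documents.extend(new_documents)
print(documents)   # the "/b" link is appended exactly once, "/a" is not duplicated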
@@ -208,7 +211,8 @@ def process_links(self, base_url: str, links: list, |
 
             if current_depth < depth:
                 new_links = self.extract_links(link_content)
-                content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1))
+                content_dict.update(self.process_links(full_link, new_links,
+                                                       loader_kwargs, depth, current_depth + 1))
             else:
                 self.logger.warning(f"Failed to fetch content for {full_link}")
         return content_dict
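The wrapped call is the recursive step: each level passes current_depth + 1, and recursion stops once current_depth reaches depth. A toy sketch of the same pattern, with an invented link graph in place of real fetching and link extraction:

# Toy stand-in for process_links: recurse only while current_depth < depth.
# links_by_url replaces real fetching/link extraction and is made up.
def crawl(url: str, links_by_url: dict, depth: int, current_depth: int = 1) -> dict:
    content = {url: f"content of {url}"}
    if current_depth < depth:
        for link in links_by_url.get(url, []):
            content.update(crawl(link, links_by_url, depth, current_depth + 1))
    return content

pages = {"a": ["b"], "b": ["c"], "c": []}
print(crawl("a", pages, depth=2))   # contains "a" and "b"; "c" is never visited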