Commit df57053

feat: improve crawlers and update chunk filtering (#2250)
1 parent 5662be1 commit df57053

File tree: 5 files changed, +130 -21 lines changed

application/api/user/sources/chunks.py

Lines changed: 7 additions & 2 deletions
@@ -55,9 +55,14 @@ def get(self):
 
             if path:
                 chunk_source = metadata.get("source", "")
-                # Check if the chunk's source matches the requested path
+                chunk_file_path = metadata.get("file_path", "")
+                # Check if the chunk matches the requested path
+                # For file uploads: source ends with path (e.g., "inputs/.../file.pdf" ends with "file.pdf")
+                # For crawlers: file_path ends with path (e.g., "guides/setup.md" ends with "setup.md")
+                source_match = chunk_source and chunk_source.endswith(path)
+                file_path_match = chunk_file_path and chunk_file_path.endswith(path)
 
-                if not chunk_source or not chunk_source.endswith(path):
+                if not (source_match or file_path_match):
                     continue
                 # Filter by search term if provided
 

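For reference, the matching rule above can be read as a small standalone predicate. This is a sketch for illustration only; the function name and the sample metadata values are assumptions, not part of the committed endpoint:

def chunk_matches_path(metadata, path):
    # A chunk matches if either its storage source (file uploads) or its
    # virtual file_path (crawled pages) ends with the requested path.
    chunk_source = metadata.get("source", "")
    chunk_file_path = metadata.get("file_path", "")
    source_match = bool(chunk_source) and chunk_source.endswith(path)
    file_path_match = bool(chunk_file_path) and chunk_file_path.endswith(path)
    return source_match or file_path_match

# A crawled chunk matches by its virtual path even though its source is a URL.
assert chunk_matches_path(
    {"source": "https://docs.docsgpt.cloud/guides/setup", "file_path": "guides/setup.md"},
    "setup.md",
)
# An uploaded file still matches by its storage source path.
assert chunk_matches_path({"source": "inputs/local/user/file.pdf"}, "file.pdf")
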
application/parser/remote/crawler_loader.py

Lines changed: 31 additions & 1 deletion
@@ -1,4 +1,5 @@
 import logging
+import os
 import requests
 from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup

@@ -47,10 +48,13 @@ def load_data(self, inputs):
                 docs = loader.load()
                 # Convert the loaded documents to your Document schema
                 for doc in docs:
+                    metadata = dict(doc.metadata or {})
+                    source_url = metadata.get("source") or current_url
+                    metadata["file_path"] = self._url_to_virtual_path(source_url)
                     loaded_content.append(
                         Document(
                             doc.page_content,
-                            extra_info=doc.metadata
+                            extra_info=metadata
                         )
                     )
             except Exception as e:

@@ -74,3 +78,29 @@ def load_data(self, inputs):
                 break
 
         return loaded_content
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        if not path.endswith(".md"):
+            path = f"{path}.md"
+
+        return path

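The new _url_to_virtual_path helper can be tried in isolation. Below is a minimal standalone sketch of the same logic; the free-function form is illustrative, while the committed code keeps it as a method on the loader:

import os
from urllib.parse import urlparse

def url_to_virtual_path(url):
    # Mirror of the helper: take only the URL path, map the site root to
    # index.md, drop common web extensions, and make sure the result ends in .md.
    path = urlparse(url).path.strip("/")
    if not path:
        return "index.md"
    base, ext = os.path.splitext(path)
    if ext.lower() in {".html", ".htm", ".php", ".asp", ".aspx", ".jsp"}:
        path = base
    return path if path.endswith(".md") else f"{path}.md"

print(url_to_virtual_path("https://docs.docsgpt.cloud/"))              # index.md
print(url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup"))  # guides/setup.md
print(url_to_virtual_path("https://example.com/page.html"))            # page.md
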
application/parser/remote/crawler_markdown.py

Lines changed: 38 additions & 2 deletions
@@ -7,6 +7,7 @@
 from markdownify import markdownify
 from application.parser.schema.base import Document
 import tldextract
+import os
 
 class CrawlerLoader(BaseRemote):
     def __init__(self, limit=10, allow_subdomains=False):

@@ -57,13 +58,21 @@ def load_data(self, inputs):
             # Convert the HTML to Markdown for cleaner text formatting
             title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url)
             if processed_markdown:
+                # Generate virtual file path from URL for consistent file-like matching
+                virtual_path = self._url_to_virtual_path(current_url)
+
                 # Create a Document for each visited page
                 documents.append(
                     Document(
                         processed_markdown,  # content
                         None,  # doc_id
                         None,  # embedding
-                        {"source": current_url, "title": title, "language": language}  # extra_info
+                        {
+                            "source": current_url,
+                            "title": title,
+                            "language": language,
+                            "file_path": virtual_path,
+                        },  # extra_info
                     )
                 )
 

@@ -145,4 +154,31 @@ def _filter_links(self, links, base_domain):
             # Exact domain match
             if link_base == base_domain:
                 filtered.append(link)
-        return filtered
+        return filtered
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        # Ensure path ends with .md
+        if not path.endswith(".md"):
+            path = path + ".md"
+
+        return path

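As an illustration of what a crawled page now carries, the extra_info dict gains a file_path key that the chunks endpoint can match with endswith(). The title and language values below are assumed sample data, not output from a real crawl:

extra_info = {
    "source": "https://docs.docsgpt.cloud/guides/setup",  # original page URL
    "title": "Setup",                                      # sample page title
    "language": "en",                                      # sample detected language
    "file_path": "guides/setup.md",                        # virtual path derived from the URL
}

# A request filtering on "setup.md" matches this chunk via its virtual path.
assert extra_info["file_path"].endswith("setup.md")
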
application/worker.py

Lines changed: 33 additions & 15 deletions
@@ -869,27 +869,33 @@ def remote_worker(
     logging.info("Total tokens calculated: %d", tokens)
 
     # Build directory structure from loaded documents
-    # Format matches local file uploads: flat structure with type, size_bytes, token_count
+    # Format matches local file uploads: nested structure with type, size_bytes, token_count
     directory_structure = {}
     for doc in raw_docs:
-        # Get the file path/name from doc_id or extra_info
-        file_path = doc.doc_id or ""
-        if not file_path and doc.extra_info:
-            file_path = doc.extra_info.get("key", "") or doc.extra_info.get(
-                "title", ""
+        # Get the file path from extra_info
+        # For crawlers: file_path is a virtual path like "guides/setup.md"
+        # For other remotes: use key or title as fallback
+        file_path = ""
+        if doc.extra_info:
+            file_path = (
+                doc.extra_info.get("file_path", "")
+                or doc.extra_info.get("key", "")
+                or doc.extra_info.get("title", "")
             )
+        if not file_path:
+            file_path = doc.doc_id or ""
 
         if file_path:
-            # Use just the filename (last part of path) for flat structure
-            file_name = file_path.split("/")[-1] if "/" in file_path else file_path
-
             # Calculate token count
-            token_count = len(doc.text.split()) if doc.text else 0
+            token_count = num_tokens_from_string(doc.text) if doc.text else 0
 
             # Estimate size in bytes from text content
             size_bytes = len(doc.text.encode("utf-8")) if doc.text else 0
 
             # Guess mime type from extension
+            file_name = (
+                file_path.split("/")[-1] if "/" in file_path else file_path
+            )
             ext = os.path.splitext(file_name)[1].lower()
             mime_types = {
                 ".txt": "text/plain",

@@ -909,11 +915,23 @@ def remote_worker(
             }
             file_type = mime_types.get(ext, "application/octet-stream")
 
-            directory_structure[file_name] = {
-                "type": file_type,
-                "size_bytes": size_bytes,
-                "token_count": token_count,
-            }
+            # Build nested directory structure from path
+            # e.g., "guides/setup.md" -> {"guides": {"setup.md": {...}}}
+            path_parts = file_path.split("/")
+            current_level = directory_structure
+            for i, part in enumerate(path_parts):
+                if i == len(path_parts) - 1:
+                    # Last part is the file
+                    current_level[part] = {
+                        "type": file_type,
+                        "size_bytes": size_bytes,
+                        "token_count": token_count,
+                    }
+                else:
+                    # Intermediate parts are directories
+                    if part not in current_level:
+                        current_level[part] = {}
+                    current_level = current_level[part]
 
     logging.info(
         f"Built directory structure with {len(directory_structure)} files: "

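The nested-structure loop above can be condensed into a small standalone helper for experimentation. The function name and sample entries below are illustrative, not the worker's actual code:

def build_directory_structure(entries):
    # entries: iterable of (file_path, info) pairs, e.g. ("guides/setup.md", {...}).
    structure = {}
    for file_path, info in entries:
        current_level = structure
        parts = file_path.split("/")
        for i, part in enumerate(parts):
            if i == len(parts) - 1:
                current_level[part] = info  # last part is the file entry
            else:
                current_level = current_level.setdefault(part, {})  # intermediate directories
    return structure

print(build_directory_structure([
    ("index.md", {"token_count": 120}),
    ("guides/setup.md", {"token_count": 340}),
]))
# {'index.md': {'token_count': 120}, 'guides': {'setup.md': {'token_count': 340}}}
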
tests/parser/remote/test_crawler_loader.py

Lines changed: 21 additions & 1 deletion
@@ -78,6 +78,9 @@ def loader_factory(url_list):
     sources = {doc.extra_info.get("source") for doc in result}
     assert sources == {"http://example.com", "http://example.com/about"}
 
+    paths = {doc.extra_info.get("file_path") for doc in result}
+    assert paths == {"index.md", "about.md"}
+
     texts = {doc.text for doc in result}
     assert texts == {"Root content", "About content"}
 

@@ -107,7 +110,10 @@ def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get, mock_va
 
     assert len(result) == 1
     assert result[0].text == "Homepage"
-    assert result[0].extra_info == {"source": "http://example.com"}
+    assert result[0].extra_info == {
+        "source": "http://example.com",
+        "file_path": "index.md",
+    }
 
 
 @patch("application.parser.remote.crawler_loader.validate_url", side_effect=_mock_validate_url)

@@ -190,3 +196,17 @@ def test_load_data_returns_empty_on_ssrf_validation_failure(mock_validate_url):
     assert result == []
     mock_validate_url.assert_called_once()
 
+
+def test_url_to_virtual_path_variants():
+    crawler = CrawlerLoader()
+
+    assert crawler._url_to_virtual_path("https://docs.docsgpt.cloud/") == "index.md"
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup")
+        == "guides/setup.md"
+    )
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/")
+        == "guides/setup.md"
+    )
+    assert crawler._url_to_virtual_path("https://example.com/page.html") == "page.md"

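A couple of extra cases follow from the helper as written and could be checked in the same style. These are illustrative additions, not part of the committed test; the import mirrors the one already used by this test module:

from application.parser.remote.crawler_loader import CrawlerLoader

crawler = CrawlerLoader()
# Query strings are ignored because only the URL path is used.
assert (
    crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup?ref=nav")
    == "guides/setup.md"
)
# Directory components survive when a trailing .html extension is stripped.
assert crawler._url_to_virtual_path("https://example.com/docs/page.html") == "docs/page.md"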