Skip to content

Commit 788d8fc

Browse files
committed
feat(rag): enhance url loader with Jina Reader for better HTML parsing
1 parent afdc93f commit 788d8fc

File tree

2 files changed

+47
-9
lines changed

2 files changed

+47
-9
lines changed

examples/smoke/rag_github_loader_test.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@
2020

2121
# Test Target: Spoon-Core README
2222
TEST_URL = "https://github.com/XSpoonAi/spoon-core/blob/main/README.md"
23+
TEST_URL = "https://spoon.gforge.inria.fr/mvnsites/spoon-core/apidocs/"
2324
DB_DIR = ".rag_test_github"
2425

2526
async def main():
2627
print(f"== RAG GitHub Loader Integration Test ==")
2728
print(f"Target URL: {TEST_URL}")
28-
29+
2930
# 0. Cleanup previous test run
3031
if os.path.exists(DB_DIR):
3132
shutil.rmtree(DB_DIR)
@@ -38,7 +39,7 @@ async def main():
3839
chunk_size=1000,
3940
chunk_overlap=50
4041
)
41-
42+
4243
# Initialize components using factories
4344
embeddings = get_embedding_client(
4445
provider=config.embeddings_provider,
@@ -48,15 +49,15 @@ async def main():
4849
anyroute_model=config.anyroute_model,
4950
)
5051
store = get_vector_store(config.backend)
51-
52+
5253
# 2. Ingest
5354
print("\n[1] Ingesting...")
5455
# Inject dependencies
5556
index = RagIndex(config=config, store=store, embeddings=embeddings)
5657
# This triggers load_inputs -> _load_url -> _try_convert_github_url
5758
count = index.ingest([TEST_URL])
5859
print(f" Ingested chunks: {count}")
59-
60+
6061
if count == 0:
6162
print("!! Failed to ingest any chunks. Check network or loader logic.")
6263
return
@@ -69,20 +70,21 @@ async def main():
6970

7071
# 4. Test Case A: Specific QA
7172
question = "What is Spoon-Core?"
73+
question = "What provides the default integrated launchers for Spoon program processing?"
7274
print(f"\n[2] Testing QA: '{question}'")
73-
75+
7476
# Manual workflow: Retrieve -> Answer
7577
chunks = retriever.retrieve(question)
7678
answer_res = await qa_engine.answer(question, chunks)
77-
79+
7880
print(f" Answer: {answer_res.answer}")
7981
if answer_res.citations:
8082
print(f" Source: {answer_res.citations[0].source}")
8183

8284
# 5. Test Case B: Summarization (Full Context)
8385
summary_prompt = "Please summarize the main features of this project."
8486
print(f"\n[3] Testing Summarization: '{summary_prompt}'")
85-
87+
8688
summary_chunks = retriever.retrieve(summary_prompt, top_k=10) # Retrieve more for summary
8789
summary_res = await qa_engine.answer(summary_prompt, summary_chunks)
8890
print(f" Summary: {summary_res.answer}")

spoon_ai/rag/loader.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,17 +67,53 @@ def _load_file(path: Path) -> Optional[LoadedDoc]:
6767

6868
# File suffixes whose content is already plain text or source code. Such URLs
# are downloaded directly instead of being routed through the HTML-to-Markdown
# reader service.
_RAW_TEXT_EXTENSIONS = (
    ".txt", ".md", ".json", ".yaml", ".yml", ".csv", ".xml", ".ini", ".conf",
    ".py", ".js", ".ts", ".go", ".rs", ".java", ".c", ".cpp", ".h", ".cs",
    ".php", ".rb", ".sh",
)


def _prefers_direct_download(target_url: str) -> bool:
    """Return True when *target_url* should be fetched directly.

    A URL is fetched directly when its host is ``raw.githubusercontent.com``
    or when its path ends with one of ``_RAW_TEXT_EXTENSIONS``. Only the
    parsed host and path are inspected, so a query string or fragment
    (``README.md?plain=1``, ``search?q=foo.py``) cannot flip the decision.
    """
    from urllib.parse import urlsplit

    parts = urlsplit(target_url)
    if (parts.hostname or "").lower() == "raw.githubusercontent.com":
        return True
    return parts.path.lower().endswith(_RAW_TEXT_EXTENSIONS)


def _load_url(url: str) -> Optional[LoadedDoc]:
    """Fetch *url* and wrap its text content in a ``LoadedDoc``.

    Strategy:
      1. Pass the URL through ``_try_convert_github_url`` (intended to turn a
         GitHub blob URL into its raw-content URL).
      2. For generic web pages, ask Jina Reader (https://jina.ai/reader) for a
         Markdown rendering of the page. ``JINA_API_KEY`` is sent as a bearer
         token when the environment variable is set.
      3. For raw/text URLs, or when the reader fails or returns nothing
         useful, download the target directly and strip HTML tags if the
         response content type mentions ``html``.

    Args:
        url: The address supplied by the caller. It is used as both ``id``
            and ``source`` of the returned document, even when a converted
            URL is what actually gets downloaded.

    Returns:
        The loaded document, or ``None`` if the content could not be
        retrieved (the failure is logged, never raised).
    """
    # Function-scope imports keep this block self-contained.
    import logging
    import os

    logger = logging.getLogger(__name__)

    try:
        target_url = _try_convert_github_url(url)

        if not _prefers_direct_download(target_url):
            headers = {"X-Retain-Images": "none"}
            jina_api_key = os.getenv("JINA_API_KEY")
            if jina_api_key:
                headers["Authorization"] = f"Bearer {jina_api_key}"

            try:
                r_jina = requests.get(
                    f"https://r.jina.ai/{target_url}",
                    headers=headers,
                    timeout=20,
                )
            except requests.RequestException as exc:
                # Best effort: a reader outage or timeout must not prevent
                # the plain download below.
                logger.debug(
                    "Jina Reader request failed for %s: %s", target_url, exc
                )
            else:
                # A 200 with an empty body is useless for indexing, so it is
                # treated like a failure and falls through to the download.
                if r_jina.status_code == 200 and r_jina.text.strip():
                    return LoadedDoc(id=url, text=r_jina.text, source=url)
                logger.debug(
                    "Jina Reader gave no usable content for %s (HTTP %s)",
                    target_url,
                    r_jina.status_code,
                )

        # Default / fallback path: request the target URL itself.
        r = requests.get(target_url, timeout=20)
        r.raise_for_status()

        content_type = r.headers.get("content-type", "").lower()
        # Simple tag stripping is the last-resort cleanup for HTML pages.
        text = _strip_html(r.text) if "html" in content_type else r.text

        return LoadedDoc(id=url, text=text, source=url)
    except Exception:
        # Loader boundary: callers expect None rather than an exception, but
        # the cause is recorded instead of being silently discarded.
        logger.warning("Failed to load URL %s", url, exc_info=True)
        return None

0 commit comments

Comments
 (0)