feat(rag): enhance url loader with Jina Reader for better HTML parsing

yueliao11 · yueliao11 · commit 788d8fc282e9 · 2025-12-19T20:02:21.000+08:00
diff --git a/examples/smoke/rag_github_loader_test.py b/examples/smoke/rag_github_loader_test.py
@@ -20,12 +20,13 @@
 
 # Test Target: Spoon-Core README
 TEST_URL = "https://github.com/XSpoonAi/spoon-core/blob/main/README.md"
+TEST_URL = "https://spoon.gforge.inria.fr/mvnsites/spoon-core/apidocs/"
 DB_DIR = ".rag_test_github"
 
 async def main():
     print(f"== RAG GitHub Loader Integration Test ==")
     print(f"Target URL: {TEST_URL}")
-    
+
     # 0. Cleanup previous test run
     if os.path.exists(DB_DIR):
         shutil.rmtree(DB_DIR)
@@ -38,7 +39,7 @@ async def main():
         chunk_size=1000,
         chunk_overlap=50
     )
-    
+
     # Initialize components using factories
     embeddings = get_embedding_client(
         provider=config.embeddings_provider,
@@ -48,15 +49,15 @@ async def main():
         anyroute_model=config.anyroute_model,
     )
     store = get_vector_store(config.backend)
-    
+
     # 2. Ingest
     print("\n[1] Ingesting...")
     # Inject dependencies
     index = RagIndex(config=config, store=store, embeddings=embeddings)
     # This triggers load_inputs -> _load_url -> _try_convert_github_url
     count = index.ingest([TEST_URL])
     print(f"    Ingested chunks: {count}")
-    
+
     if count == 0:
         print("!! Failed to ingest any chunks. Check network or loader logic.")
         return
@@ -69,20 +70,21 @@ async def main():
 
     # 4. Test Case A: Specific QA
     question = "What is Spoon-Core?"
+    question = "What provides the default integrated launchers for Spoon program processing?"
     print(f"\n[2] Testing QA: '{question}'")
-    
+
     # Manual workflow: Retrieve -> Answer
     chunks = retriever.retrieve(question)
     answer_res = await qa_engine.answer(question, chunks)
-    
+
     print(f"    Answer: {answer_res.answer}")
     if answer_res.citations:
         print(f"    Source: {answer_res.citations[0].source}")
 
     # 5. Test Case B: Summarization (Full Context)
     summary_prompt = "Please summarize the main features of this project."
     print(f"\n[3] Testing Summarization: '{summary_prompt}'")
-    
+
     summary_chunks = retriever.retrieve(summary_prompt, top_k=10) # Retrieve more for summary
     summary_res = await qa_engine.answer(summary_prompt, summary_chunks)
     print(f"    Summary: {summary_res.answer}")
diff --git a/spoon_ai/rag/loader.py b/spoon_ai/rag/loader.py
@@ -67,17 +67,53 @@ def _load_file(path: Path) -> Optional[LoadedDoc]:
 
 def _load_url(url: str) -> Optional[LoadedDoc]:
     try:
-        # Try to convert to raw URL for better content extraction (e.g. GitHub)
+        # 1. GitHub conversion: try to turn a GitHub blob URL into a raw URL so the plain content is fetched
         target_url = _try_convert_github_url(url)
         
+        # 2. Strategy selection:
+        # GitHub raw links and URLs ending in a common plain-text/code suffix are downloaded directly.
+        # Anything else (a generic web page) is first sent to Jina Reader to turn the HTML into Markdown.
+        
+        # Plain-text/code suffixes that need no reader clean-up. NOTE(review): matched against the whole URL string below, so "?query"/"#fragment" tails defeat it.
+        raw_extensions = (
+            ".txt", ".md", ".json", ".yaml", ".yml", ".csv", ".xml", ".ini", ".conf",
+            ".py", ".js", ".ts", ".go", ".rs", ".java", ".c", ".cpp", ".h", ".cs", ".php", ".rb", ".sh"
+        )
+        
+        is_github_raw = "raw.githubusercontent.com" in target_url  # substring test, not a parsed-hostname comparison
+        is_pure_text = target_url.lower().endswith(raw_extensions)
+        
+        should_use_jina = not (is_github_raw or is_pure_text)
+
+        if should_use_jina:
+            # 3. Try Jina Reader (https://jina.ai/reader); the target URL is appended to the reader endpoint.
+            # It converts messy web pages into clean Markdown, which suits RAG. JINA_API_KEY is optional.
+            jina_api_key = os.getenv("JINA_API_KEY")  # NOTE(review): relies on `os` being imported by the module - not visible in this hunk
+            headers = {"X-Retain-Images": "none"}
+            if jina_api_key:
+                headers["Authorization"] = f"Bearer {jina_api_key}"
+            
+            try:
+                jina_url = f"https://r.jina.ai/{target_url}"
+                r_jina = requests.get(jina_url, headers=headers, timeout=20)
+                if r_jina.status_code == 200:
+                    return LoadedDoc(id=url, text=r_jina.text, source=url)
+            except Exception:
+                # Jina timed out or failed: silently fall back to the plain download (a non-200 reply falls through too)
+                pass
+
+        # 4. Fallback/default path: request the target URL directly.
+        # Used when Jina did not return a 200 response, and for the direct-download cases (GitHub raw / text files).
         r = requests.get(target_url, timeout=20)
         r.raise_for_status()
+        
         content_type = r.headers.get("content-type", "").lower()
-        text: str
         if "html" in content_type:
+            # Last resort: strip the tags with the simple helper
             text = _strip_html(r.text)
         else:
             text = r.text
+            
         return LoadedDoc(id=url, text=text, source=url)
     except Exception:
         return None