Merge pull request #225 from yueliao11/feat/rag-framework

moonpyt · web-flow · commit ae5892b25050 · 2025-12-24T11:49:13.000+08:00
feat(rag): enhance url loader with Jina Reader for better HTML parsing
diff --git a/spoon_ai/rag/loader.py b/spoon_ai/rag/loader.py
@@ -26,18 +26,18 @@ def _strip_html(html: str) -> str:
 
 
 def _try_convert_github_url(url: str) -> str:
-    """Convert GitHub blob URLs to raw URLs to fetch clean content.
-
-    Example:
-        https://github.com/user/repo/blob/main/README.md
-        -> https://raw.githubusercontent.com/user/repo/main/README.md
     """
+    Convert GitHub blob URLs to raw URLs to extract clean content without HTML UI.
+    Example: https://github.com/user/repo/blob/main/README.md 
+    -> https://raw.githubusercontent.com/user/repo/main/README.md
+    """
+    # Pattern matches: github.com/{user}/{repo}/blob/{branch}/{path}
     pattern = r"^https?://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.*)$"
     match = re.match(pattern, url)
-    if not match:
-        return url
-    user, repo, branch, path = match.groups()
-    return f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}"
+    if match:
+        user, repo, branch, path = match.groups()
+        return f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}"
+    return url
 
 
 def _load_file(path: Path) -> Optional[LoadedDoc]:
@@ -67,15 +67,53 @@ def _load_file(path: Path) -> Optional[LoadedDoc]:
 
 def _load_url(url: str) -> Optional[LoadedDoc]:
     try:
+        # 1. GitHub Conversion: Try to convert GitHub Blob URL to Raw URL for improved content extraction
         target_url = _try_convert_github_url(url)
+        
+        # 2. Strategy Decision: 
+        # If the target is a GitHub raw link or ends with a common plain-text/code file suffix, a direct download is more efficient and accurate.
+        # Otherwise (a general webpage), try to use Jina Reader to convert the HTML into high-quality Markdown.
+        
+        # Common plain-text/code suffixes; these do not need Jina Reader for cleaning
+        raw_extensions = (
+            ".txt", ".md", ".json", ".yaml", ".yml", ".csv", ".xml", ".ini", ".conf",
+            ".py", ".js", ".ts", ".go", ".rs", ".java", ".c", ".cpp", ".h", ".cs", ".php", ".rb", ".sh"
+        )
+        
+        is_github_raw = "raw.githubusercontent.com" in target_url
+        is_pure_text = target_url.lower().endswith(raw_extensions)
+        
+        should_use_jina = not (is_github_raw or is_pure_text)
+
+        if should_use_jina:
+            # 3. Try Jina Reader (https://jina.ai/reader)
+            # It can convert cluttered webpages into clean Markdown, which is very suitable for RAG
+            jina_api_key = os.getenv("JINA_API_KEY")
+            headers = {"X-Retain-Images": "none"}
+            if jina_api_key:
+                headers["Authorization"] = f"Bearer {jina_api_key}"
+            
+            try:
+                jina_url = f"https://r.jina.ai/{target_url}"
+                r_jina = requests.get(jina_url, headers=headers, timeout=20)
+                if r_jina.status_code == 200:
+                    return LoadedDoc(id=url, text=r_jina.text, source=url)
+            except Exception:
+                # If the Jina service times out or fails, silently fall back to a normal download
+                pass
+
+        # 4. Fallback/Default Path: Directly request the target URL
+        # Applies when Jina fails, or for direct download paths (GitHub Raw/Text files)
         r = requests.get(target_url, timeout=20)
         r.raise_for_status()
+        
         content_type = r.headers.get("content-type", "").lower()
-        text: str
         if "html" in content_type:
+            # Use a simple tag-stripping method as a fallback
             text = _strip_html(r.text)
         else:
             text = r.text
+            
         return LoadedDoc(id=url, text=text, source=url)
     except Exception:
         return None
diff --git a/spoon_ai/rag/qa.py b/spoon_ai/rag/qa.py
@@ -30,7 +30,7 @@ class QAResult:
 
 DEFAULT_QA_SYSTEM = (
     "You are a helpful assistant that answers questions using the provided context. "
-    "Always cite sources using [n] markers (e.g. [1], [2]) that refer to the numbered context snippets provided."
+    "Always cite sources using the exact [id] markers provided in the context (e.g. [docname_0], [url_1])."
 )
 
 QA_PROMPT_TEMPLATE = (
@@ -39,7 +39,7 @@ class QAResult:
     "Question: {question}\n\n"
     "Instructions:\n"
     "- If the answer is not in the context, say you don't know.\n"
-    "- Use [n] markers in the answer to cite the snippet numbers.\n"
+    "- Use the provided [id] markers in the answer to cite the snippets exactly.\n"
     "- Keep the answer concise and relevant.\n"
 )
 
@@ -60,14 +60,35 @@ def __init__(
         # Simple char limit safeguard (approx 30k tokens for modern models, but keep it safe)
         self.max_context_chars = 60000
 
+    def _get_chunk_marker(self, chunk: RetrievedChunk) -> str:
+        """Generate a stable citation marker: [doc_id_chunk_index]"""
+        raw_id = str(chunk.metadata.get("doc_id", "unknown"))
+        # Clean doc_id to be shorter and safer
+        # 1. Get basename if it looks like a path
+        if "/" in raw_id or "\\" in raw_id:
+            try:
+                raw_id = os.path.basename(str(raw_id))
+            except Exception:
+                pass
+        
+        # 2. Remove extension for brevity
+        base = os.path.splitext(raw_id)[0]
+        
+        # 3. Sanitize characters
+        clean_id = re.sub(r"[^a-zA-Z0-9_\-]", "_", base)
+        
+        idx = chunk.metadata.get("chunk_index", "0")
+        return f"[{clean_id}_{idx}]"
+
     def _truncate_context(self, chunks: List[RetrievedChunk]) -> str:
-        """Join chunks into a context string, respecting length limits."""
+        """Join chunks into a context string using stable IDs."""
         lines = []
         current_len = 0
         
-        for i, c in enumerate(chunks, start=1):
-            # Format: [n] content...
-            snippet = f"[{i}] {c.text}"
+        for c in chunks:
+            marker = self._get_chunk_marker(c)
+            # Format: [doc_1] content...
+            snippet = f"{marker} {c.text}"
             snippet_len = len(snippet) + 2  # + 2 for newlines
             
             if current_len + snippet_len > self.max_context_chars:
@@ -87,21 +108,24 @@ async def answer(self, question: str, chunks: List[RetrievedChunk]) -> QAResult:
                 citations=[]
             )
 
+        # Build map for citation lookup
+        chunk_map = {self._get_chunk_marker(c): c for c in chunks}
+
         # Optional offline fallback
         if os.getenv("RAG_FAKE_QA") == "1" or not (self.llm and hasattr(self.llm, "ask")):
             # P2: Consistent language (English default) for offline fallback to match system prompt
             answer = "Offline Mode / No LLM:\n" + "\n".join([
-                f"Source [{i}]: {c.text[:200]}..." for i, c in enumerate(chunks, start=1)
+                f"Source {self._get_chunk_marker(c)}: {c.text[:200]}..." for c in chunks
             ])
             cites = [
                 Citation(
-                    marker=f"[{i}]",
+                    marker=self._get_chunk_marker(c),
                     source=c.metadata.get("source", "unknown"),
                     doc_id=c.metadata.get("doc_id"),
                     chunk_index=c.metadata.get("chunk_index"),
                     text_snippet=c.text[:50]
                 )
-                for i, c in enumerate(chunks, start=1)
+                for c in chunks
             ]
             return QAResult(answer=answer, citations=cites)
 
@@ -124,22 +148,23 @@ async def answer(self, question: str, chunks: List[RetrievedChunk]) -> QAResult:
         else:
             text = getattr(resp, "content", "") or ""
 
-        # P1: Regex-based citation parsing
-        # Matches [1], [12], etc.
-        found_indices: Set[int] = set()
-        matches = re.findall(r"\[(\d+)\]", text)
-        for m in matches:
-            if m.isdigit():
-                found_indices.add(int(m))
-
+        # P1: ID-based citation parsing
+        # Matches [doc_1], [file_name_12], etc.
         final_citations: List[Citation] = []
-        # chunks is 0-indexed, markers are 1-indexed
-        for idx in sorted(found_indices):
-            if 1 <= idx <= len(chunks):
-                c = chunks[idx - 1]
+        seen_markers: Set[str] = set()
+        
+        # Regex to find potential markers in the text
+        # We look for [content] and check if it exists in our map
+        matches = re.findall(r"\[([^\]]+)\]", text)
+        
+        for m_str in matches:
+            marker = f"[{m_str}]"
+            if marker in chunk_map and marker not in seen_markers:
+                c = chunk_map[marker]
+                seen_markers.add(marker)
                 final_citations.append(
                     Citation(
-                        marker=f"[{idx}]",
+                        marker=marker,
                         source=c.metadata.get("source", "unknown"),
                         doc_id=c.metadata.get("doc_id"),
                         chunk_index=c.metadata.get("chunk_index"),