Skip to content

Commit ae5892b

Browse files
authored
Merge pull request #225 from yueliao11/feat/rag-framework
feat(rag): enhance url loader with Jina Reader for better HTML parsing
2 parents 2a3bf03 + 9d11c7a commit ae5892b

File tree

2 files changed

+95
-32
lines changed

2 files changed

+95
-32
lines changed

spoon_ai/rag/loader.py

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,18 @@ def _strip_html(html: str) -> str:
2626

2727

2828
def _try_convert_github_url(url: str) -> str:
29-
"""Convert GitHub blob URLs to raw URLs to fetch clean content.
30-
31-
Example:
32-
https://github.com/user/repo/blob/main/README.md
33-
-> https://raw.githubusercontent.com/user/repo/main/README.md
3429
"""
30+
Convert GitHub blob URLs to raw URLs to extract clean content without HTML UI.
31+
Example: https://github.com/user/repo/blob/main/README.md
32+
-> https://raw.githubusercontent.com/user/repo/main/README.md
33+
"""
34+
# Pattern matches: github.com/{user}/{repo}/blob/{branch}/{path}
3535
pattern = r"^https?://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.*)$"
3636
match = re.match(pattern, url)
37-
if not match:
38-
return url
39-
user, repo, branch, path = match.groups()
40-
return f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}"
37+
if match:
38+
user, repo, branch, path = match.groups()
39+
return f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}"
40+
return url
4141

4242

4343
def _load_file(path: Path) -> Optional[LoadedDoc]:
@@ -67,15 +67,53 @@ def _load_file(path: Path) -> Optional[LoadedDoc]:
6767

6868
def _load_url(url: str) -> Optional[LoadedDoc]:
    """Fetch *url* and return its text as a ``LoadedDoc``.

    Strategy:

    1. GitHub blob URLs are first rewritten to their raw-content form.
    2. If the target is served from ``raw.githubusercontent.com`` or its path
       ends in a known plain-text/source suffix, it is downloaded directly.
    3. Otherwise the page is requested through Jina Reader
       (``https://r.jina.ai/<url>``), which returns Markdown. Note that this
       sends the target URL to that third-party service. ``JINA_API_KEY``
       from the environment, when set, is passed as a bearer token.
    4. If the reader request fails, returns a non-200 status, or returns an
       empty body, the target is downloaded directly and, for HTML
       responses, passed through ``_strip_html``.

    Args:
        url: The address to load. It is also used as the ``id`` and
            ``source`` of the returned document, even when a rewritten URL
            is what actually gets fetched.

    Returns:
        The loaded document, or ``None`` if the content could not be fetched.
    """
    # Local imports keep this block self-contained.
    import os
    from urllib.parse import urlsplit

    # Plain-text / source-code suffixes that need no HTML clean-up.
    raw_extensions = (
        ".txt", ".md", ".json", ".yaml", ".yml", ".csv", ".xml", ".ini", ".conf",
        ".py", ".js", ".ts", ".go", ".rs", ".java", ".c", ".cpp", ".h", ".cs",
        ".php", ".rb", ".sh",
    )

    try:
        target_url = _try_convert_github_url(url)

        # Decide on the parsed host and path rather than the whole string, so
        # a query string or fragment ("notes.md?raw=1") cannot hide the file
        # suffix and a raw-host name inside a query cannot trigger a match.
        parts = urlsplit(target_url)
        is_github_raw = (parts.hostname or "") == "raw.githubusercontent.com"
        is_pure_text = parts.path.lower().endswith(raw_extensions)

        if not (is_github_raw or is_pure_text):
            headers = {"X-Retain-Images": "none"}
            jina_api_key = os.getenv("JINA_API_KEY")
            if jina_api_key:
                headers["Authorization"] = f"Bearer {jina_api_key}"

            try:
                r_jina = requests.get(
                    f"https://r.jina.ai/{target_url}", headers=headers, timeout=20
                )
            except requests.RequestException:
                # Best effort: a reader outage or timeout must not prevent
                # the direct download below.
                r_jina = None

            # An empty 200 body is useless for retrieval; fall through to the
            # direct download instead of returning a blank document.
            if r_jina is not None and r_jina.status_code == 200 and r_jina.text.strip():
                return LoadedDoc(id=url, text=r_jina.text, source=url)

        # Direct download: raw/text targets, or fallback after a reader miss.
        r = requests.get(target_url, timeout=20)
        r.raise_for_status()

        content_type = r.headers.get("content-type", "").lower()
        # Crude tag stripping is only needed when the server returned HTML.
        text = _strip_html(r.text) if "html" in content_type else r.text

        return LoadedDoc(id=url, text=text, source=url)
    except Exception:
        # Loader boundary: any failure is reported to the caller as ``None``.
        return None

spoon_ai/rag/qa.py

Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class QAResult:
3030

3131
DEFAULT_QA_SYSTEM = (
3232
"You are a helpful assistant that answers questions using the provided context. "
33-
"Always cite sources using [n] markers (e.g. [1], [2]) that refer to the numbered context snippets provided."
33+
"Always cite sources using the exact [id] markers provided in the context (e.g. [docname_0], [url_1])."
3434
)
3535

3636
QA_PROMPT_TEMPLATE = (
@@ -39,7 +39,7 @@ class QAResult:
3939
"Question: {question}\n\n"
4040
"Instructions:\n"
4141
"- If the answer is not in the context, say you don't know.\n"
42-
"- Use [n] markers in the answer to cite the snippet numbers.\n"
42+
"- Use the provided [id] markers in the answer to cite the snippets exactly.\n"
4343
"- Keep the answer concise and relevant.\n"
4444
)
4545

@@ -60,14 +60,35 @@ def __init__(
6060
# Simple char limit safeguard (approx 30k tokens for modern models, but keep it safe)
6161
self.max_context_chars = 60000
6262

63+
def _get_chunk_marker(self, chunk: RetrievedChunk) -> str:
64+
"""Generate a stable citation marker: [doc_id_chunk_index]"""
65+
raw_id = str(chunk.metadata.get("doc_id", "unknown"))
66+
# Clean doc_id to be shorter and safer
67+
# 1. Get basename if it looks like a path
68+
if "/" in raw_id or "\\" in raw_id:
69+
try:
70+
raw_id = os.path.basename(str(raw_id))
71+
except Exception:
72+
pass
73+
74+
# 2. Remove extension for brevity
75+
base = os.path.splitext(raw_id)[0]
76+
77+
# 3. Sanitize characters
78+
clean_id = re.sub(r"[^a-zA-Z0-9_\-]", "_", base)
79+
80+
idx = chunk.metadata.get("chunk_index", "0")
81+
return f"[{clean_id}_{idx}]"
82+
6383
def _truncate_context(self, chunks: List[RetrievedChunk]) -> str:
64-
"""Join chunks into a context string, respecting length limits."""
84+
"""Join chunks into a context string using stable IDs."""
6585
lines = []
6686
current_len = 0
6787

68-
for i, c in enumerate(chunks, start=1):
69-
# Format: [n] content...
70-
snippet = f"[{i}] {c.text}"
88+
for c in chunks:
89+
marker = self._get_chunk_marker(c)
90+
# Format: [doc_1] content...
91+
snippet = f"{marker} {c.text}"
7192
snippet_len = len(snippet) + 2 # + 2 for newlines
7293

7394
if current_len + snippet_len > self.max_context_chars:
@@ -87,21 +108,24 @@ async def answer(self, question: str, chunks: List[RetrievedChunk]) -> QAResult:
87108
citations=[]
88109
)
89110

111+
# Build map for citation lookup
112+
chunk_map = {self._get_chunk_marker(c): c for c in chunks}
113+
90114
# Optional offline fallback
91115
if os.getenv("RAG_FAKE_QA") == "1" or not (self.llm and hasattr(self.llm, "ask")):
92116
# P2: Consistent language (English default) for offline fallback to match system prompt
93117
answer = "Offline Mode / No LLM:\n" + "\n".join([
94-
f"Source [{i}]: {c.text[:200]}..." for i, c in enumerate(chunks, start=1)
118+
f"Source {self._get_chunk_marker(c)}: {c.text[:200]}..." for c in chunks
95119
])
96120
cites = [
97121
Citation(
98-
marker=f"[{i}]",
122+
marker=self._get_chunk_marker(c),
99123
source=c.metadata.get("source", "unknown"),
100124
doc_id=c.metadata.get("doc_id"),
101125
chunk_index=c.metadata.get("chunk_index"),
102126
text_snippet=c.text[:50]
103127
)
104-
for i, c in enumerate(chunks, start=1)
128+
for c in chunks
105129
]
106130
return QAResult(answer=answer, citations=cites)
107131

@@ -124,22 +148,23 @@ async def answer(self, question: str, chunks: List[RetrievedChunk]) -> QAResult:
124148
else:
125149
text = getattr(resp, "content", "") or ""
126150

127-
# P1: Regex-based citation parsing
128-
# Matches [1], [12], etc.
129-
found_indices: Set[int] = set()
130-
matches = re.findall(r"\[(\d+)\]", text)
131-
for m in matches:
132-
if m.isdigit():
133-
found_indices.add(int(m))
134-
151+
# P1: ID-based citation parsing
152+
# Matches [doc_1], [file_name_12], etc.
135153
final_citations: List[Citation] = []
136-
# chunks is 0-indexed, markers are 1-indexed
137-
for idx in sorted(found_indices):
138-
if 1 <= idx <= len(chunks):
139-
c = chunks[idx - 1]
154+
seen_markers: Set[str] = set()
155+
156+
# Regex to find potential markers in the text
157+
# We look for [content] and check if it exists in our map
158+
matches = re.findall(r"\[([^\]]+)\]", text)
159+
160+
for m_str in matches:
161+
marker = f"[{m_str}]"
162+
if marker in chunk_map and marker not in seen_markers:
163+
c = chunk_map[marker]
164+
seen_markers.add(marker)
140165
final_citations.append(
141166
Citation(
142-
marker=f"[{idx}]",
167+
marker=marker,
143168
source=c.metadata.get("source", "unknown"),
144169
doc_id=c.metadata.get("doc_id"),
145170
chunk_index=c.metadata.get("chunk_index"),

0 commit comments

Comments
 (0)