@@ -30,7 +30,7 @@ class QAResult:
3030
3131DEFAULT_QA_SYSTEM = (
3232 "You are a helpful assistant that answers questions using the provided context. "
33- "Always cite sources using [n ] markers (e.g. [1 ], [2]) that refer to the numbered context snippets provided ."
33+ "Always cite sources using the exact [id ] markers provided in the context (e.g. [docname_0 ], [url_1]) ."
3434)
3535
3636QA_PROMPT_TEMPLATE = (
@@ -39,7 +39,7 @@ class QAResult:
3939 "Question: {question}\n \n "
4040 "Instructions:\n "
4141 "- If the answer is not in the context, say you don't know.\n "
42- "- Use [n ] markers in the answer to cite the snippet numbers .\n "
42+ "- Use the provided [id ] markers in the answer to cite the snippets exactly .\n "
4343 "- Keep the answer concise and relevant.\n "
4444)
4545
@@ -60,14 +60,35 @@ def __init__(
6060 # Simple char limit safeguard (approx 30k tokens for modern models, but keep it safe)
6161 self .max_context_chars = 60000
6262
def _get_chunk_marker(self, chunk: "RetrievedChunk") -> str:
    """Return a stable citation marker of the form ``[<doc>_<chunk_index>]``.

    The ``<doc>`` part is derived from ``chunk.metadata["doc_id"]``:

    1. Only the last path/URL segment is kept. Both ``/`` and ``\\`` are
       treated as separators regardless of the host OS, and trailing
       separators are ignored.
    2. The file extension is dropped for brevity.
    3. Every character outside ``[a-zA-Z0-9_-]`` is replaced by ``_`` so
       the marker never contains brackets or whitespace.

    If ``doc_id`` is missing, ``None``, or reduces to an empty string,
    ``"unknown"`` is used. A missing or ``None`` ``chunk_index`` falls
    back to ``"0"``.

    NOTE: two different ``doc_id`` values that share the same final
    segment stem (e.g. ``a/report.pdf`` and ``b/report.txt``) produce the
    same marker for equal chunk indices.

    Args:
        chunk: Object exposing a ``metadata`` mapping.

    Returns:
        The marker string, including the surrounding square brackets.
    """
    metadata = chunk.metadata

    doc_id = metadata.get("doc_id")
    raw_id = "unknown" if doc_id is None else str(doc_id)

    # Normalise Windows separators first: os.path.basename() would ignore
    # "\\" on POSIX hosts. Stripping trailing separators keeps IDs such as
    # "https://host/guide/" from collapsing to an empty name.
    name = raw_id.replace("\\", "/").rstrip("/").rsplit("/", 1)[-1]

    # Drop the extension, then replace anything unsafe inside a marker.
    stem = os.path.splitext(name)[0]
    clean_id = re.sub(r"[^a-zA-Z0-9_\-]", "_", stem) or "unknown"

    idx = metadata.get("chunk_index")
    if idx is None:
        idx = "0"
    return f"[{clean_id}_{idx}]"
82+
6383 def _truncate_context (self , chunks : List [RetrievedChunk ]) -> str :
64- """Join chunks into a context string, respecting length limits ."""
84+ """Join chunks into a context string using stable IDs ."""
6585 lines = []
6686 current_len = 0
6787
68- for i , c in enumerate (chunks , start = 1 ):
69- # Format: [n] content...
70- snippet = f"[{ i } ] { c .text } "
88+ for c in chunks :
89+ marker = self ._get_chunk_marker (c )
90+ # Format: [doc_1] content...
91+ snippet = f"{ marker } { c .text } "
7192 snippet_len = len (snippet ) + 2 # + 2 for newlines
7293
7394 if current_len + snippet_len > self .max_context_chars :
@@ -87,21 +108,24 @@ async def answer(self, question: str, chunks: List[RetrievedChunk]) -> QAResult:
87108 citations = []
88109 )
89110
111+ # Build map for citation lookup
112+ chunk_map = {self ._get_chunk_marker (c ): c for c in chunks }
113+
90114 # Optional offline fallback
91115 if os .getenv ("RAG_FAKE_QA" ) == "1" or not (self .llm and hasattr (self .llm , "ask" )):
92116 # P2: Consistent language (English default) for offline fallback to match system prompt
93117 answer = "Offline Mode / No LLM:\n " + "\n " .join ([
94- f"Source [ { i } ] : { c .text [:200 ]} ..." for i , c in enumerate ( chunks , start = 1 )
118+ f"Source { self . _get_chunk_marker ( c ) } : { c .text [:200 ]} ..." for c in chunks
95119 ])
96120 cites = [
97121 Citation (
98- marker = f"[ { i } ]" ,
122+ marker = self . _get_chunk_marker ( c ) ,
99123 source = c .metadata .get ("source" , "unknown" ),
100124 doc_id = c .metadata .get ("doc_id" ),
101125 chunk_index = c .metadata .get ("chunk_index" ),
102126 text_snippet = c .text [:50 ]
103127 )
104- for i , c in enumerate ( chunks , start = 1 )
128+ for c in chunks
105129 ]
106130 return QAResult (answer = answer , citations = cites )
107131
@@ -124,22 +148,23 @@ async def answer(self, question: str, chunks: List[RetrievedChunk]) -> QAResult:
124148 else :
125149 text = getattr (resp , "content" , "" ) or ""
126150
127- # P1: Regex-based citation parsing
128- # Matches [1], [12], etc.
129- found_indices : Set [int ] = set ()
130- matches = re .findall (r"\[(\d+)\]" , text )
131- for m in matches :
132- if m .isdigit ():
133- found_indices .add (int (m ))
134-
151+ # P1: ID-based citation parsing
152+ # Matches [doc_1], [file_name_12], etc.
135153 final_citations : List [Citation ] = []
136- # chunks is 0-indexed, markers are 1-indexed
137- for idx in sorted (found_indices ):
138- if 1 <= idx <= len (chunks ):
139- c = chunks [idx - 1 ]
154+ seen_markers : Set [str ] = set ()
155+
156+ # Regex to find potential markers in the text
157+ # We look for [content] and check if it exists in our map
158+ matches = re .findall (r"\[([^\]]+)\]" , text )
159+
160+ for m_str in matches :
161+ marker = f"[{ m_str } ]"
162+ if marker in chunk_map and marker not in seen_markers :
163+ c = chunk_map [marker ]
164+ seen_markers .add (marker )
140165 final_citations .append (
141166 Citation (
142- marker = f"[ { idx } ]" ,
167+ marker = marker ,
143168 source = c .metadata .get ("source" , "unknown" ),
144169 doc_id = c .metadata .get ("doc_id" ),
145170 chunk_index = c .metadata .get ("chunk_index" ),
0 commit comments