Skip to content

Commit e74e5b2

Browse files
committed
fix: [LAW] Fix pipeline vector search and graph search
1 parent 72dd18b commit e74e5b2

File tree

4 files changed

+154
-28
lines changed

4 files changed

+154
-28
lines changed

ai_service/app/pipelines/answer_composer.py

Lines changed: 88 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import google.generativeai as genai
33
from typing import Dict, Any, List
44
from ..dependencies import get_settings
5+
from ..utils.stream_utils import create_metadata_chunk
56

67
log = logging.getLogger(__name__)
78

@@ -12,19 +13,42 @@ def __init__(self):
1213
raise ValueError("GEMINI_API_KEY not found")
1314
genai.configure(api_key=settings.gemini_api_key)
1415
self.model_name = 'gemini-2.5-pro' # Or settings.GEMINI_COMPOSER_MODEL
16+
17+
def _build_citation_from_node_id(self, node_id: str) -> str:
18+
"""Parse node_id to build citation title"""
19+
import re
20+
parts = []
21+
22+
article_match = re.search(r'_Dieu_(\d+)', node_id)
23+
clause_match = re.search(r'_Khoan_(\d+)', node_id)
24+
point_match = re.search(r'_Diem_([a-z])', node_id)
25+
doc_match = re.match(r'^(luat|nghi-dinh|thong-tu)-(\d+)-(\d+)-([A-Z\-]+)', node_id)
26+
27+
if clause_match:
28+
parts.append(f"Khoản {clause_match.group(1)}")
29+
if point_match:
30+
parts.append(f"Điểm {point_match.group(1)}")
31+
if article_match:
32+
parts.append(f"Điều {article_match.group(1)}")
33+
if doc_match:
34+
doc_type, number, year, issuer = doc_match.groups()
35+
type_map = {"luat": "Luật", "nghi-dinh": "Nghị định", "thong-tu": "Thông tư"}
36+
parts.append(f"của {type_map.get(doc_type, 'Văn bản')} {number}/{year}/{issuer}")
37+
38+
return ", ".join(parts) if parts else f"Node {node_id}"
1539

1640
def compose(
1741
self,
1842
question: str,
19-
graph_result: Dict[str, Any],
43+
search_result: Dict[str, Any], # Chứa kết quả từ vector + graph search
2044
web_context: str = ""
2145
) -> str:
2246
"""
2347
Tổng hợp câu trả lời cuối cùng.
2448
"""
2549
log.info(f"Composing answer for question: {question}")
26-
final_choice = graph_result.get("final_choice")
27-
all_candidates = graph_result.get("all_candidates", [])
50+
final_choice = search_result.get("final_choice")
51+
all_candidates = search_result.get("all_candidates", [])
2852

2953
# Xây dựng context từ Graph
3054
graph_context_str = ""
@@ -79,6 +103,31 @@ def compose(
79103
else:
80104
graph_context_str = "Không tìm thấy thông tin trong cơ sở dữ liệu luật nội bộ."
81105

106+
# Build citations từ all_candidates cho FE
107+
citations = []
108+
if all_candidates:
109+
for idx, candidate in enumerate(all_candidates, 1):
110+
node_id = candidate.get("source_node_id", "")
111+
112+
# Format title từ metadata
113+
parts = []
114+
if candidate.get("clause_number"):
115+
parts.append(f"Khoản {candidate['clause_number']}")
116+
if candidate.get("point_letter"):
117+
parts.append(f"Điểm {candidate['point_letter']}")
118+
if candidate.get("article_number"):
119+
parts.append(f"Điều {candidate['article_number']}")
120+
if candidate.get("document_code"):
121+
parts.append(f"của {candidate['document_code']}")
122+
123+
title = ", ".join(parts) if parts else self._build_citation_from_node_id(node_id)
124+
125+
# Add to citations for FE
126+
citations.append({
127+
"node_id": node_id,
128+
"title": title
129+
})
130+
82131
# Xây dựng context từ Web (nếu có)
83132
web_context_str = ""
84133
if web_context:
@@ -143,22 +192,22 @@ def compose(
143192

144193
try:
145194
response = model.generate_content(prompt)
146-
return response.text.strip()
195+
return response.text.strip(), citations
147196
except Exception as e:
148197
log.error(f"Lỗi compose answer: {e}")
149-
return "Xin lỗi, hệ thống gặp sự cố khi tổng hợp câu trả lời."
198+
return "Xin lỗi, hệ thống gặp sự cố khi tổng hợp câu trả lời.", []
150199

151200
def compose_stream(
152201
self,
153202
question: str,
154-
graph_result: Dict[str, Any],
203+
search_result: Dict[str, Any], # Chứa kết quả từ vector + graph search
155204
web_context: str = ""
156205
):
157206
"""
158207
Tổng hợp câu trả lời cuối cùng (Streaming).
159208
"""
160-
final_choice = graph_result.get("final_choice")
161-
all_candidates = graph_result.get("all_candidates", [])
209+
final_choice = search_result.get("final_choice")
210+
all_candidates = search_result.get("all_candidates", [])
162211

163212
# Xây dựng context từ Graph (same logic as compose)
164213
graph_context_str = ""
@@ -207,6 +256,33 @@ def compose_stream(
207256
else:
208257
graph_context_str = "Không tìm thấy thông tin trong cơ sở dữ liệu luật nội bộ."
209258

259+
# Build citations từ all_candidates (duplicated from compose for consistency)
260+
citations = []
261+
log.info(f"DEBUG: all_candidates count = {len(all_candidates)}")
262+
if all_candidates:
263+
log.info(f"DEBUG: Building citations from {len(all_candidates)} candidates")
264+
for idx, candidate in enumerate(all_candidates, 1):
265+
node_id = candidate.get("source_node_id", "")
266+
267+
# Format title từ metadata
268+
parts = []
269+
if candidate.get("clause_number"):
270+
parts.append(f"Khoản {candidate['clause_number']}")
271+
if candidate.get("point_letter"):
272+
parts.append(f"Điểm {candidate['point_letter']}")
273+
if candidate.get("article_number"):
274+
parts.append(f"Điều {candidate['article_number']}")
275+
if candidate.get("document_code"):
276+
parts.append(f"của {candidate['document_code']}")
277+
278+
title = ", ".join(parts) if parts else self._build_citation_from_node_id(node_id)
279+
280+
# Add to citations for FE
281+
citations.append({
282+
"node_id": node_id,
283+
"title": title
284+
})
285+
210286
# Xây dựng context từ Web (nếu có)
211287
web_context_str = ""
212288
if web_context:
@@ -270,6 +346,10 @@ def compose_stream(
270346
)
271347

272348
try:
349+
# Yield metadata first với citations
350+
if citations:
351+
yield create_metadata_chunk(citations)
352+
273353
response = model.generate_content(prompt, stream=True)
274354
for chunk in response:
275355
if chunk.text:

ai_service/app/pipelines/graph_search.py

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -297,35 +297,50 @@ def run_direct_graph_query(
297297
GRAPH SCHEMA:
298298
{GraphSearchService._schema_snapshot}
299299
300-
CẤU TRÚC GRAPH:
301-
- Document (văn bản luật): có properties: title, code, law_number, issuer
302-
- Article (điều): có properties: number, heading, content
303-
- Clause (khoản): có properties: number, content
304-
- Point (điểm): có properties: letter, content
305-
- Chapter (chương): có properties: number, title
300+
CẤU TRÚC GRAPH (Properties thực tế):
301+
- Document (văn bản luật):
302+
• id: "luat-41-2024-QH15"
303+
• code: "41/2024/QH15"
304+
• doc_id: "luat-41-2024-QH15"
305+
• title: "Luật Bảo hiểm xã hội 41/2024/QH15"
306+
• type: "Luật"
307+
• effectiveDate, issuedDate
308+
309+
- Chapter (chương): number, title, content
310+
- Article (điều): number, heading, content
311+
- Clause (khoản): number, content
312+
- Point (điểm): letter, content
306313
307314
RELATIONSHIPS:
308315
- (Document)-[:HAS_CHAPTER]->(Chapter)
309316
- (Document)-[:HAS_ARTICLE]->(Article)
310317
- (Article)-[:HAS_CLAUSE]->(Clause)
311318
- (Clause)-[:HAS_POINT]->(Point)
312319
320+
**QUAN TRỌNG - Khi tìm kiếm Document theo tên:**
321+
- **TUYỆT ĐỐI KHÔNG** dùng cú pháp {{title: '...'}}
322+
- **BẮT BUỘC** phải dùng WHERE toLower(d.title) CONTAINS toLower('...') để tìm kiếm gần đúng
323+
- Ví dụ: MATCH (d:Document) WHERE toLower(d.title) CONTAINS toLower('Luật Bảo hiểm xã hội')
324+
313325
NHIỆM VỤ:
314326
Tạo Cypher query để trả lời câu hỏi. Các dạng query phổ biến:
315327
316328
1. **COUNT (Đếm số lượng)**:
317-
Ví dụ: "Luật X có bao nhiêu chương/điều"
329+
Ví dụ: "Luật BHXH 41/2024/QH15 có bao nhiêu chương"
318330
```cypher
319-
MATCH (d:Document {{code: 'X'}})-[:HAS_CHAPTER]->(ch:Chapter)
320-
RETURN d.title AS document_title, d.code AS document_code, count(ch) AS chapter_count
331+
MATCH (d:Document)
332+
WHERE toLower(d.title) CONTAINS toLower('Luật Bảo hiểm xã hội')
333+
AND toLower(d.title) CONTAINS '41/2024'
334+
MATCH (d)-[:HAS_CHAPTER]->(ch:Chapter)
335+
RETURN d.title, d.code, count(ch) AS chapter_count
321336
```
322337
323338
2. **LIST (Liệt kê)**:
324339
Ví dụ: "Danh sách văn bản luật"
325340
```cypher
326341
MATCH (d:Document)
327-
RETURN d.title AS title, d.code AS code, d.law_number AS law_number
328-
ORDER BY d.law_number
342+
RETURN d.title, d.code, d.type
343+
ORDER BY d.issuedDate DESC
329344
LIMIT 20
330345
```
331346
@@ -335,13 +350,14 @@ def run_direct_graph_query(
335350
MATCH (a:Article {{number: '60'}})
336351
OPTIONAL MATCH (a)<-[:HAS_ARTICLE]-(d:Document)
337352
RETURN a.number AS article_number, a.heading AS article_heading,
338-
a.content AS article_content, d.title AS document_title
353+
a.content AS article_content, d.title AS document_title, d.code AS document_code
339354
```
340355
341356
YÊU CẦU:
342357
- Chỉ trả về Cypher query, KHÔNG THÊM text giải thích
343358
- Query phải trả về kết quả có ý nghĩa (sử dụng AS alias rõ ràng)
344359
- Nếu có extracted_facts, sử dụng làm filter (WHERE)
360+
- **LUÔN dùng toLower() CONTAINS cho title search**, KHÔNG dùng exact match với title
345361
346362
HÃY TẠO CYPHER QUERY:
347363
"""
@@ -354,7 +370,7 @@ def run_direct_graph_query(
354370
response = model.generate_content(prompt)
355371
cypher_query = self._clean(response.text)
356372

357-
log.info(f"Generated direct Cypher query: {cypher_query[:200]}...")
373+
log.info(f"Generated direct Cypher query: {cypher_query}...")
358374

359375
# Run query
360376
results = self.run_cypher(cypher_query)

ai_service/app/services/streaming_service.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -89,21 +89,36 @@ async def generate_streaming_response(question: str, is_debug: bool = False) ->
8989
yield create_tool_thought(AgentTool.GRAPH_SEARCH)
9090
log.info("GRAPH_LOOKUP category - using direct Cypher query (skip vector search)")
9191
gs = GraphSearchService()
92-
graph_result = await asyncio.to_thread(
92+
search_result = await asyncio.to_thread(
9393
gs.run_direct_graph_query,
9494
question,
9595
processed['structured']
9696
)
9797

98+
# Check if graph search found results
99+
has_results = search_result.get('final_choice') is not None and search_result.get('results')
100+
web_context = ""
101+
102+
if not has_results:
103+
# Fallback to web search if graph returns no results
104+
log.info("GRAPH_LOOKUP found no results, falling back to web search")
105+
yield create_tool_thought(AgentTool.WEB_SEARCH)
106+
web_context, reference_count = await StreamingService._perform_web_search(question)
107+
yield create_tool_thought(AgentTool.WEB_SEARCH, f"Số lượng website cung cấp thông tin tham khảo: {reference_count}")
108+
98109
# Go directly to answer composition
99110
yield create_tool_thought(AgentTool.CONCLUSION)
100111
composer = AnswerComposerService()
101112

102113
if is_debug:
103-
full_response = await asyncio.to_thread(composer.compose, question, graph_result, "")
114+
full_response, citations = await asyncio.to_thread(composer.compose, question, search_result, web_context)
104115
yield create_content_chunk(full_response)
116+
117+
if citations:
118+
from ..utils.stream_utils import create_metadata_chunk
119+
yield create_metadata_chunk(citations)
105120
else:
106-
iterator = composer.compose_stream(question, graph_result, "")
121+
iterator = composer.compose_stream(question, search_result, web_context)
107122
for chunk_text in iterator:
108123
yield create_content_chunk(chunk_text)
109124
return
@@ -120,9 +135,9 @@ async def generate_streaming_response(question: str, is_debug: bool = False) ->
120135
# 3. Graph Search (skip rerank, Gemini filtering happens inside)
121136
yield create_tool_thought(AgentTool.GRAPH_SEARCH)
122137
gs = GraphSearchService()
123-
graph_result = await asyncio.to_thread(gs.run_graph_search_for_node_ids, node_ids, question)
138+
search_result = await asyncio.to_thread(gs.run_graph_search_for_node_ids, node_ids, question)
124139

125-
final_choice = graph_result.get('final_choice')
140+
final_choice = search_result.get('final_choice')
126141
web_context = ""
127142

128143
# 5. Web Search Fallback
@@ -138,10 +153,14 @@ async def generate_streaming_response(question: str, is_debug: bool = False) ->
138153
composer = AnswerComposerService()
139154

140155
if is_debug:
141-
content = await asyncio.to_thread(composer.compose, question, graph_result, web_context)
156+
content, citations = await asyncio.to_thread(composer.compose, question, search_result, web_context)
157+
# Emit metadata first
158+
if citations:
159+
from ..utils.stream_utils import create_metadata_chunk
160+
yield create_metadata_chunk(citations)
142161
yield create_content_chunk(content)
143162
else:
144-
iterator = composer.compose_stream(question, graph_result, web_context)
163+
iterator = composer.compose_stream(question, search_result, web_context)
145164
for chunk_text in iterator:
146165
yield create_content_chunk(chunk_text)
147166

ai_service/app/utils/stream_utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,15 @@ def create_content_chunk(text: str) -> str:
4545
},
4646
timestamp=time.time()
4747
)
48+
return f"data: {chunk.json()}\n\n"
49+
50+
def create_metadata_chunk(citations: list) -> str:
    """Serialize a citations list as an SSE "metadata" chunk.

    Wraps the citations in a StreamingChunk of type "metadata" and formats
    it as a Server-Sent Events data line for the frontend stream.
    """
    payload = StreamingChunk(
        type="metadata",
        data={"citations": citations},
        timestamp=time.time(),
    )
    return f"data: {payload.json()}\n\n"

0 commit comments

Comments
 (0)