DUT-Team-21TCLC-DT3
diff --git a/‎ai_service/.env.example‎
Lines changed: 4 additions & 12 deletions b/‎ai_service/.env.example‎
Lines changed: 4 additions & 12 deletions
diff --git a/‎ai_service/Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎ai_service/Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ai_service/__init__.py‎ b/‎ai_service/__init__.py‎
diff --git a/‎ai_service/app/adapters/chain_builders.py‎
Lines changed: 36 additions & 1 deletion b/‎ai_service/app/adapters/chain_builders.py‎
Lines changed: 36 additions & 1 deletion
diff --git a/‎ai_service/app/config.py‎
Lines changed: 1 addition & 0 deletions b/‎ai_service/app/config.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ai_service/app/main.py‎
Lines changed: 1 addition & 1 deletion b/‎ai_service/app/main.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ai_service/app/main_new.py‎
Lines changed: 0 additions & 57 deletions b/‎ai_service/app/main_new.py‎
Lines changed: 0 additions & 57 deletions
diff --git a/‎ai_service/app/new_pipelines/query_preprocessor.py‎
Lines changed: 234 additions & 0 deletions b/‎ai_service/app/new_pipelines/query_preprocessor.py‎
Lines changed: 234 additions & 0 deletions
@@ -1,19 +1,11 @@
-# HTTP & gRPC
-START_GRPC=false
-HTTP_HOST=0.0.0.0
-HTTP_PORT=8000
-GRPC_BIND=0.0.0.0:50051
-
 # Neo4j
 NEO4J_URI=bolt://localhost:7687
 NEO4J_USER=neo4j
 NEO4J_PASSWORD=your_password
 
-# OpenAI
-OPENAI_API_KEY=sk-...
-
 # Models
-CYPHER_MODEL=gpt-4o-mini
-QA_MODEL=gpt-4o-mini
+CYPHER_MODEL=gemini-2.5-pro
+QA_MODEL=gemini-2.5-pro
 CYPHER_TEMPERATURE=0.0
-QA_TEMPERATURE=0.0
+QA_TEMPERATURE=0.0
+GEMINI_API_KEY=your_gemini_api_key
@@ -5,5 +5,5 @@ RUN pip install --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 ENV PYTHONUNBUFFERED=1
-EXPOSE 8000 50051
+EXPOSE 8000
 CMD ["python", "-m", "app.main"]
@@ -1,5 +1,5 @@
 import logging
-from typing import Optional
+from typing import Optional, List
 from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain
 from langchain_core.prompts import PromptTemplate
 from .langchain_graph import GraphProvider
@@ -77,6 +77,41 @@ def custom_chain(self, prompt: PromptTemplate):
             # Fallback: build default chain, will still work
             log.warning("Custom cypher_prompt not supported by this langchain version; falling back to default prompt")
             return self.default_chain()
+    
+    def get_all_prompts(self) -> List[PromptTemplate]:
+        """
+        Build the Cypher-generation prompts in the order they should be tried.
+
+        Intended for the retry strategy.
+
+        Returns:
+            Three prompts, most detailed first: the custom prompt with its
+            schema-layer variables pre-filled, then the simple-search prompt,
+            then the keyword-pattern prompt.
+        """
+        from ..pipelines.prompts import (
+            CUSTOM_CYPHER_PROMPT,
+            SIMPLE_SEARCH_PROMPT,
+            KEYWORD_PATTERN_PROMPT,
+            LAYER1_NODES,
+            LAYER1_RELATIONSHIPS,
+            LAYER2_NODES,
+            LAYER2_RELATIONSHIPS,
+            LAYER3_NODES,
+            LAYER3_RELATIONSHIPS,
+            LAYER4_NODES,
+            LAYER4_RELATIONSHIPS,
+        )
+
+        # Schema layers that get pre-bound into the detailed prompt.
+        schema_layers = {
+            "layer1_nodes": LAYER1_NODES,
+            "layer1_rels": LAYER1_RELATIONSHIPS,
+            "layer2_nodes": LAYER2_NODES,
+            "layer2_rels": LAYER2_RELATIONSHIPS,
+            "layer3_nodes": LAYER3_NODES,
+            "layer3_rels": LAYER3_RELATIONSHIPS,
+            "layer4_nodes": LAYER4_NODES,
+            "layer4_rels": LAYER4_RELATIONSHIPS,
+        }
+        base_template = PromptTemplate.from_template(CUSTOM_CYPHER_PROMPT.template)
+        most_detailed = base_template.partial(**schema_layers)
+
+        # Strategy 1: most detailed, 2: simplified, 3: pattern-based.
+        return [most_detailed, SIMPLE_SEARCH_PROMPT, KEYWORD_PATTERN_PROMPT]
 
 
 class FallbackChain:
 
@@ -35,4 +35,5 @@ class Settings(BaseSettings):
         "protected_namespaces": (),
         "env_file": [".env", "../.env"],  # Check both current dir and parent dir
         "case_sensitive": True,
+        "extra": "ignore",  # Ignore extra fields from .env (e.g., old gRPC settings)
     }
@@ -54,4 +54,4 @@ async def lifespan(app: FastAPI):
         port=port,
         log_level="info",
         access_log=True
-    )
+    )
@@ -0,0 +1,234 @@
+import logging
+import json
+import os
+import sys
+import google.generativeai as genai
+from typing import Dict, Any, List
+from dotenv import load_dotenv
+from app.dependencies import get_settings
+
+# Configure logging.
+# NOTE(review): this runs at import time, so merely importing this module calls
+# logging.basicConfig — confirm that is intended for the service as a whole.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+log = logging.getLogger(__name__)
+
+# --- SYSTEM INSTRUCTION CONFIGURATION ---
+# This is the most important part: the rules are kept separate from the input data
+SYSTEM_INSTRUCTION = """
+Bạn là chuyên gia AI về phân tích câu hỏi pháp luật Việt Nam (Luật Lao động, bảo hiểm xã hội).
+
+NHIỆM VỤ:
+Phân tích câu hỏi người dùng (có thể người dùng sử dụng một số từ địa phương và viết tắt, ví dụ: "BHXH" thay vì "Bảo hiểm xã hội",... do đó cần chuyển đổi các từ viết tắt thành đầy đủ) 
+và trả về JSON có cấu trúc để phục vụ tìm kiếm dữ liệu (RAG).
+
+BƯỚC 1: KIỂM TRA ĐỘ LIÊN QUAN (RELEVANCE CHECK)
+- Phạm vi nội bộ: Luật Lao động, BHXH, BHYT, BHTN, Tiền lương.
+- Phạm vi mở rộng: Giao thông, Dân sự, Hình sự... -> Category: "WEBSITE_SEARCH", vẫn coi là RELEVANT
+- Không liên quan: Thời tiết, bóng đá... -> is_relevant: false.
+
+BƯỚC 2: XỬ LÝ ĐẦU RA (OUTPUT HANDLING)
+
+TRƯỜNG HỢP 1: IRRELEVANT (KHÔNG LIÊN QUAN)
+- Trả về JSON: {"is_relevant": false, "intent": "Mô tả lý do không liên quan", "category": "IRRELEVANT", ...các trường khác để trống...}
+
+TRƯỜNG HỢP 2: RELEVANT (LIÊN QUAN)
+- Trả về JSON: {"is_relevant": true, ...}
+- Thực hiện phân tích chi tiết theo 3 QUY TẮC QUY TẮC BẤT DI BẤT DỊCH (CRITICAL RULES) sau:
+
+QUY TẮC 1. FACTS (Sự kiện & Metadata): 
+- CHỈ trích xuất các thông tin (con số, thời gian, mức lương) ĐƯỢC NÊU RÕ RÀNG trong câu hỏi, KHÔNG SUY DIỄN.
+- Đặc biệt với category "GRAPH_LOOKUP", bắt buộc trích xuất các key sau nếu có:
+    + "legal_article": Số hiệu điều luật (VD: "Điều 60", "Điều 3").
+    + "clause": Khoản (VD: "Khoản 1").
+    + "law_name": Tên luật/bộ luật/văn bản (VD: "Bộ luật Lao động", "Luật Bảo hiểm xã hội", "Nghị định 100/2020/NĐ-CP").
+    + "chapter": Chương.
+- Nếu câu hỏi chung chung (ví dụ: "Điều kiện hưởng là gì?"), trường "extracted_facts" PHẢI LÀ object rỗng {}.
+
+QUY TẮC 2. SEARCH QUERIES: 
+- Tạo 4-6 câu truy vấn tìm kiếm tối ưu.
+- PHẢI giải nghĩa từ viết tắt thành tiếng Việt đầy đủ.
+- Dùng từ ngữ pháp lý chính xác (ví dụ: dùng "trợ cấp thất nghiệp" thay vì "tiền thất nghiệp").
+- Các query phải bao quát được ý định của người dùng.
+
+QUY TẮC 3. PHÂN LOẠI CATEGORY (QUAN TRỌNG)
+1. "GRAPH_LOOKUP" (Ưu tiên dùng Graph DB/Cypher):
+- Khi câu hỏi yêu cầu chính xác một đối tượng cụ thể: "Điều 60 quy định gì?", "Khoản 2 Điều 3 nói gì?".
+- Khi câu hỏi về cấu trúc văn bản: "Chương 5 có bao nhiêu điều?", "Luật này ban hành năm nào?".
+- Khi câu hỏi đếm số lượng hoặc liệt kê: "Có bao nhiêu trường hợp...", "Liệt kê các loại...".
+=> Dấu hiệu nhận biết: Có chứa số hiệu điều luật, tên chương, hoặc từ khóa "bao nhiêu", "liệt kê", "danh sách".
+
+2. "CONSULTATION" (Dùng Vector Search + LLM):
+- Khi câu hỏi mô tả tình huống, cần tư vấn: "Tôi nghỉ việc thì được gì?", "Cách tính lương hưu?".
+- Câu hỏi "như thế nào", "ra sao", "điều kiện gì" mà KHÔNG chỉ đích danh điều luật cụ thể.
+- Các câu hỏi cần tổng hợp thông tin từ nhiều nơi.
+
+3. "WEBSITE_SEARCH": Các luật ngoài phạm vi nội bộ (Giao thông, Đất đai...).
+4. "SOCIAL_CHAT": Chào hỏi xã giao.
+
+CẤU TRÚC JSON TRẢ VỀ:
+{
+  "is_relevant": true/false,
+  "intent": "Mô tả ý định thực sự của user (VD: Hỏi về điều kiện hưởng thai sản)",
+  "search_queries": ["query 1", "query 2"],
+  "extracted_facts": {
+    "key": "value",
+    "legal_article": "Điều X (nếu có)",
+    "law_name": "Tên luật (nếu có)"
+  },
+  "category": "GRAPH_LOOKUP" | "CONSULTATION" | "WEBSITE_SEARCH" | "SOCIAL_CHAT"
+}
+
+--- VÍ DỤ MẪU (CHỈ THAM KHẢO CẤU TRÚC, KHÔNG COPY DỮ LIỆU) ---
+Ví dụ 1 (Graph Lookup - Hỏi cấu trúc):
+Input: "Luật BHXH có bao nhiêu chương?"
+Output:
+{
+  "is_relevant": true,
+  "intent": "Hỏi về số lượng chương trong Luật Bảo hiểm xã hội",
+  "search_queries": ["cấu trúc luật bảo hiểm xã hội", "số lượng chương luật bảo hiểm xã hội"],
+  "extracted_facts": {
+    "law_name": "Luật Bảo hiểm xã hội"
+  },
+  "category": "GRAPH_LOOKUP"
+}
+
+Ví dụ 2 (Câu hỏi có dữ liệu cụ thể, câu hỏi dạng tình huống):
+Input: "Chị A lương 10 triệu, đóng bảo hiểm 2 năm thì được nhận bao nhiêu?"
+Output:
+{
+  "is_relevant": true,
+  "intent": "Hỏi về mức hưởng bảo hiểm dựa trên mức lương và thời gian đóng cụ thể",
+  "search_queries": ["cách tính mức hưởng bảo hiểm xã hội một lần", "công thức tính trợ cấp bảo hiểm xã hội", "mức hưởng bảo hiểm xã hội theo mức lương đóng"],
+  "extracted_facts": {
+    "name": "Chị A",
+    "salary": "10 triệu",
+    "insurance_duration": "2 năm"
+  },
+  "category": "CONSULTATION"
+}
+
+Ví dụ 3 (Graph Lookup - Tra cứu đích danh):
+Input: "Điều 60 Bộ luật Lao động quy định gì?"
+Output:
+{
+  "is_relevant": true,
+  "intent": "Tra cứu nội dung quy định tại Điều 60 Bộ luật Lao động",
+  "search_queries": ["nội dung Điều 60 Bộ luật Lao động", "quy định tại Điều 60 Bộ luật Lao động năm 2019"],
+  "extracted_facts": {
+    "legal_article": "Điều 60",
+    "law_name": "Bộ luật Lao động"
+  },
+  "category": "GRAPH_LOOKUP"
+}
+
+Ví dụ 4 (Web Search - Ngoài phạm vi):
+Input: "Vượt đèn đỏ phạt bao nhiêu?"
+Output:
+{
+  "is_relevant": true,
+  "intent": "Hỏi về mức phạt vi phạm giao thông (vượt đèn đỏ)",
+  "search_queries": ["mức phạt lỗi vượt đèn đỏ xe máy 2024", "mức phạt vượt đèn đỏ ô tô"],
+  "extracted_facts": {},
+  "category": "WEBSITE_SEARCH"
+}
+"""
+
+class QueryPreprocessor:
+    """Analyse a user question with Gemini before retrieval.
+
+    The model is created with ``SYSTEM_INSTRUCTION`` as its system prompt and
+    is asked to answer in JSON; ``rewrite`` returns that JSON together with
+    the original question and a status flag.
+    """
+
+    def __init__(self):
+        """Configure the Gemini client and build the generative model."""
+        settings = get_settings()
+        # Do not print or log the settings object: it holds credentials
+        # (at least ``gemini_api_key``).
+        genai.configure(api_key=settings.gemini_api_key)
+
+        # Model configuration.
+        # Temperature 0.0: meant to suppress creative/hallucinated output.
+        generation_config = {
+            "response_mime_type": "application/json",
+            "temperature": 0.0,
+        }
+
+        # The rules are passed as a separate system instruction, so each
+        # request only has to carry the user's question.
+        gemini_model = 'gemini-2.5-flash'
+        self.model = genai.GenerativeModel(
+            gemini_model,
+            generation_config=generation_config,
+            system_instruction=SYSTEM_INSTRUCTION
+        )
+
+        log.info("QueryPreprocessor initialized successfully (%s)", gemini_model)
+
+    def _clean_json_string(self, text: str) -> str:
+        """Strip a Markdown code fence from *text* if the model added one.
+
+        Args:
+            text: Raw response text from the model.
+
+        Returns:
+            ``text`` with surrounding whitespace removed and, when it starts
+            with a code fence, without the opening fence line and without a
+            closing fence line.
+        """
+        text = text.strip()
+        if not text.startswith("```"):
+            return text
+        # Drop the opening fence line (e.g. ```json) ...
+        lines = text.split('\n')[1:]
+        # ... and the closing fence line, when present.
+        if lines and lines[-1].strip() == "```":
+            lines = lines[:-1]
+        return '\n'.join(lines).strip()
+
+    @staticmethod
+    def _fallback_structured(question: str, intent: str) -> Dict[str, Any]:
+        """Build the default analysis used when the model output is unusable."""
+        return {
+            "is_relevant": True,
+            "intent": intent,
+            "search_queries": [question],
+            "extracted_facts": {},
+            "category": "CONSULTATION"
+        }
+
+    def rewrite(self, question: str) -> Dict[str, Any]:
+        """Analyse *question* and return the structured result.
+
+        Args:
+            question: The user's question, sent to the model as-is.
+
+        Returns:
+            A dict with keys ``"original"`` (the question), ``"structured"``
+            (the parsed JSON object, or a fallback analysis that searches
+            with the raw question) and ``"status"`` (``"success"`` or
+            ``"error"``). Unparseable or non-object model output yields the
+            fallback with status ``"success"``; any other failure yields the
+            fallback with status ``"error"``. This method does not raise.
+        """
+        log.info("Processing: '%s'", question)
+
+        try:
+            # The request carries only the question; the rules are in the
+            # system instruction.
+            user_prompt = f"Phân tích câu hỏi sau: \"{question}\""
+
+            response = self.model.generate_content(user_prompt)
+            json_text = self._clean_json_string(response.text)
+
+            try:
+                structured_data = json.loads(json_text)
+            except json.JSONDecodeError:
+                log.error("JSON Decode Error. Raw text: %s", json_text)
+                # Basic fallback when the reply is not valid JSON.
+                structured_data = self._fallback_structured(
+                    question, "Lỗi phân tích cú pháp"
+                )
+            else:
+                # Valid JSON that is not an object (list, string, ...) cannot
+                # be used as the analysis dict; treat it like a parse failure.
+                if not isinstance(structured_data, dict):
+                    log.error(
+                        "Unexpected JSON type %s. Raw text: %s",
+                        type(structured_data).__name__,
+                        json_text,
+                    )
+                    structured_data = self._fallback_structured(
+                        question, "Lỗi phân tích cú pháp"
+                    )
+
+            return {
+                "original": question,
+                "structured": structured_data,
+                "status": "success"
+            }
+
+        except Exception as e:
+            # Best-effort boundary: the caller still gets a usable analysis;
+            # log.exception keeps the traceback for debugging.
+            log.exception("Error in rewrite: %s", e)
+            return {
+                "original": question,
+                "structured": self._fallback_structured(question, "System Error"),
+                "status": "error"
+            }
+
+# --- MANUAL TRIAL RUN ---
+if __name__ == "__main__":
+    # Sample questions: weather (off-topic), road traffic law (another legal
+    # domain), a direct article lookup, and a "how many cases" question.
+    sample_questions = (
+        "Thời tiết hôm nay như thế nào?",
+        "tôi muốn hỏi về luật giao thông đường bộ?",
+        "Điều 60 Bộ luật Lao động quy định gì?",
+        "Có bao nhiêu trường hợp được hưởng trợ cấp thất nghiệp?",
+    )
+
+    demo = QueryPreprocessor()
+    for sample in sample_questions:
+        analysis = demo.rewrite(sample)
+        print(f'result: {analysis["structured"]}')
Original file line number	Diff line number	Diff line change
`@@ -35,4 +35,5 @@ class Settings(BaseSettings):`
`35`	`35`	`"protected_namespaces": (),`
`36`	`36`	`"env_file": [".env", "../.env"], # Check both current dir and parent dir`
`37`	`37`	`"case_sensitive": True,`
	`38`	`+ "extra": "ignore", # Ignore extra fields from .env (e.g., old gRPC settings)`
`38`	`39`	`}`