feat: 쿼리·용어집 정보 추가 및 관련 메시지 수정

ehddnr301 · ehddnr301 · commit ce6e423005e7 · 2025-07-27T09:16:08.000Z
- 사용 가능한 테이블 및 컬럼 정보에 예시 쿼리와 용어집 정보를 포함하도록 메시지 수정
- 테이블 정보 검색 기능에서 쿼리 및 용어집 정보를 섹션별로 추출하여 저장하는 로직 추가
- 데이터베이스에서 가져온 테이블·컬럼 정보와 함께 쿼리 및 용어집 정보를 반환하도록 개선 (get_info_from_db)
diff --git a/llm_utils/chains.py b/llm_utils/chains.py
@@ -31,7 +31,7 @@ def create_query_refiner_chain(llm):
             SystemMessagePromptTemplate.from_template(prompt),
             MessagesPlaceholder(variable_name="user_input"),
             SystemMessagePromptTemplate.from_template(
-                "다음은 사용자의 실제 사용 가능한 테이블 및 컬럼 정보입니다:"
+                "다음은 사용자의 실제 사용 가능한 테이블 및 컬럼 정보 와 예시쿼리 및 용어집 정보입니다:"
             ),
             MessagesPlaceholder(variable_name="searched_tables"),
             SystemMessagePromptTemplate.from_template(
@@ -63,7 +63,7 @@ def create_query_maker_chain(llm):
             MessagesPlaceholder(variable_name="refined_input"),
             (
                 "system",
-                "다음은 사용자의 db 환경정보와 사용 가능한 테이블 및 컬럼 정보입니다:",
+                "다음은 사용자의 db 환경정보와 사용 가능한 테이블 및 컬럼 정보 와 예시쿼리 및 용어집 정보입니다:",
             ),
             MessagesPlaceholder(variable_name="user_database_env"),
             MessagesPlaceholder(variable_name="searched_tables"),
@@ -84,7 +84,7 @@ def create_query_refiner_with_profile_chain(llm):
             SystemMessagePromptTemplate.from_template(prompt),
             MessagesPlaceholder(variable_name="user_input"),
             SystemMessagePromptTemplate.from_template(
-                "다음은 사용자의 실제 사용 가능한 테이블 및 컬럼 정보입니다:"
+                "다음은 사용자의 실제 사용 가능한 테이블 및 컬럼 정보 와 예시쿼리 및 용어집 정보입니다:"
             ),
             MessagesPlaceholder(variable_name="searched_tables"),
             # 프로파일 정보 입력
diff --git a/llm_utils/retrieval.py b/llm_utils/retrieval.py
@@ -97,18 +97,79 @@ def search_tables(
         # 테이블명 및 설명 추출
         table_name, table_desc = lines[0].split(": ", 1)
 
-        # 컬럼 정보 추출
+        # 섹션별로 정보 추출
         columns = {}
-        if len(lines) > 2 and lines[1].strip() == "Columns:":
-            for line in lines[2:]:
-                if ": " in line:
-                    col_name, col_desc = line.split(": ", 1)
-                    columns[col_name.strip()] = col_desc.strip()
+        queries = []
+        terms = []
+
+        current_section = None
+        current_query = {}
+        current_term = {}
+
+        for i, line in enumerate(lines[1:], 1):
+            line = line.strip()
+
+            # 섹션 헤더 확인
+            if line == "Columns:":
+                current_section = "columns"
+                continue
+            elif line == "Queries:":
+                current_section = "queries"
+                continue
+            elif line == "Terms:":
+                current_section = "terms"
+                continue
+
+            # 각 섹션의 내용 파싱
+            if current_section == "columns" and ": " in line:
+                col_name, col_desc = line.split(": ", 1)
+                columns[col_name.strip()] = col_desc.strip()
+
+            elif current_section == "queries" and line and line != "No queries":
+                # 쿼리 구분자 확인
+                if line == "---":
+                    # 이전 쿼리 저장
+                    if current_query:
+                        queries.append(current_query)
+                        current_query = {}
+                elif line.startswith("Name: "):
+                    # 이전 쿼리가 있다면 저장
+                    if current_query:
+                        queries.append(current_query)
+                    current_query = {"name": line[6:]}  # "Name: " 제거
+                elif line.startswith("Description: "):
+                    if current_query:
+                        current_query["description"] = line[13:]  # "Description: " 제거
+                elif line.startswith("Query: "):
+                    if current_query:
+                        current_query["statement"] = line[7:]  # "Query: " 제거
+
+            elif current_section == "terms" and line and line != "No terms":
+                if line.startswith("Term: "):
+                    # 이전 용어가 있다면 저장
+                    if current_term:
+                        terms.append(current_term)
+                    # 새로운 용어 시작
+                    current_term = {"name": line[6:]}  # "Term: " 제거
+                elif line.startswith("Description: ") and current_term:
+                    current_term["description"] = line[13:]  # "Description: " 제거
+                elif line.startswith("Definition: ") and current_term:
+                    current_term["definition"] = line[12:]  # "Definition: " 제거
+
+        # 마지막 쿼리 저장
+        if current_query and current_section == "queries":
+            queries.append(current_query)
+
+        # 마지막 용어 저장
+        if current_term and current_section == "terms":
+            terms.append(current_term)
 
         # 딕셔너리 저장
         documents_dict[table_name] = {
             "table_description": table_desc.strip(),
             **columns,  # 컬럼 정보 추가
+            "queries": queries,  # 쿼리 정보 추가 (딕셔너리 형태로)
+            "glossary_terms": terms,  # 용어집 정보 추가
         }
 
     return documents_dict
diff --git a/llm_utils/tools.py b/llm_utils/tools.py
@@ -140,6 +140,12 @@ def get_info_from_db(max_workers: int = 8) -> List[Document]:
 
     def process_table_info(item: tuple[str, str]) -> str:
         table_name, table_description = item
+        urn = urn_table_mapping.get(table_name, "")
+
+        # fetcher 인스턴스 생성
+        local_fetcher = _get_fetcher()
+
+        # 컬럼 정보 가져오기
         column_info = _get_column_info(
             table_name, urn_table_mapping, max_workers=max_workers
         )
@@ -149,7 +155,80 @@ def process_table_info(item: tuple[str, str]) -> str:
                 for col in column_info
             ]
         )
-        return f"{table_name}: {table_description}\nColumns:\n {column_info_str}"
+
+        # 쿼리 및 용어집 정보 가져오기
+        queries_result = local_fetcher.get_queries_by_urn(urn) if urn else {}
+        glossary_terms_result = (
+            local_fetcher.get_glossary_terms_by_urn(urn) if urn else {}
+        )
+
+        # GraphQL 응답에서 실제 쿼리 리스트 추출
+        queries = []
+        if (
+            queries_result
+            and "data" in queries_result
+            and "listQueries" in queries_result["data"]
+            and "queries" in queries_result["data"]["listQueries"]
+        ):
+            queries = queries_result["data"]["listQueries"]["queries"]
+
+        # GraphQL 응답에서 실제 glossary terms 추출
+        glossary_terms = []
+        if (
+            glossary_terms_result
+            and "data" in glossary_terms_result
+            and "dataset" in glossary_terms_result["data"]
+            and "glossaryTerms" in glossary_terms_result["data"]["dataset"]
+            and glossary_terms_result["data"]["dataset"]["glossaryTerms"] is not None
+            and "terms" in glossary_terms_result["data"]["dataset"]["glossaryTerms"]
+        ):
+            terms_data = glossary_terms_result["data"]["dataset"]["glossaryTerms"][
+                "terms"
+            ]
+            for term_item in terms_data:
+                if "term" in term_item and "properties" in term_item["term"]:
+                    props = term_item["term"]["properties"]
+                    name = props.get("name", "")
+                    description = props.get("description", "")
+                    definition = props.get("definition", "")
+                    glossary_terms.append(
+                        {
+                            "name": name,
+                            "description": description,
+                            "definition": definition,
+                        }
+                    )
+
+        # 쿼리 정보를 name, description, statement.value만 추출하여 포맷
+        if queries:
+            formatted_queries = []
+            for q in queries[:3]:  # 최대 3개 쿼리만
+                if isinstance(q, dict) and "properties" in q:
+                    props = q["properties"]
+                    name = props.get("name", "No name")
+                    description = props.get("description", "No description")
+                    statement_value = props.get("statement", {}).get(
+                        "value", "No query statement"
+                    )
+                    formatted_query = f"Name: {name}\nDescription: {description}\nQuery: {statement_value}"
+                    formatted_queries.append(formatted_query)
+            queries_str = (
+                "\n---\n".join(formatted_queries) if formatted_queries else "No queries"
+            )
+        else:
+            queries_str = "No queries"
+        terms_str = (
+            "\n".join(
+                [
+                    f"Term: {term['name']}\nDescription: {term['description']}\nDefinition: {term['definition']}"
+                    for term in glossary_terms
+                ]
+            )
+            if glossary_terms
+            else "No terms"
+        )
+
+        return f"{table_name}: {table_description}\nColumns:\n {column_info_str}\nQueries:\n {queries_str}\nTerms:\n {terms_str}"
 
     table_info_str_list = parallel_process(
         table_info.items(),