Optimizing code and vector queries

alexliyu7352 · alexliyu7352 · commit 5f8bd3132a24 · 2024-09-21T06:17:43.000+08:00
diff --git a/apps/review.py b/apps/review.py
@@ -231,6 +231,10 @@ async def review_specific_pr(pr_url: str):
     repo_detail = github.parse_pullrequest_url(pr_url)
     pr_data = await github.get_pullrequest(repo_detail.get_repo_fullname(), repo_detail.number)
     head_sha = pr_data['head']['sha']
-    commit_message = f"{pr_data['title']}\n\n{pr_data.get('body', '')}"
+    pr_body = pr_data.get('body', '')
+    pr_title = pr_data['title']
+    if not pr_body:
+        pr_body = ""
+    commit_message = f"{pr_title}\n\n{pr_body}"
     await review_pull_request(repo_detail.get_repo_fullname(), repo_detail.number, head_sha, commit_message)
 
diff --git a/apps/webhook/handles.py b/apps/webhook/handles.py
@@ -245,7 +245,8 @@ async def pull_request_handler(action: str, payload, event, delivery, headers):
     if not settings.REVIEW_MODEL.api_key:
         logger.info(f"Thread: {delivery}: No review model, skip")
         return
-
+    if not body:
+        body = ""
     await review.review_pull_request(repo_name, pr_number, head_sha, f"{title}\n\n{body}")
 
 
diff --git a/core/analyze/analyzer.py b/core/analyze/analyzer.py
@@ -52,6 +52,9 @@ def extract_names_from_patch(self, patch_content: str) -> Tuple[Set[str], Set[st
         pass
 
     @abstractmethod
+    def extract_functions_from_patch(self, patch_content: str) -> Set[str]:
+        pass
+    @abstractmethod
     def extract_definitions(self, content: str, names: Set[str]) -> Dict[str, str]:
         pass
 
@@ -196,6 +199,26 @@ def extract_names_from_patch(self, patch_content: str) -> Tuple[Set[str], Set[st
 
         return functions, variables
 
+    def extract_functions_from_patch(self, patch_content: str) -> Set[str]:
+        """Collect candidate type and function names from a patch using regexes.
+
+        :param patch_content: patch text, scanned line by line
+        :return: identifiers starting with an uppercase letter (assumed to be types)
+            together with identifiers directly followed by ``(`` (assumed to be functions)
+        """
+        extracted_info = set()
+
+        # Heuristics only: a capitalized word is a type, a word followed by "(" is a function.
+        type_pattern = r'\b([A-Z][a-zA-Z0-9_]*)\b'
+        function_pattern = r'\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\('
+
+        for line in patch_content.split('\n'):
+            # Types (assumed to start with an uppercase letter).
+            extracted_info.update(re.findall(type_pattern, line))
+            # Functions (assumed to be followed by an opening parenthesis).
+            extracted_info.update(re.findall(function_pattern, line))
+        return extracted_info
+
     def extract_definitions(self, content: str, names: Set[str]) -> Dict[str, str]:
         tree = ast.parse(content)
         definitions = {}
@@ -293,7 +316,7 @@ def _is_from_project(self, node, current_file: str) -> bool:
             return False
         file_path = os.path.abspath(node.location.file.name)
         return file_path.startswith(self.project_root) and (
-                    file_path == current_file or not file_path.endswith(('.h', '.hpp')))
+                file_path == current_file or not file_path.endswith(('.h', '.hpp')))
 
     def _get_element_content(self, node) -> str:
         try:
@@ -315,23 +338,55 @@ def _is_likely_external(self, content: str) -> bool:
 
     def analyze_dependencies(self, file_path: str, content: str) -> List[str]:
         """
-                分析文件的依赖关系，并过滤掉非项目内的依赖
+        Analyze the file's dependencies and filter out those outside the project.
 
-                :param file_path: 当前分析的文件路径
-                :param content: 文件内容
-                :param base_path: 项目的基础路径
-                :return: 项目内的依赖列表
-                """
+        :param file_path: path of the file being analyzed
+        :param content: content of the file
+        :return: de-duplicated list of in-project dependencies plus the implementation
+            files found for them (built via ``set``, so the order is not guaranteed)
+        """
         # 查找所有的 #include 语句
         includes = re.findall(r'#include\s*[<"]([^>"]+)[>"]', content)
 
         # 转换和过滤依赖
         project_dependencies = self.find_dependencies(file_path, includes)
-        # 去重并返回
-        return list(set(project_dependencies))
+        # For each header dependency, try to find the matching implementation file.
+        implementation_dependencies = []
+        for dep in project_dependencies:
+            impl_file = self.find_implementation_file(dep)
+            if impl_file:
+                implementation_dependencies.append(impl_file)
+
+        # Merge header and implementation dependencies, de-duplicate, and return.
+        all_dependencies = list(set(project_dependencies + implementation_dependencies))
+        return all_dependencies
+
+    def find_implementation_file(self, header_path: str) -> Optional[str]:
+        """
+        Find the implementation file that corresponds to a header file.
+
+        :param header_path: relative path of the header file
+        :return: relative path of the implementation file, or None if none is found
+        """
+        implementation_extensions = ['.cpp', '.cxx', '.cc', '.c']
+        base_name = os.path.splitext(header_path)[0]
+
+        for ext in implementation_extensions:
+            impl_path = base_name + ext
+            if impl_path in self.file_index.values():
+                return impl_path
+
+        # Not found next to the header: look the bare file name up in self.file_index.
+        file_name = os.path.basename(base_name)
+        for ext in implementation_extensions:
+            impl_file = file_name + ext
+            if impl_file in self.file_index:
+                return self.file_index[impl_file]
+
+        return None
 
     def extract_names_from_patch(self, patch_content: str) -> Tuple[Set[str], Set[str]]:
-        tu = self.index.parse('tmp.cpp', unsaved_files=[('tmp.cpp', patch_content)])
+        tu = self.index.parse('tmp.cpp', unsaved_files=[('tmp.cpp', patch_content)], args=['-std=c++11'])
         functions = set()
         variables = set()
 
@@ -344,9 +399,63 @@ def visit_node(node):
             for child in node.get_children():
                 visit_node(child)
 
-        visit_node(tu.cursor)
+        for child in tu.cursor.get_children():
+            visit_node(child)
+
+        # visit_node(tu.cursor)
         return functions, variables
 
+    def extract_functions_from_patch(self, patch_content: str) -> Set[str]:
+        """Extract candidate C/C++ function names from a patch using regexes.
+
+        Names rejected by ``_is_valid_function`` (system functions, control keywords) are
+        dropped. Variable names are not collected: callers only receive function names.
+
+        :param patch_content: patch text to scan
+        :return: names found in definitions/declarations and in call-like expressions
+        """
+        functions = set()
+
+        # Function definitions or declarations, optionally namespace-qualified.
+        function_def_pattern = r'(?:(?:\w+::)*\w+\s+)+(\w+(?:::\w+)*)\s*\([^)]*\)\s*(?:const)?\s*(?:{\s*)?'
+        # Anything shaped like ``name(...)``: calls, but also control structures.
+        potential_call_pattern = r'(\w+(?:::\w+)*)\s*\([^)]*\)'
+
+        # System functions and keywords to ignore (extend as needed).
+        system_functions = {'std::', 'boost::', 'printf', 'scanf', 'malloc', 'free', 'new', 'delete'}
+        control_structures = {'if', 'while', 'for', 'switch', 'catch'}
+
+        # Function definitions and declarations.
+        for match in re.finditer(function_def_pattern, patch_content):
+            func_name = match.group(1)
+            if self._is_valid_function(func_name, system_functions, control_structures):
+                functions.add(func_name)
+
+        # Potential function calls.
+        for match in re.finditer(potential_call_pattern, patch_content):
+            func_name = match.group(1)
+            if not self._is_valid_function(func_name, system_functions, control_structures):
+                continue
+            # Skip names whose preceding token is a control keyword. When the 20 characters
+            # before the name hold no token (start of patch, deep indentation), keep the name.
+            prev_tokens = patch_content[max(0, match.start() - 20):match.start()].split()
+            if not prev_tokens or prev_tokens[-1] not in control_structures:
+                functions.add(func_name)
+
+        return functions
+
+    def _is_valid_function(self, name: str, system_functions: Set[str], control_structures: Set[str]) -> bool:
+        """Return True when ``name`` is neither a system function nor a control keyword.
+
+        :param name: candidate identifier, possibly namespace-qualified
+        :param system_functions: entries ending in ``::`` are namespace prefixes, others are exact names
+        :param control_structures: keywords such as ``if``/``for`` that are never function names
+        """
+        # Prefix-match namespaces only, so "newton"/"free_list" are not mistaken for new/free.
+        prefixes = tuple(entry for entry in system_functions if entry.endswith('::'))
+        return name not in system_functions and name not in control_structures and not name.startswith(prefixes)
+
+
     def extract_definitions(self, content: str, names: Set[str]) -> Dict[str, str]:
         tu = self.index.parse('tmp.cpp', unsaved_files=[('tmp.cpp', content)])
         definitions = {}
diff --git a/core/analyze/base.py b/core/analyze/base.py
@@ -16,6 +16,7 @@
 import glob
 import json
 import os
+import re
 import shutil
 from typing import List, Dict, Any, Optional
 
@@ -55,7 +56,7 @@ def __init__(self, repo_fullname: str, milvus_uri: Optional[str] = None):
         self.dependencies = {}
         self.exclude_path = []
         self.milvus_uri = milvus_uri
-        self.code_elements_collection = f"code_{self.repo_fullname.replace('/', '_').lower()}"
+        self.code_elements_collection = f"v1_code_{self.repo_fullname.replace('/', '_').lower()}"
         self.code_elements_collection_loaded = False
         self.init_lock = asyncio.Lock()
 
@@ -92,26 +93,26 @@ async def check_elements_collection(self) -> bool:
             FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=20),
             FieldSchema(name="element_type", dtype=DataType.VARCHAR, max_length=20),
             FieldSchema(name="element_name", dtype=DataType.VARCHAR, max_length=100),
-            FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=20000),
+            FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
             FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768)
         ]
-        schema = CollectionSchema(fields=fields, description="代码元素嵌入向量")
+        schema = CollectionSchema(fields=fields, description="Code search collection")
         await milvus_manager.create_collection(
             dimension=768,
-            metric_type="COSINE",
+            metric_type="IP",
             collection_name=self.code_elements_collection,
             schema=schema,
             vector_field_name="embedding",
-            description="代码元素嵌入向量"
+            description="Code search collection"
         )
         # 创建向量索引
         index_params = IndexParams()
         try:
             # 判断milvus使用的模式, 本地或者内存
             if self.milvus_uri == "sqlite://:memory:" or not self.milvus_uri or self.milvus_uri.startswith("/"):
-                index_params.add_index("embedding", "FLAT", "embedding_index", metric_type="COSINE")
+                index_params.add_index("embedding", "FLAT", "embedding_index", metric_type="IP")
             else:
-                index_params.add_index("embedding", "IVF_FLAT", "embedding_index", nlist=1024, metric_type="COSINE")
+                index_params.add_index("embedding", "IVF_FLAT", "embedding_index", nlist=1024, metric_type="IP")
             await milvus_manager.create_index(
                 collection_name=self.code_elements_collection,
                 index_params=index_params
@@ -306,6 +307,8 @@ async def save_to_db(self, file_detail: FileDetails):
             #     logger.info(f"Too many code elements in {file_detail.file_name}, only saving the first 60.")
             exclude_types_list = [CodeElementType.CONSTANT, CodeElementType.VARIABLE]
             for element in file_detail.code_elements:
+                if not element['name'] or len(element['name']) == 0:
+                    continue
                 if element['type'] in exclude_types_list:
                     continue
                 if f'{element["type"]}_{element["name"]}' in added_set:
@@ -439,28 +442,65 @@ async def generate_project_overview(self, summary: Dict[str, Any]) -> str:
             messages, settings.REVIEW_MODEL, 0.3, 50, 0.9)
         return overview
 
+    def clean_patch(self, patch_content: str) -> str:
+        """Reduce a unified diff to its post-change text.
+
+        Hunk headers keep only the text after the second ``@@``, removed lines are dropped, and the
+        one-column ``+``/space marker is stripped (context lines too, so indentation stays aligned).
+        """
+        cleaned_patch = []
+        for line in patch_content.split('\n'):
+            if line.startswith('@@'):
+                cleaned_patch.append(line.split('@@', 2)[-1])
+            elif line.startswith(('+', ' ')):
+                cleaned_patch.append(line[1:])
+            elif not line.startswith('-'):
+                cleaned_patch.append(line)
+        return '\n'.join(cleaned_patch)
+
     async def get_review_context(self, filename: str, patch_content: str) -> Dict[str, Any]:
         """
         审查所需要的上下文信息
         """
-        patch_embedding = await embedding_model.async_encode_text(patch_content)
-
-        search_params = {"metric_type": "COSINE", "params": {"nprobe": 20}}
+        patch_embedding = []
+        patch_content = self.clean_patch(patch_content)
+        language = utils.get_support_file_language(filename)
+        analyzer = self.analyzers.get(language)
+        code_elements = list(analyzer.extract_functions_from_patch(patch_content))
+        if not code_elements or len(code_elements) == 0:
+            code_elements = patch_content.split("\n")
+        code_elements_count = len(code_elements)
+        limit = 20 // code_elements_count
+        if limit < 1:
+            limit = 1
+            code_elements = code_elements[:20]
+        for element in code_elements:
+            element_v = await embedding_model.async_encode_text(element)
+            patch_embedding.append(element_v.tolist())
+        search_params = {"metric_type": "IP", "params": {"nprobe": 10}}
         await self.check_elements_collection()
         results = await milvus_manager.search(
             collection_name=self.code_elements_collection,
-            data=[patch_embedding.tolist()],
+            # filter="element_name in ['" + "','".join(code_elements) + "']",
+            # filter="element_name != ''",
+            data=patch_embedding,
             anns_field="embedding",
             search_params=search_params,
-            limit=20,
+            limit=limit,
             output_fields=["file_path", "language", "element_type", "element_name", "content"]
         )
+        related_elements = []
+        for result in results:
+            if isinstance(result, dict):
+                continue
+            for code_element in result:
+                related_elements.append(code_element)
 
-        related_elements = results[0]
         # 获取相关元素的上下文信息
         context_info = self.get_context_info(related_elements)
         # 分析补丁中的依赖关系
         patch_dependencies = self.get_dependencies(filename)
+        logger.info("Dependencies: %s", patch_dependencies)
         # 项目的概述 "project_overview.md"
         project_overview = ""
         overview_path = os.path.join(self.analyze_data_path, "project_overview.md")
@@ -500,7 +540,7 @@ def get_dependencies(self, filename: str) -> Dict[str, str]:
         if not index_detail:
             return result
         for file_name in index_detail.dependencies:
-            if len(result) > 5:
+            if len(result) > 6:
                 return result
             # 读取依赖文件的内容
             file_path = os.path.join(self.project_source_path, file_name)
diff --git a/core/analyze/index.py b/core/analyze/index.py
@@ -39,6 +39,7 @@ class IndexItem(pydantic.BaseModel):
     language: str
     last_modified: float
     dependencies: List[str]
+    # code_elements: List[Dict[str, Any]]
 
 
 INDEX_PATH_PREFIX = '.index'
@@ -92,7 +93,8 @@ def insert_or_update(self, file_detail: FileDetails):
             code_hash=file_detail.code_hash,
             language=file_detail.language,
             last_modified=os.path.getmtime(os.path.join(self.source_path, file_detail.file_name)),
-            dependencies=file_detail.dependencies
+            dependencies=file_detail.dependencies,
+            # code_elements=file_detail.code_elements
         )
         with open(index_file_name, 'w') as f:
             f.write(index_item.json())
diff --git a/core/embedding.py b/core/embedding.py
@@ -59,6 +59,13 @@ def get_model(self) -> TextEmbedding:
             self.load()
         return self.embedding_model
 
+    def normalize_vector(self, vector: np.ndarray) -> np.ndarray:
+        """L2-normalize the vector; a zero-norm vector is returned unchanged."""
+        norm = np.linalg.norm(vector)
+        if norm == 0:
+            return vector
+        return vector / norm
+
     def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
         """将文本分割成更小的块"""
         words = text.split()
@@ -72,15 +79,16 @@ def encode_text(self, text: str, chunk_size: int = 1000) -> np.ndarray:
             model = self.get_model()
             if len(text.split()) <= chunk_size:
                 embeddings = next(model.embed([text]))
-                return embeddings
             else:
                 chunks = self.chunk_text(text, chunk_size)
                 embeddings = []
                 for chunk in chunks:
                     chunk_embedding = next(model.embed([chunk]))
                     embeddings.append(chunk_embedding)
                     gc.collect()  # Force garbage collection after each chunk
-                return np.mean(embeddings, axis=0)
+                embeddings = np.mean(embeddings, axis=0)
+            gc.collect()
+            return self.normalize_vector(embeddings)
         except Exception as e:
             logger.error(f"Failed to encode text: {e}")
             return np.zeros(model.dim)  # Use the dimension from the model
@@ -98,7 +106,7 @@ def process_large_document(self, document: str, chunk_size: int = 1000) -> np.nd
         result = np.mean(embeddings, axis=0)
         del embeddings
         gc.collect()
-        return result
+        return self.normalize_vector(result)
 
     # 异步包装器
     async def async_encode_text(self, text: str, chunk_size: int = 1000) -> np.ndarray: