diff --git a/docs/building-with-codegen/semantic-code-search.mdx b/docs/building-with-codegen/semantic-code-search.mdx
new file mode 100644
index 000000000..48c3b70d5
--- /dev/null
+++ b/docs/building-with-codegen/semantic-code-search.mdx
@@ -0,0 +1,111 @@
+---
+title: "Semantic Code Search"
+sidebarTitle: "Semantic Code Search"
+icon: "magnifying-glass"
+iconType: "solid"
+---
+
+Codegen's `VectorIndex` provides semantic code search over a codebase using embeddings. You can search with natural language queries and find semantically related code even when the exact terms don't appear in the source.
+
+This feature is under active development. Interested in an application? [Reach out to the team!](/introduction/about)
+
+## Basic Usage
+
+Create and save a vector index for your codebase:
+
+```python
+from codegen.extensions import VectorIndex
+
+# Initialize with your codebase
+index = VectorIndex(codebase)
+
+# Create embeddings for all files
+index.create()
+
+# Save to disk (defaults to .codegen/vector_index.pkl)
+index.save()
+```
+
+Later, load the index and perform semantic searches:
+
+```python
+from codegen import Codebase
+
+# Fetch a codebase to search
+codebase = Codebase.from_repo('fastapi/fastapi')
+
+# Load a previously created index
+index = VectorIndex(codebase)
+index.load()
+
+# Search with natural language
+results = index.similarity_search(
+    "How does FastAPI handle dependency injection?",
+    k=5  # number of results
+)
+
+# Print results with previews
+for filepath, score in results:
+    print(f"\nScore: {score:.3f} | File: {filepath}")
+    file = codebase.get_file(filepath)
+    print(f"Preview: {file.content[:200]}...")
+```
+
+The search uses cosine similarity between embeddings to find the most semantically related files, regardless of exact keyword matches.
+
+## Getting Embeddings
+
+You can also get embeddings for arbitrary text using the same model:
+
+```python
+# Get embeddings for a list of texts
+texts = [
+    "Some code or text to embed",
+    "Another piece of text"
+]
+embeddings = index.get_embeddings(texts)  # shape: (n_texts, embedding_dim)
+```
+
+## How It Works

+The `VectorIndex` class:
+1. Processes each file in your codebase
+2. Splits large files into chunks that fit within token limits
+3. Uses OpenAI's text-embedding-3-small model to create embeddings
+4. Stores embeddings in a numpy array for efficient similarity search
+5. Saves the index to disk for reuse
+
+When searching:
+1. Your query is converted to an embedding using the same model
+2. Cosine similarity is computed between the query embedding and all file embeddings
+3. The most similar files are returned, along with their similarity scores
+
+Creating embeddings requires an OpenAI API key (read from the `OPENAI_API_KEY` environment variable) with access to the embeddings endpoint.
+
+## Example Searches
+
+Here are some example semantic searches that demonstrate the power of the system:
+
+```python
+# Find authentication-related code
+results = index.similarity_search(
+    "How is user authentication implemented?",
+    k=3
+)
+
+# Find error handling patterns
+results = index.similarity_search(
+    "Show me examples of error handling and custom exceptions",
+    k=3
+)
+
+# Find configuration management
+results = index.similarity_search(
+    "Where is the application configuration and settings handled?",
+    k=3
+)
+```
+
+Semantic search understands concepts, so it returns relevant results even when the query's exact terms never appear in the code.
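+Results are plain `(filepath, score)` tuples, so they compose with the rest of the codebase API. As a rough sketch (the `0.2` threshold here is an arbitrary choice for illustration, not a library recommendation):
+
+```python
+# Over-fetch, then keep only the more confident matches
+results = index.similarity_search("Where are websocket routes registered?", k=10)
+confident = [(path, score) for path, score in results if score > 0.2]
+
+for filepath, score in confident:
+    print(f"{score:.3f}  {filepath}")
+```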
diff --git a/docs/mint.json b/docs/mint.json
index 147072fd0..7947f5aee 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -134,6 +134,7 @@
         "building-with-codegen/codebase-visualization",
         "building-with-codegen/flagging-symbols",
         "building-with-codegen/calling-out-to-llms",
+        "building-with-codegen/semantic-code-search",
         "building-with-codegen/reducing-conditions"
       ]
     },
diff --git a/pyproject.toml b/pyproject.toml
index ab61da8ba..9ff56fef8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,7 @@ dependencies = [
     "langchain[openai]",
     "langchain_core",
     "langchain_openai",
+    "numpy>=2.2.2",
 ]
 license = { text = "Apache-2.0" }
diff --git a/src/codegen/extensions/__init__.py b/src/codegen/extensions/__init__.py
index e69de29bb..3958271ac 100644
--- a/src/codegen/extensions/__init__.py
+++ b/src/codegen/extensions/__init__.py
@@ -0,0 +1,5 @@
+"""Extensions for the codegen package."""
+
+from codegen.extensions.vector_index import VectorIndex
+
+__all__ = ["VectorIndex"]
diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
index 6378e9eb5..f51872dce 100644
--- a/src/codegen/extensions/langchain/tools.py
+++ b/src/codegen/extensions/langchain/tools.py
@@ -19,6 +19,7 @@
     reveal_symbol,
     search,
     semantic_edit,
+    semantic_search,
     view_file,
 )
@@ -317,3 +318,27 @@ def _run(
             include_dependencies=include_dependencies,
         )
         return json.dumps(result, indent=2)
+
+
+class SemanticSearchInput(BaseModel):
+    """Input schema for semantic code search."""
+
+    query: str = Field(..., description="The natural language search query")
+    k: int = Field(default=5, description="Number of results to return")
+    preview_length: int = Field(default=200, description="Length of content preview in characters")
+
+
+class SemanticSearchTool(BaseTool):
+    """Tool for semantic code search."""
+
+    name: ClassVar[str] = "semantic_search"
+    description: ClassVar[str] = "Search the codebase using natural language queries and semantic similarity"
+    # A declarative input model is required here: constructing the schema with
+    # type() and tuple values does not register pydantic fields, since fields
+    # are declared via annotations.
+    args_schema: ClassVar[type[BaseModel]] = SemanticSearchInput
+    codebase: Codebase = Field(exclude=True)
+
+    def __init__(self, codebase: Codebase) -> None:
+        super().__init__(codebase=codebase)
+
+    def _run(self, query: str, k: int = 5, preview_length: int = 200) -> str:
+        result = semantic_search(self.codebase, query, k=k, preview_length=preview_length)
+        return json.dumps(result, indent=2)
diff --git a/src/codegen/extensions/tools/__init__.py b/src/codegen/extensions/tools/__init__.py
index be93d6aa4..9ce7b4f90 100644
--- a/src/codegen/extensions/tools/__init__.py
+++ b/src/codegen/extensions/tools/__init__.py
@@ -13,6 +13,7 @@
 from .reveal_symbol import reveal_symbol
 from .search import search
 from .semantic_edit import semantic_edit
+from .semantic_search import semantic_search
 
 __all__ = [
     "commit",
@@ -29,5 +30,6 @@
     "search",
     # Semantic edit
     "semantic_edit",
+    "semantic_search",
     "view_file",
 ]
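The tool plugs into LangChain like any other `BaseTool`. A minimal usage sketch, assuming a codebase is already available (the repo name is illustrative; the invocation style is standard LangChain, not anything specific to this PR):

```python
from codegen import Codebase
from codegen.extensions.langchain.tools import SemanticSearchTool

codebase = Codebase.from_repo("fastapi/fastapi")
tool = SemanticSearchTool(codebase)

# BaseTool.run accepts a dict matching the args schema
output = tool.run({"query": "How are background tasks scheduled?", "k": 3})
print(output)  # JSON string with status, query, and results
```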
diff --git a/src/codegen/extensions/tools/semantic_search.py b/src/codegen/extensions/tools/semantic_search.py
new file mode 100644
index 000000000..cd1ac04fd
--- /dev/null
+++ b/src/codegen/extensions/tools/semantic_search.py
@@ -0,0 +1,88 @@
+"""Semantic search over codebase files."""
+
+from typing import Any, Optional
+
+from codegen import Codebase
+from codegen.extensions.vector_index import VectorIndex
+
+
+def semantic_search(
+    codebase: Codebase,
+    query: str,
+    k: int = 5,
+    preview_length: int = 200,
+    index_path: Optional[str] = None,
+) -> dict[str, Any]:
+    """Search the codebase using semantic similarity.
+
+    This function provides semantic search over a codebase using OpenAI's embeddings.
+    Currently, it loads/saves the index from disk on each call, but could be optimized
+    to keep embeddings in memory for frequently accessed codebases.
+
+    TODO(CG-XXXX): Add support for maintaining embeddings in memory across searches,
+    potentially with an LRU cache or similar mechanism to avoid recomputing embeddings
+    for frequently searched codebases.
+
+    Args:
+        codebase: The codebase to search
+        query: The search query in natural language
+        k: Number of results to return (default: 5)
+        preview_length: Length of content preview in characters (default: 200)
+        index_path: Optional path to a saved vector index
+
+    Returns:
+        Dict containing search results or error information. Format:
+        {
+            "status": "success",
+            "query": str,
+            "results": [
+                {
+                    "filepath": str,
+                    "score": float,
+                    "preview": str
+                },
+                ...
+            ]
+        }
+        Or on error:
+        {
+            "error": str
+        }
+    """
+    try:
+        # Initialize vector index
+        index = VectorIndex(codebase)
+
+        # Try to load an existing index
+        try:
+            if index_path:
+                index.load(index_path)
+            else:
+                index.load()
+        except FileNotFoundError:
+            # Create a new index if none exists
+            index.create()
+            index.save(index_path)
+
+        # Perform the search
+        results = index.similarity_search(query, k=k)
+
+        # Format results with previews
+        formatted_results = []
+        for filepath, score in results:
+            try:
+                # Chunked files are indexed under "path#chunk<i>" keys; strip the
+                # suffix so the preview is read from the actual file.
+                base_path = filepath.split("#chunk")[0]
+                file = codebase.get_file(base_path)
+                preview = file.content[:preview_length].replace("\n", " ").strip()
+                if len(file.content) > preview_length:
+                    preview += "..."
+
+                formatted_results.append({"filepath": filepath, "score": float(score), "preview": preview})
+            except Exception as e:
+                # Skip files that can't be read
+                print(f"Warning: Could not read file {filepath}: {e}")
+                continue
+
+        return {"status": "success", "query": query, "results": formatted_results}
+
+    except Exception as e:
+        return {"error": f"Failed to perform semantic search: {e!s}"}
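Since `semantic_search` returns a plain dict rather than raising, callers branch on the `error` key. A small usage sketch, grounded in the return format documented above (the repo name and query are illustrative):

```python
from codegen import Codebase
from codegen.extensions.tools import semantic_search

codebase = Codebase.from_repo("fastapi/fastapi")
result = semantic_search(codebase, "Where is request validation implemented?", k=3)

if "error" in result:
    print(f"Search failed: {result['error']}")
else:
    for hit in result["results"]:
        print(f"{hit['score']:.3f}  {hit['filepath']}")
        print(f"    {hit['preview']}")
```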
diff --git a/src/codegen/extensions/vector_index.py b/src/codegen/extensions/vector_index.py
new file mode 100644
index 000000000..7459c0042
--- /dev/null
+++ b/src/codegen/extensions/vector_index.py
@@ -0,0 +1,226 @@
+"""Vector index for semantic search over codebase files."""
+
+import pickle
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import tiktoken
+from openai import OpenAI
+from tqdm import tqdm
+
+from codegen import Codebase
+
+
+class VectorIndex:
+    """A vector index for semantic search over codebase files.
+
+    This class manages embeddings for all files in a codebase, allowing for semantic
+    search and similarity comparisons. It uses OpenAI's text-embedding model to
+    generate embeddings and stores them efficiently on disk.
+
+    Attributes:
+        codebase (Codebase): The codebase to index
+        E (Optional[np.ndarray]): The embeddings matrix, shape (n_chunks, embedding_dim)
+        file_paths (Optional[np.ndarray]): Array of file paths (or "path#chunk<i>" keys) corresponding to rows of E
+    """
+
+    DEFAULT_SAVE_DIR = ".codegen"
+    DEFAULT_SAVE_FILE = "vector_index.pkl"
+    EMBEDDING_MODEL = "text-embedding-3-small"
+    MAX_TOKENS = 8000
+    BATCH_SIZE = 100
+
+    def __init__(self, codebase: Codebase):
+        """Initialize the vector index.
+
+        Args:
+            codebase: The codebase to create embeddings for
+        """
+        self.codebase = codebase
+        self.E: Optional[np.ndarray] = None
+        self.file_paths: Optional[np.ndarray] = None
+
+        # Initialize OpenAI client and tokenizer
+        self.client = OpenAI()
+        self.encoding = tiktoken.get_encoding("cl100k_base")
+
+    def _get_default_save_path(self) -> Path:
+        """Get the default save path for the vector index."""
+        save_dir = Path(self.codebase.repo_path) / self.DEFAULT_SAVE_DIR
+        save_dir.mkdir(exist_ok=True)
+        return save_dir / self.DEFAULT_SAVE_FILE
+
+    def _get_embeddings(self, texts: list[str]) -> list[list[float]]:
+        """Get embeddings for a batch of texts using OpenAI's API."""
+        # Replace newlines with spaces before embedding
+        texts = [text.replace("\n", " ") for text in texts]
+
+        response = self.client.embeddings.create(model=self.EMBEDDING_MODEL, input=texts, encoding_format="float")
+        return [data.embedding for data in response.data]
+
+    def _split_by_tokens(self, text: str) -> list[str]:
+        """Split text into chunks of at most MAX_TOKENS tokens each."""
+        tokens = self.encoding.encode(text)
+        return [self.encoding.decode(tokens[i : i + self.MAX_TOKENS]) for i in range(0, len(tokens), self.MAX_TOKENS)]
+
+    def create(self) -> None:
+        """Create embeddings for all files in the codebase.
+
+        This method processes all files in the codebase, generates embeddings using
+        OpenAI's API, and stores them in memory. The embeddings can then be saved
+        to disk using save().
+        """
+        # Store file paths (or chunk keys) and their embeddings
+        file_embeddings = {}
+
+        # Collect all valid files and their chunks
+        chunks_to_process = []
+        for file in tqdm(self.codebase.files, desc="Collecting files"):
+            content = file.content
+            if not content:  # Skip empty files
+                continue
+
+            # Split content into chunks by token count
+            content_chunks = self._split_by_tokens(content)
+
+            if len(content_chunks) == 1:
+                # If only one chunk, store as is
+                chunks_to_process.append((file.filepath, content, 0))
+            else:
+                # If multiple chunks, store with chunk index
+                for i, chunk in enumerate(content_chunks):
+                    chunks_to_process.append((file.filepath, chunk, i))
+
+        # Process in batches
+        for i in tqdm(range(0, len(chunks_to_process), self.BATCH_SIZE), desc="Processing batches"):
+            batch = chunks_to_process[i : i + self.BATCH_SIZE]
+            filepaths, contents, chunk_indices = zip(*batch)
+
+            try:
+                # Get embeddings for the batch
+                embeddings = self._get_embeddings(contents)
+
+                # Store results
+                for filepath, content, chunk_idx, embedding in zip(filepaths, contents, chunk_indices, embeddings):
+                    key = filepath if chunk_idx == 0 else f"{filepath}#chunk{chunk_idx}"
+                    file_embeddings[key] = {"embedding": embedding, "content": content, "size": len(content), "chunk_index": chunk_idx}
+            except Exception as e:
+                print(f"Error processing batch {i // self.BATCH_SIZE}: {e}")
+
+        # Convert to numpy arrays
+        embeddings_list = []
+        file_paths = []
+
+        for filepath, data in file_embeddings.items():
+            embeddings_list.append(data["embedding"])
+            file_paths.append(filepath)
+
+        self.E = np.array(embeddings_list)
+        self.file_paths = np.array(file_paths)
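+    # Note: after create(), E has shape (n_chunks, embedding_dim) and file_paths
+    # aligns with its rows. A file split into multiple chunks contributes one row
+    # per chunk under the key "path#chunk<i>", so a single source file can appear
+    # more than once in search results.
+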
+    def save(self, save_path: Optional[str] = None) -> None:
+        """Save the vector index to disk.
+
+        Args:
+            save_path: Optional path to save the index to. If not provided,
+                saves to .codegen/vector_index.pkl in the repo root.
+        """
+        if self.E is None or self.file_paths is None:
+            msg = "No embeddings to save. Call create() first."
+            raise ValueError(msg)
+
+        save_path = Path(save_path) if save_path else self._get_default_save_path()
+
+        # Ensure parent directory exists
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(save_path, "wb") as f:
+            pickle.dump({"E": self.E, "file_paths": self.file_paths}, f)
+
+    def load(self, load_path: Optional[str] = None) -> None:
+        """Load a previously saved vector index from disk.
+
+        Args:
+            load_path: Optional path to load the index from. If not provided,
+                loads from .codegen/vector_index.pkl in the repo root.
+        """
+        load_path = Path(load_path) if load_path else self._get_default_save_path()
+
+        if not load_path.exists():
+            msg = f"No vector index found at {load_path}"
+            raise FileNotFoundError(msg)
+
+        with open(load_path, "rb") as f:
+            data = pickle.load(f)
+            # Handle both old and new format
+            self.E = data.get("E", data.get("embeddings"))
+            self.file_paths = data["file_paths"]
+
+    def get_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Get embeddings for a list of texts using the same model as the index.
+
+        Args:
+            texts: List of text strings to get embeddings for
+
+        Returns:
+            np.ndarray: Array of embeddings with shape (len(texts), embedding_dim)
+        """
+        # Clean and get embeddings
+        embeddings = self._get_embeddings(texts)
+        return np.array(embeddings)
+
+    def similarity_search(self, query: str, k: int = 5) -> list[tuple[str, float]]:
+        """Find the k most similar files to a query text.
+
+        Uses cosine similarity between the query embedding and all file embeddings
+        to find the most similar files.
+
+        Args:
+            query: The text to search for
+            k: Number of results to return (default: 5)
+
+        Returns:
+            List of tuples (filepath, similarity_score) sorted by similarity (highest first)
+
+        Raises:
+            ValueError: If the index hasn't been created or loaded yet (E is None)
+        """
+        if self.E is None or self.file_paths is None:
+            msg = "No embeddings available. Call create() or load() first."
+            raise ValueError(msg)
+
+        # Get query embedding
+        query_embedding = self.get_embeddings([query])[0]
+
+        # Compute cosine similarity: normalize the query and all rows of E,
+        # then take dot products
+        query_norm = query_embedding / np.linalg.norm(query_embedding)
+        E_norm = self.E / np.linalg.norm(self.E, axis=1)[:, np.newaxis]
+        similarities = np.dot(E_norm, query_norm)
+
+        # Get indices of the top k scores, highest first
+        top_indices = np.argsort(similarities)[-k:][::-1]
+
+        # Return filepath and similarity score pairs
+        results = []
+        for idx in top_indices:
+            results.append((self.file_paths[idx], float(similarities[idx])))
+
+        return results
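For intuition about the ranking step in `similarity_search`, here is a self-contained sketch of the same normalize-and-dot-product math on toy vectors (nothing here touches the OpenAI API; the numbers are made up for illustration):

```python
import numpy as np

# Toy "embeddings": three documents in a 3-dimensional space
E = np.array([
    [1.0, 0.0, 0.0],
    [0.7, 0.7, 0.0],
    [0.0, 0.0, 1.0],
])
query = np.array([1.0, 0.1, 0.0])

# Same cosine similarity computation used by similarity_search
q = query / np.linalg.norm(query)
E_norm = E / np.linalg.norm(E, axis=1)[:, np.newaxis]
sims = E_norm @ q

top = np.argsort(sims)[-2:][::-1]  # top-2 indices, highest first
print(top, sims[top])  # doc 0 ranks first, then doc 1
```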