diff --git a/docs/building-with-codegen/semantic-code-search.mdx b/docs/building-with-codegen/semantic-code-search.mdx
new file mode 100644
index 000000000..48c3b70d5
--- /dev/null
+++ b/docs/building-with-codegen/semantic-code-search.mdx
@@ -0,0 +1,111 @@
+---
+title: "Semantic Code Search"
+sidebarTitle: "Semantic Code Search"
+icon: "magnifying-glass"
+iconType: "solid"
+---
+
+Codegen's `VectorIndex` provides semantic code search over a codebase using embeddings. You can search with natural language queries and find semantically related code even when the exact terms aren't present.
+
+This is under active development. Interested in an application? [Reach out to the team!](/introduction/about)
+
+## Basic Usage
+
+Create and save a vector index for your codebase:
+
+```python
+from codegen import Codebase
+from codegen.extensions import VectorIndex
+
+# Parse your codebase (here, the current directory)
+codebase = Codebase("./")
+
+# Initialize the index with your codebase
+index = VectorIndex(codebase)
+
+# Create embeddings for all files
+index.create()
+
+# Save to disk (defaults to .codegen/vector_index.pkl)
+index.save()
+```
+
+Later, load the index and perform semantic searches:
+
+```python
+from codegen import Codebase
+from codegen.extensions import VectorIndex
+
+# Initialize the codebase
+codebase = Codebase.from_repo('fastapi/fastapi')
+
+# Load a previously created index
+index = VectorIndex(codebase)
+index.load()
+
+# Search with natural language
+results = index.similarity_search(
+ "How does FastAPI handle dependency injection?",
+ k=5 # number of results
+)
+
+# Print results with previews
+for filepath, score in results:
+ print(f"\nScore: {score:.3f} | File: {filepath}")
+ file = codebase.get_file(filepath)
+ print(f"Preview: {file.content[:200]}...")
+```
+
+
+The search uses cosine similarity between embeddings to find the most semantically related files, regardless of exact keyword matches.
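+
+Scores are cosine similarities, so they fall in [-1, 1], with higher values indicating closer matches. A quick way to drop weak matches (the 0.3 cutoff below is arbitrary and purely illustrative):
+
+```python
+# Keep only reasonably confident matches (threshold chosen for illustration)
+strong_matches = [(path, score) for path, score in results if score > 0.3]
+```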
+
+
+## Getting Embeddings
+
+You can also get embeddings for arbitrary text using the same model:
+
+```python
+# Get embeddings for a list of texts
+texts = [
+ "Some code or text to embed",
+ "Another piece of text"
+]
+embeddings = index.get_embeddings(texts) # shape: (n_texts, embedding_dim)
+```
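+
+Since `get_embeddings` returns a numpy array, you can compare arbitrary texts directly. A minimal sketch (the snippets below are illustrative):
+
+```python
+import numpy as np
+
+# Embed two hypothetical snippets
+emb = index.get_embeddings([
+    "def authenticate(user): ...",
+    "How does login verification work?",
+])
+
+# Cosine similarity between the two vectors
+a, b = emb[0], emb[1]
+similarity = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+print(f"Similarity: {similarity:.3f}")
+```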
+
+## How It Works
+
+The `VectorIndex` class:
+1. Processes each file in your codebase
+2. Splits large files into chunks that fit within token limits (see the sketch after this list)
+3. Uses OpenAI's text-embedding-3-small model to create embeddings
+4. Stores embeddings in a numpy array for efficient similarity search
+5. Saves the index to disk for reuse
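+
+A minimal sketch of the chunking in step 2, assuming tiktoken's `cl100k_base` encoding (the 8000-token limit mirrors the index's default):
+
+```python
+import tiktoken
+
+enc = tiktoken.get_encoding("cl100k_base")
+MAX_TOKENS = 8000
+
+def split_by_tokens(text: str) -> list[str]:
+    # Slice the token stream into fixed-size windows and decode each back to text
+    tokens = enc.encode(text)
+    return [enc.decode(tokens[i : i + MAX_TOKENS]) for i in range(0, len(tokens), MAX_TOKENS)]
+```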
+
+When searching:
+1. Your query is converted to an embedding using the same model
+2. Cosine similarity is computed between the query and all file embeddings
+3. The most similar files are returned, along with their similarity scores
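+
+Conceptually, the search step reduces to a normalized matrix-vector product. A simplified sketch, where `E` is the embeddings matrix and `q` the query embedding:
+
+```python
+import numpy as np
+
+def top_k(E: np.ndarray, q: np.ndarray, k: int = 5) -> np.ndarray:
+    """Return indices of the k rows of E most similar to q by cosine similarity."""
+    E_norm = E / np.linalg.norm(E, axis=1, keepdims=True)
+    q_norm = q / np.linalg.norm(q)
+    similarities = E_norm @ q_norm
+    return np.argsort(similarities)[-k:][::-1]  # highest similarity first
+```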
+
+
+Creating embeddings requires an OpenAI API key with access to the embeddings endpoint.
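+
+The OpenAI client reads the key from the `OPENAI_API_KEY` environment variable, so set it before creating the index (placeholder value shown):
+
+```python
+import os
+
+os.environ["OPENAI_API_KEY"] = "sk-..."  # or export it in your shell
+```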
+
+
+## Example Searches
+
+Here are some example searches that illustrate how semantic matching finds relevant code without exact keyword overlap:
+
+```python
+# Find authentication-related code
+results = index.similarity_search(
+ "How is user authentication implemented?",
+ k=3
+)
+
+# Find error handling patterns
+results = index.similarity_search(
+ "Show me examples of error handling and custom exceptions",
+ k=3
+)
+
+# Find configuration management
+results = index.similarity_search(
+ "Where is the application configuration and settings handled?",
+ k=3
+)
+```
+
+Because matching is based on meaning rather than keywords, these queries surface relevant files even when they share no terms with the code.
diff --git a/docs/mint.json b/docs/mint.json
index 147072fd0..7947f5aee 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -134,6 +134,7 @@
"building-with-codegen/codebase-visualization",
"building-with-codegen/flagging-symbols",
"building-with-codegen/calling-out-to-llms",
+ "building-with-codegen/semantic-code-search",
"building-with-codegen/reducing-conditions"
]
},
diff --git a/pyproject.toml b/pyproject.toml
index ab61da8ba..9ff56fef8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,7 @@ dependencies = [
"langchain[openai]",
"langchain_core",
"langchain_openai",
+ "numpy>=2.2.2",
]
license = { text = "Apache-2.0" }
diff --git a/src/codegen/extensions/__init__.py b/src/codegen/extensions/__init__.py
index e69de29bb..3958271ac 100644
--- a/src/codegen/extensions/__init__.py
+++ b/src/codegen/extensions/__init__.py
@@ -0,0 +1,5 @@
+"""Extensions for the codegen package."""
+
+from codegen.extensions.vector_index import VectorIndex
+
+__all__ = ["VectorIndex"]
diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
index 6378e9eb5..f51872dce 100644
--- a/src/codegen/extensions/langchain/tools.py
+++ b/src/codegen/extensions/langchain/tools.py
@@ -19,6 +19,7 @@
reveal_symbol,
search,
semantic_edit,
+ semantic_search,
view_file,
)
@@ -317,3 +318,27 @@ def _run(
include_dependencies=include_dependencies,
)
return json.dumps(result, indent=2)
+
+
+class SemanticSearchInput(BaseModel):
+    """Input schema for semantic search."""
+
+    query: str = Field(..., description="The natural language search query")
+    k: int = Field(default=5, description="Number of results to return")
+    preview_length: int = Field(default=200, description="Length of content preview in characters")
+
+
+class SemanticSearchTool(BaseTool):
+    """Tool for semantic code search."""
+
+    name: ClassVar[str] = "semantic_search"
+    description: ClassVar[str] = "Search the codebase using natural language queries and semantic similarity"
+    args_schema: ClassVar[type[BaseModel]] = SemanticSearchInput
+ codebase: Codebase = Field(exclude=True)
+
+ def __init__(self, codebase: Codebase) -> None:
+ super().__init__(codebase=codebase)
+
+ def _run(self, query: str, k: int = 5, preview_length: int = 200) -> str:
+ result = semantic_search(self.codebase, query, k=k, preview_length=preview_length)
+ return json.dumps(result, indent=2)
diff --git a/src/codegen/extensions/tools/__init__.py b/src/codegen/extensions/tools/__init__.py
index be93d6aa4..9ce7b4f90 100644
--- a/src/codegen/extensions/tools/__init__.py
+++ b/src/codegen/extensions/tools/__init__.py
@@ -13,6 +13,7 @@
from .reveal_symbol import reveal_symbol
from .search import search
from .semantic_edit import semantic_edit
+from .semantic_search import semantic_search
__all__ = [
"commit",
@@ -29,5 +30,6 @@
"search",
# Semantic edit
"semantic_edit",
+ "semantic_search",
"view_file",
]
diff --git a/src/codegen/extensions/tools/semantic_search.py b/src/codegen/extensions/tools/semantic_search.py
new file mode 100644
index 000000000..cd1ac04fd
--- /dev/null
+++ b/src/codegen/extensions/tools/semantic_search.py
@@ -0,0 +1,88 @@
+"""Semantic search over codebase files."""
+
+from typing import Any, Optional
+
+from codegen import Codebase
+from codegen.extensions.vector_index import VectorIndex
+
+
+def semantic_search(
+ codebase: Codebase,
+ query: str,
+ k: int = 5,
+ preview_length: int = 200,
+ index_path: Optional[str] = None,
+) -> dict[str, Any]:
+ """Search the codebase using semantic similarity.
+
+ This function provides semantic search over a codebase by using OpenAI's embeddings.
+ Currently, it loads/saves the index from disk each time, but could be optimized to
+ maintain embeddings in memory for frequently accessed codebases.
+
+ TODO(CG-XXXX): Add support for maintaining embeddings in memory across searches,
+ potentially with an LRU cache or similar mechanism to avoid recomputing embeddings
+ for frequently searched codebases.
+
+ Args:
+ codebase: The codebase to search
+ query: The search query in natural language
+ k: Number of results to return (default: 5)
+ preview_length: Length of content preview in characters (default: 200)
+ index_path: Optional path to a saved vector index
+
+ Returns:
+ Dict containing search results or error information. Format:
+ {
+ "status": "success",
+ "query": str,
+ "results": [
+ {
+ "filepath": str,
+ "score": float,
+ "preview": str
+ },
+ ...
+ ]
+ }
+ Or on error:
+ {
+ "error": str
+ }
+ """
+ try:
+ # Initialize vector index
+ index = VectorIndex(codebase)
+
+ # Try to load existing index
+ try:
+ if index_path:
+ index.load(index_path)
+ else:
+ index.load()
+ except FileNotFoundError:
+ # Create new index if none exists
+ index.create()
+ index.save(index_path)
+
+ # Perform search
+ results = index.similarity_search(query, k=k)
+
+ # Format results with previews
+ formatted_results = []
+ for filepath, score in results:
+ try:
+ file = codebase.get_file(filepath)
+ preview = file.content[:preview_length].replace("\n", " ").strip()
+ if len(file.content) > preview_length:
+ preview += "..."
+
+ formatted_results.append({"filepath": filepath, "score": float(score), "preview": preview})
+ except Exception as e:
+ # Skip files that can't be read
+ print(f"Warning: Could not read file {filepath}: {e}")
+ continue
+
+ return {"status": "success", "query": query, "results": formatted_results}
+
+ except Exception as e:
+ return {"error": f"Failed to perform semantic search: {e!s}"}
diff --git a/src/codegen/extensions/vector_index.py b/src/codegen/extensions/vector_index.py
new file mode 100644
index 000000000..7459c0042
--- /dev/null
+++ b/src/codegen/extensions/vector_index.py
@@ -0,0 +1,226 @@
+"""Vector index for semantic search over codebase files."""
+
+import pickle
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import tiktoken
+from openai import OpenAI
+from tqdm import tqdm
+
+from codegen import Codebase
+
+
+class VectorIndex:
+ """A vector index for semantic search over codebase files.
+
+ This class manages embeddings for all files in a codebase, allowing for semantic search
+ and similarity comparisons. It uses OpenAI's text-embedding model to generate embeddings
+ and stores them efficiently on disk.
+
+ Attributes:
+ codebase (Codebase): The codebase to index
+        E (Optional[np.ndarray]): The embeddings matrix, shape (n_items, embedding_dim); one row per file or file chunk
+        file_paths (Optional[np.ndarray]): Array of file paths (or "path#chunkN" keys) aligned with the rows of E
+ """
+
+ DEFAULT_SAVE_DIR = ".codegen"
+ DEFAULT_SAVE_FILE = "vector_index.pkl"
+ EMBEDDING_MODEL = "text-embedding-3-small"
+ MAX_TOKENS = 8000
+ BATCH_SIZE = 100
+
+ def __init__(self, codebase: Codebase):
+ """Initialize the vector index.
+
+ Args:
+ codebase: The codebase to create embeddings for
+ """
+ self.codebase = codebase
+ self.E: Optional[np.ndarray] = None
+ self.file_paths: Optional[np.ndarray] = None
+
+ # Initialize OpenAI client and tokenizer
+ self.client = OpenAI()
+ self.encoding = tiktoken.get_encoding("cl100k_base")
+
+ def _get_default_save_path(self) -> Path:
+ """Get the default save path for the vector index."""
+ save_dir = Path(self.codebase.repo_path) / self.DEFAULT_SAVE_DIR
+ save_dir.mkdir(exist_ok=True)
+ return save_dir / self.DEFAULT_SAVE_FILE
+
+ def _get_embeddings(self, texts: list[str]) -> list[list[float]]:
+ """Get embeddings for a batch of texts using OpenAI's API."""
+        # Replace newlines with spaces before embedding
+        texts = [text.replace("\n", " ") for text in texts]
+
+ response = self.client.embeddings.create(model=self.EMBEDDING_MODEL, input=texts, encoding_format="float")
+ return [data.embedding for data in response.data]
+
+    def _split_by_tokens(self, text: str) -> list[str]:
+        """Split text into chunks of at most MAX_TOKENS tokens each."""
+        tokens = self.encoding.encode(text)
+        # Slice the token stream into fixed-size windows and decode each back to text
+        return [self.encoding.decode(tokens[i : i + self.MAX_TOKENS]) for i in range(0, len(tokens), self.MAX_TOKENS)]
+
+ def create(self) -> None:
+ """Create embeddings for all files in the codebase.
+
+ This method processes all files in the codebase, generates embeddings using
+ OpenAI's API, and stores them in memory. The embeddings can then be saved
+ to disk using save().
+ """
+ # Store file paths and their embeddings
+ file_embeddings = {}
+
+ # Collect all valid files and their chunks
+ chunks_to_process = []
+ for file in tqdm(self.codebase.files, desc="Collecting files"):
+ content = file.content
+ if not content: # Skip empty files
+ continue
+
+ # Split content into chunks by token count
+ content_chunks = self._split_by_tokens(content)
+
+ if len(content_chunks) == 1:
+ # If only one chunk, store as is
+ chunks_to_process.append((file.filepath, content, 0))
+ else:
+ # If multiple chunks, store with chunk index
+ for i, chunk in enumerate(content_chunks):
+ chunks_to_process.append((file.filepath, chunk, i))
+
+ # Process in batches
+ for i in tqdm(range(0, len(chunks_to_process), self.BATCH_SIZE), desc="Processing batches"):
+ batch = chunks_to_process[i : i + self.BATCH_SIZE]
+ filepaths, contents, chunk_indices = zip(*batch)
+
+ try:
+ # Get embeddings for the batch
+ embeddings = self._get_embeddings(contents)
+
+ # Store results
+ for filepath, content, chunk_idx, embedding in zip(filepaths, contents, chunk_indices, embeddings):
+ key = filepath if chunk_idx == 0 else f"{filepath}#chunk{chunk_idx}"
+ file_embeddings[key] = {"embedding": embedding, "content": content, "size": len(content), "chunk_index": chunk_idx}
+ except Exception as e:
+ print(f"Error processing batch {i // self.BATCH_SIZE}: {e}")
+
+ # Convert to numpy arrays
+ embeddings_list = []
+ file_paths = []
+
+ for filepath, data in file_embeddings.items():
+ embeddings_list.append(data["embedding"])
+ file_paths.append(filepath)
+
+ self.E = np.array(embeddings_list)
+ self.file_paths = np.array(file_paths)
+
+ def save(self, save_path: Optional[str] = None) -> None:
+ """Save the vector index to disk.
+
+ Args:
+ save_path: Optional path to save the index to. If not provided,
+ saves to .codegen/vector_index.pkl in the repo root.
+ """
+ if self.E is None or self.file_paths is None:
+ msg = "No embeddings to save. Call create() first."
+ raise ValueError(msg)
+
+ save_path = Path(save_path) if save_path else self._get_default_save_path()
+
+ # Ensure parent directory exists
+ save_path.parent.mkdir(parents=True, exist_ok=True)
+
+ with open(save_path, "wb") as f:
+ pickle.dump({"E": self.E, "file_paths": self.file_paths}, f)
+
+ def load(self, load_path: Optional[str] = None) -> None:
+ """Load a previously saved vector index from disk.
+
+ Args:
+ load_path: Optional path to load the index from. If not provided,
+ loads from .codegen/vector_index.pkl in the repo root.
+ """
+ load_path = Path(load_path) if load_path else self._get_default_save_path()
+
+ if not load_path.exists():
+ msg = f"No vector index found at {load_path}"
+ raise FileNotFoundError(msg)
+
+ with open(load_path, "rb") as f:
+ data = pickle.load(f)
+ # Handle both old and new format
+ self.E = data.get("E", data.get("embeddings"))
+ self.file_paths = data["file_paths"]
+
+ def get_embeddings(self, texts: list[str]) -> np.ndarray:
+ """Get embeddings for a list of texts using the same model as the index.
+
+ Args:
+ texts: List of text strings to get embeddings for
+
+ Returns:
+ np.ndarray: Array of embeddings with shape (len(texts), embedding_dim)
+ """
+ # Clean and get embeddings
+ embeddings = self._get_embeddings(texts)
+ return np.array(embeddings)
+
+ def similarity_search(self, query: str, k: int = 5) -> list[tuple[str, float]]:
+ """Find the k most similar files to a query text.
+
+ Uses cosine similarity between the query embedding and all file embeddings
+ to find the most similar files.
+
+ Args:
+ query: The text to search for
+ k: Number of results to return (default: 5)
+
+ Returns:
+ List of tuples (filepath, similarity_score) sorted by similarity (highest first)
+
+ Raises:
+ ValueError: If the index hasn't been created yet (E is None)
+ """
+ if self.E is None or self.file_paths is None:
+ msg = "No embeddings available. Call create() or load() first."
+ raise ValueError(msg)
+
+ # Get query embedding
+ query_embedding = self.get_embeddings([query])[0]
+
+ # Compute cosine similarity
+ # Normalize vectors for cosine similarity
+ query_norm = query_embedding / np.linalg.norm(query_embedding)
+ E_norm = self.E / np.linalg.norm(self.E, axis=1)[:, np.newaxis]
+ similarities = np.dot(E_norm, query_norm)
+
+ # Get top k indices
+ top_indices = np.argsort(similarities)[-k:][::-1]
+
+ # Return filepath and similarity score pairs
+ results = []
+ for idx in top_indices:
+ results.append((self.file_paths[idx], float(similarities[idx])))
+
+ return results