diff --git a/docs/building-with-codegen/semantic-code-search.mdx b/docs/building-with-codegen/semantic-code-search.mdx
new file mode 100644
index 000000000..48c3b70d5
--- /dev/null
+++ b/docs/building-with-codegen/semantic-code-search.mdx
@@ -0,0 +1,111 @@
+---
+title: "Semantic Code Search"
+sidebarTitle: "Semantic Code Search"
+icon: "magnifying-glass"
+iconType: "solid"
+---
+
+Codegen's `VectorIndex` provides semantic code search over a codebase using embeddings. You can search with natural language queries and find semantically related code even when the exact terms don't appear in the source.
+
+This feature is under active development. Interested in an application? [Reach out to the team!](/introduction/about)
+
+## Basic Usage
+
+Create and save a vector index for your codebase:
+
+```python
+from codegen.extensions import VectorIndex
+
+# Initialize with your codebase
+index = VectorIndex(codebase)
+
+# Create embeddings for all files
+index.create()
+
+# Save to disk (defaults to .codegen/vector_index.pkl)
+index.save()
+```
+
+Later, load the index and perform semantic searches:
+
+```python
+from codegen import Codebase
+
+# Fetch a codebase to search
+codebase = Codebase.from_repo('fastapi/fastapi')
+
+# Load a previously created index
+index = VectorIndex(codebase)
+index.load()
+
+# Search with natural language
+results = index.similarity_search(
+    "How does FastAPI handle dependency injection?",
+    k=5  # number of results
+)
+
+# Print results with previews
+for filepath, score in results:
+    print(f"\nScore: {score:.3f} | File: {filepath}")
+    file = codebase.get_file(filepath)
+    print(f"Preview: {file.content[:200]}...")
+```
+
+The search uses cosine similarity between embeddings to find the most semantically related files, regardless of exact keyword matches.
+
+## Getting Embeddings
+
+You can also get embeddings for arbitrary text using the same model:
+
+```python
+# Get embeddings for a list of texts
+texts = [
+    "Some code or text to embed",
+    "Another piece of text"
+]
+embeddings = index.get_embeddings(texts)  # shape: (n_texts, embedding_dim)
+```
+
+## How It Works

+The `VectorIndex` class:
+1. Processes each file in your codebase
+2. Splits large files into chunks that fit within token limits
+3. Uses OpenAI's text-embedding-3-small model to create embeddings
+4. Stores embeddings in a numpy array for efficient similarity search
+5. Saves the index to disk for reuse
+
+When searching:
+1. Your query is converted to an embedding using the same model
+2. Cosine similarity is computed between the query embedding and all file embeddings
+3. The most similar files are returned, along with their similarity scores
+
+Creating embeddings requires an OpenAI API key (read from the `OPENAI_API_KEY` environment variable) with access to the embeddings endpoint.
+
+## Example Searches
+
+Here are some example semantic searches that demonstrate the power of the system:
+
+```python
+# Find authentication-related code
+results = index.similarity_search(
+    "How is user authentication implemented?",
+    k=3
+)
+
+# Find error handling patterns
+results = index.similarity_search(
+    "Show me examples of error handling and custom exceptions",
+    k=3
+)
+
+# Find configuration management
+results = index.similarity_search(
+    "Where is the application configuration and settings handled?",
+    k=3
+)
+```
+
+Semantic search understands concepts, so it returns relevant results even when the query's exact terms never appear in the code.
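+Results are plain `(filepath, score)` tuples, so they compose with the rest of the codebase API. As a rough sketch (the `0.2` threshold here is an arbitrary choice for illustration, not a library recommendation):
+
+```python
+# Over-fetch, then keep only the more confident matches
+results = index.similarity_search("Where are websocket routes registered?", k=10)
+confident = [(path, score) for path, score in results if score > 0.2]
+
+for filepath, score in confident:
+    print(f"{score:.3f}  {filepath}")
+```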
diff --git a/docs/mint.json b/docs/mint.json
index 147072fd0..7947f5aee 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -134,6 +134,7 @@
         "building-with-codegen/codebase-visualization",
         "building-with-codegen/flagging-symbols",
         "building-with-codegen/calling-out-to-llms",
+        "building-with-codegen/semantic-code-search",
         "building-with-codegen/reducing-conditions"
       ]
     },
diff --git a/pyproject.toml b/pyproject.toml
index ab61da8ba..9ff56fef8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,7 @@ dependencies = [
     "langchain[openai]",
     "langchain_core",
     "langchain_openai",
+    "numpy>=2.2.2",
 ]
 license = { text = "Apache-2.0" }
diff --git a/src/codegen/extensions/__init__.py b/src/codegen/extensions/__init__.py
index e69de29bb..3958271ac 100644
--- a/src/codegen/extensions/__init__.py
+++ b/src/codegen/extensions/__init__.py
@@ -0,0 +1,5 @@
+"""Extensions for the codegen package."""
+
+from codegen.extensions.vector_index import VectorIndex
+
+__all__ = ["VectorIndex"]
diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
index 6378e9eb5..f51872dce 100644
--- a/src/codegen/extensions/langchain/tools.py
+++ b/src/codegen/extensions/langchain/tools.py
@@ -19,6 +19,7 @@
     reveal_symbol,
     search,
     semantic_edit,
+    semantic_search,
     view_file,
 )
@@ -317,3 +318,27 @@ def _run(
             include_dependencies=include_dependencies,
         )
         return json.dumps(result, indent=2)
+
+
+class SemanticSearchInput(BaseModel):
+    """Input schema for semantic code search."""
+
+    query: str = Field(..., description="The natural language search query")
+    k: int = Field(default=5, description="Number of results to return")
+    preview_length: int = Field(default=200, description="Length of content preview in characters")
+
+
+class SemanticSearchTool(BaseTool):
+    """Tool for semantic code search."""
+
+    name: ClassVar[str] = "semantic_search"
+    description: ClassVar[str] = "Search the codebase using natural language queries and semantic similarity"
+    # A declarative input model is required here: constructing the schema with
+    # type() and tuple values does not register pydantic fields, since fields
+    # are declared via annotations.
+    args_schema: ClassVar[type[BaseModel]] = SemanticSearchInput
+    codebase: Codebase = Field(exclude=True)
+
+    def __init__(self, codebase: Codebase) -> None:
+        super().__init__(codebase=codebase)
+
+    def _run(self, query: str, k: int = 5, preview_length: int = 200) -> str:
+        result = semantic_search(self.codebase, query, k=k, preview_length=preview_length)
+        return json.dumps(result, indent=2)
diff --git a/src/codegen/extensions/tools/__init__.py b/src/codegen/extensions/tools/__init__.py
index be93d6aa4..9ce7b4f90 100644
--- a/src/codegen/extensions/tools/__init__.py
+++ b/src/codegen/extensions/tools/__init__.py
@@ -13,6 +13,7 @@
 from .reveal_symbol import reveal_symbol
 from .search import search
 from .semantic_edit import semantic_edit
+from .semantic_search import semantic_search
 
 __all__ = [
     "commit",
@@ -29,5 +30,6 @@
     "search",
     # Semantic edit
     "semantic_edit",
+    "semantic_search",
     "view_file",
 ]
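The tool plugs into LangChain like any other `BaseTool`. A minimal usage sketch, assuming a codebase is already available (the repo name is illustrative; the invocation style is standard LangChain, not anything specific to this PR):

```python
from codegen import Codebase
from codegen.extensions.langchain.tools import SemanticSearchTool

codebase = Codebase.from_repo("fastapi/fastapi")
tool = SemanticSearchTool(codebase)

# BaseTool.run accepts a dict matching the args schema
output = tool.run({"query": "How are background tasks scheduled?", "k": 3})
print(output)  # JSON string with status, query, and results
```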
diff --git a/src/codegen/extensions/tools/semantic_search.py b/src/codegen/extensions/tools/semantic_search.py
new file mode 100644
index 000000000..cd1ac04fd
--- /dev/null
+++ b/src/codegen/extensions/tools/semantic_search.py
@@ -0,0 +1,88 @@
+"""Semantic search over codebase files."""
+
+from typing import Any, Optional
+
+from codegen import Codebase
+from codegen.extensions.vector_index import VectorIndex
+
+
+def semantic_search(
+    codebase: Codebase,
+    query: str,
+    k: int = 5,
+    preview_length: int = 200,
+    index_path: Optional[str] = None,
+) -> dict[str, Any]:
+    """Search the codebase using semantic similarity.
+
+    This function provides semantic search over a codebase using OpenAI's embeddings.
+    Currently, it loads/saves the index from disk on each call, but could be optimized
+    to keep embeddings in memory for frequently accessed codebases.
+
+    TODO(CG-XXXX): Add support for maintaining embeddings in memory across searches,
+    potentially with an LRU cache or similar mechanism to avoid recomputing embeddings
+    for frequently searched codebases.
+
+    Args:
+        codebase: The codebase to search
+        query: The search query in natural language
+        k: Number of results to return (default: 5)
+        preview_length: Length of content preview in characters (default: 200)
+        index_path: Optional path to a saved vector index
+
+    Returns:
+        Dict containing search results or error information. Format:
+        {
+            "status": "success",
+            "query": str,
+            "results": [
+                {
+                    "filepath": str,
+                    "score": float,
+                    "preview": str
+                },
+                ...
+            ]
+        }
+        Or on error:
+        {
+            "error": str
+        }
+    """
+    try:
+        # Initialize vector index
+        index = VectorIndex(codebase)
+
+        # Try to load an existing index
+        try:
+            if index_path:
+                index.load(index_path)
+            else:
+                index.load()
+        except FileNotFoundError:
+            # Create a new index if none exists
+            index.create()
+            index.save(index_path)
+
+        # Perform the search
+        results = index.similarity_search(query, k=k)
+
+        # Format results with previews
+        formatted_results = []
+        for filepath, score in results:
+            try:
+                # Chunked files are indexed under "path#chunk<i>" keys; strip the
+                # suffix so the preview is read from the actual file.
+                base_path = filepath.split("#chunk")[0]
+                file = codebase.get_file(base_path)
+                preview = file.content[:preview_length].replace("\n", " ").strip()
+                if len(file.content) > preview_length:
+                    preview += "..."
+
+                formatted_results.append({"filepath": filepath, "score": float(score), "preview": preview})
+            except Exception as e:
+                # Skip files that can't be read
+                print(f"Warning: Could not read file {filepath}: {e}")
+                continue
+
+        return {"status": "success", "query": query, "results": formatted_results}
+
+    except Exception as e:
+        return {"error": f"Failed to perform semantic search: {e!s}"}
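Since `semantic_search` returns a plain dict rather than raising, callers branch on the `error` key. A small usage sketch, grounded in the return format documented above (the repo name and query are illustrative):

```python
from codegen import Codebase
from codegen.extensions.tools import semantic_search

codebase = Codebase.from_repo("fastapi/fastapi")
result = semantic_search(codebase, "Where is request validation implemented?", k=3)

if "error" in result:
    print(f"Search failed: {result['error']}")
else:
    for hit in result["results"]:
        print(f"{hit['score']:.3f}  {hit['filepath']}")
        print(f"    {hit['preview']}")
```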
diff --git a/src/codegen/extensions/vector_index.py b/src/codegen/extensions/vector_index.py
new file mode 100644
index 000000000..7459c0042
--- /dev/null
+++ b/src/codegen/extensions/vector_index.py
@@ -0,0 +1,226 @@
+"""Vector index for semantic search over codebase files."""
+
+import pickle
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import tiktoken
+from openai import OpenAI
+from tqdm import tqdm
+
+from codegen import Codebase
+
+
+class VectorIndex:
+    """A vector index for semantic search over codebase files.
+
+    This class manages embeddings for all files in a codebase, allowing for semantic
+    search and similarity comparisons. It uses OpenAI's text-embedding model to
+    generate embeddings and stores them efficiently on disk.
+
+    Attributes:
+        codebase (Codebase): The codebase to index
+        E (Optional[np.ndarray]): The embeddings matrix, shape (n_chunks, embedding_dim)
+        file_paths (Optional[np.ndarray]): Array of file paths (or "path#chunk<i>" keys) corresponding to rows of E
+    """
+
+    DEFAULT_SAVE_DIR = ".codegen"
+    DEFAULT_SAVE_FILE = "vector_index.pkl"
+    EMBEDDING_MODEL = "text-embedding-3-small"
+    MAX_TOKENS = 8000
+    BATCH_SIZE = 100
+
+    def __init__(self, codebase: Codebase):
+        """Initialize the vector index.
+
+        Args:
+            codebase: The codebase to create embeddings for
+        """
+        self.codebase = codebase
+        self.E: Optional[np.ndarray] = None
+        self.file_paths: Optional[np.ndarray] = None
+
+        # Initialize OpenAI client and tokenizer
+        self.client = OpenAI()
+        self.encoding = tiktoken.get_encoding("cl100k_base")
+
+    def _get_default_save_path(self) -> Path:
+        """Get the default save path for the vector index."""
+        save_dir = Path(self.codebase.repo_path) / self.DEFAULT_SAVE_DIR
+        save_dir.mkdir(exist_ok=True)
+        return save_dir / self.DEFAULT_SAVE_FILE
+
+    def _get_embeddings(self, texts: list[str]) -> list[list[float]]:
+        """Get embeddings for a batch of texts using OpenAI's API."""
+        # Replace newlines with spaces before embedding
+        texts = [text.replace("\n", " ") for text in texts]
+
+        response = self.client.embeddings.create(model=self.EMBEDDING_MODEL, input=texts, encoding_format="float")
+        return [data.embedding for data in response.data]
+
+    def _split_by_tokens(self, text: str) -> list[str]:
+        """Split text into chunks of at most MAX_TOKENS tokens each."""
+        tokens = self.encoding.encode(text)
+        return [self.encoding.decode(tokens[i : i + self.MAX_TOKENS]) for i in range(0, len(tokens), self.MAX_TOKENS)]
+
+    def create(self) -> None:
+        """Create embeddings for all files in the codebase.
+
+        This method processes all files in the codebase, generates embeddings using
+        OpenAI's API, and stores them in memory. The embeddings can then be saved
+        to disk using save().
+        """
+        # Store file paths (or chunk keys) and their embeddings
+        file_embeddings = {}
+
+        # Collect all valid files and their chunks
+        chunks_to_process = []
+        for file in tqdm(self.codebase.files, desc="Collecting files"):
+            content = file.content
+            if not content:  # Skip empty files
+                continue
+
+            # Split content into chunks by token count
+            content_chunks = self._split_by_tokens(content)
+
+            if len(content_chunks) == 1:
+                # If only one chunk, store as is
+                chunks_to_process.append((file.filepath, content, 0))
+            else:
+                # If multiple chunks, store with chunk index
+                for i, chunk in enumerate(content_chunks):
+                    chunks_to_process.append((file.filepath, chunk, i))
+
+        # Process in batches
+        for i in tqdm(range(0, len(chunks_to_process), self.BATCH_SIZE), desc="Processing batches"):
+            batch = chunks_to_process[i : i + self.BATCH_SIZE]
+            filepaths, contents, chunk_indices = zip(*batch)
+
+            try:
+                # Get embeddings for the batch
+                embeddings = self._get_embeddings(contents)
+
+                # Store results
+                for filepath, content, chunk_idx, embedding in zip(filepaths, contents, chunk_indices, embeddings):
+                    key = filepath if chunk_idx == 0 else f"{filepath}#chunk{chunk_idx}"
+                    file_embeddings[key] = {"embedding": embedding, "content": content, "size": len(content), "chunk_index": chunk_idx}
+            except Exception as e:
+                print(f"Error processing batch {i // self.BATCH_SIZE}: {e}")
+
+        # Convert to numpy arrays
+        embeddings_list = []
+        file_paths = []
+
+        for filepath, data in file_embeddings.items():
+            embeddings_list.append(data["embedding"])
+            file_paths.append(filepath)
+
+        self.E = np.array(embeddings_list)
+        self.file_paths = np.array(file_paths)
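+    # Note: after create(), E has shape (n_chunks, embedding_dim) and file_paths
+    # aligns with its rows. A file split into multiple chunks contributes one row
+    # per chunk under the key "path#chunk<i>", so a single source file can appear
+    # more than once in search results.
+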
+    def save(self, save_path: Optional[str] = None) -> None:
+        """Save the vector index to disk.
+
+        Args:
+            save_path: Optional path to save the index to. If not provided,
+                saves to .codegen/vector_index.pkl in the repo root.
+        """
+        if self.E is None or self.file_paths is None:
+            msg = "No embeddings to save. Call create() first."
+            raise ValueError(msg)
+
+        save_path = Path(save_path) if save_path else self._get_default_save_path()
+
+        # Ensure parent directory exists
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(save_path, "wb") as f:
+            pickle.dump({"E": self.E, "file_paths": self.file_paths}, f)
+
+    def load(self, load_path: Optional[str] = None) -> None:
+        """Load a previously saved vector index from disk.
+
+        Args:
+            load_path: Optional path to load the index from. If not provided,
+                loads from .codegen/vector_index.pkl in the repo root.
+        """
+        load_path = Path(load_path) if load_path else self._get_default_save_path()
+
+        if not load_path.exists():
+            msg = f"No vector index found at {load_path}"
+            raise FileNotFoundError(msg)
+
+        with open(load_path, "rb") as f:
+            data = pickle.load(f)
+            # Handle both old and new format
+            self.E = data.get("E", data.get("embeddings"))
+            self.file_paths = data["file_paths"]
+
+    def get_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Get embeddings for a list of texts using the same model as the index.
+
+        Args:
+            texts: List of text strings to get embeddings for
+
+        Returns:
+            np.ndarray: Array of embeddings with shape (len(texts), embedding_dim)
+        """
+        # Clean and get embeddings
+        embeddings = self._get_embeddings(texts)
+        return np.array(embeddings)
+
+    def similarity_search(self, query: str, k: int = 5) -> list[tuple[str, float]]:
+        """Find the k most similar files to a query text.
+
+        Uses cosine similarity between the query embedding and all file embeddings
+        to find the most similar files.
+
+        Args:
+            query: The text to search for
+            k: Number of results to return (default: 5)
+
+        Returns:
+            List of tuples (filepath, similarity_score) sorted by similarity (highest first)
+
+        Raises:
+            ValueError: If the index hasn't been created or loaded yet (E is None)
+        """
+        if self.E is None or self.file_paths is None:
+            msg = "No embeddings available. Call create() or load() first."
+            raise ValueError(msg)
+
+        # Get query embedding
+        query_embedding = self.get_embeddings([query])[0]
+
+        # Compute cosine similarity: normalize the query and all rows of E,
+        # then take dot products
+        query_norm = query_embedding / np.linalg.norm(query_embedding)
+        E_norm = self.E / np.linalg.norm(self.E, axis=1)[:, np.newaxis]
+        similarities = np.dot(E_norm, query_norm)
+
+        # Get indices of the top k scores, highest first
+        top_indices = np.argsort(similarities)[-k:][::-1]
+
+        # Return filepath and similarity score pairs
+        results = []
+        for idx in top_indices:
+            results.append((self.file_paths[idx], float(similarities[idx])))
+
+        return results
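For intuition about the ranking step in `similarity_search`, here is a self-contained sketch of the same normalize-and-dot-product math on toy vectors (nothing here touches the OpenAI API; the numbers are made up for illustration):

```python
import numpy as np

# Toy "embeddings": three documents in a 3-dimensional space
E = np.array([
    [1.0, 0.0, 0.0],
    [0.7, 0.7, 0.0],
    [0.0, 0.0, 1.0],
])
query = np.array([1.0, 0.1, 0.0])

# Same cosine similarity computation used by similarity_search
q = query / np.linalg.norm(query)
E_norm = E / np.linalg.norm(E, axis=1)[:, np.newaxis]
sims = E_norm @ q

top = np.argsort(sims)[-2:][::-1]  # top-2 indices, highest first
print(top, sims[top])  # doc 0 ranks first, then doc 1
```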