feat(cli): Allow truncating embeddings to specified dimensions. (#265)

Davidyz · web-flow · commit f997701ffcfb · 2025-08-03T08:34:33.000+01:00
* feat(cli): Truncate embeddings to specified dimensions

* fix(cli): Truncate query embeddings to specified dimensions

* tests(cli): Add tests for truncated embeddings

* docs(cli): Document embedding_dims configuration option

* Auto generate docs

---------

Co-authored-by: Davidyz <Davidyz@users.noreply.github.com>
diff --git a/doc/VectorCode-cli.txt b/doc/VectorCode-cli.txt
@@ -322,17 +322,23 @@ embedding function takes. For `OllamaEmbeddingFunction`, if you set
 "model_name": "nomic-embed-text" }` Then the embedding function object will be
 initialised as
 `OllamaEmbeddingFunction(url="http://127.0.0.1:11434/api/embeddings",
-model_name="nomic-embed-text")`. Default: `{}`; - `db_url`string, the url that
-points to the Chromadb server. VectorCode will start an HTTP server for
-Chromadb at a randomly picked free port on `localhost` if your configured
-`http://host:port` is not accessible. Default: `http://127.0.0.1:8000`; -
-`db_path`string, Path to local persistent database. If you didn’t set up a
-standalone Chromadb server, this is where the files for your database will be
-stored. Default: `~/.local/share/vectorcode/chromadb/`; - `db_log_path`string,
-path to the _directory_ where the built-in chromadb server will write the log
-to. Default: `~/.local/share/vectorcode/`; - `chunk_size`integer, the maximum
-number of characters per chunk. A larger value reduces the number of items in
-the database, and hence accelerates the search, but at the cost of potentially
+model_name="nomic-embed-text")`. Default: `{}`; - `embedding_dims`integer or
+`null`, the number of dimensions to truncate the embeddings to. _Make sure your
+model supports Matryoshka Representation Learning (MRL) before using this._
+Learn more about MRL here
+<https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings>.
+When set to `null` (or unset), the embeddings won’t be truncated; -
+`db_url`string, the url that points to the Chromadb server. VectorCode will
+start an HTTP server for Chromadb at a randomly picked free port on `localhost`
+if your configured `http://host:port` is not accessible. Default:
+`http://127.0.0.1:8000`; - `db_path`string, Path to local persistent database.
+If you didn’t set up a standalone Chromadb server, this is where the files
+for your database will be stored. Default:
+`~/.local/share/vectorcode/chromadb/`; - `db_log_path`string, path to the
+_directory_ where the built-in chromadb server will write the log to. Default:
+`~/.local/share/vectorcode/`; - `chunk_size`integer, the maximum number of
+characters per chunk. A larger value reduces the number of items in the
+database, and hence accelerates the search, but at the cost of potentially
 truncated data and lost information. Default: `2500`. To disable chunking, set
 it to a negative number; - `overlap_ratio`float between 0 and 1, the ratio of
 overlapping/shared content between 2 adjacent chunks. A larger ratio improves
diff --git a/docs/cli.md b/docs/cli.md
@@ -275,6 +275,10 @@ The JSON configuration file may hold the following values:
   Then the embedding function object will be initialised as
   `OllamaEmbeddingFunction(url="http://127.0.0.1:11434/api/embeddings",
   model_name="nomic-embed-text")`. Default: `{}`;
+- `embedding_dims`: integer or `null`, the number of dimensions to truncate the embeddings
+  to. _Make sure your model supports Matryoshka Representation Learning (MRL) 
+  before using this._ Learn more about MRL [here](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings).
+  When set to `null` (or unset), the embeddings won't be truncated;
 - `db_url`: string, the url that points to the Chromadb server. VectorCode will start an
   HTTP server for Chromadb at a randomly picked free port on `localhost` if your 
   configured `http://host:port` is not accessible. Default: `http://127.0.0.1:8000`;
diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py
@@ -89,6 +89,7 @@ class Config:
     db_url: str = "http://127.0.0.1:8000"
     embedding_function: str = "SentenceTransformerEmbeddingFunction"  # This should fallback to whatever the default is.
     embedding_params: dict[str, Any] = field(default_factory=(lambda: {}))
+    embedding_dims: Optional[int] = None
     n_result: int = 1
     force: bool = False
     db_path: Optional[str] = "~/.local/share/vectorcode/chromadb/"
@@ -139,6 +140,9 @@ async def import_from(cls, config_dict: dict[str, Any]) -> "Config":
                 "embedding_params": config_dict.get(
                     "embedding_params", default_config.embedding_params
                 ),
+                "embedding_dims": config_dict.get(
+                    "embedding_dims", default_config.embedding_dims
+                ),
                 "db_url": config_dict.get("db_url", default_config.db_url),
                 "db_path": db_path,
                 "db_log_path": os.path.expanduser(
diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py
@@ -67,8 +67,11 @@ async def get_query_result_files(
                     await collection.count(),
                 )
                 logger.info(f"Querying {num_query} chunks for reranking.")
+        query_embeddings = get_embedding_function(configs)(query_chunks)
+        if isinstance(configs.embedding_dims, int) and configs.embedding_dims > 0:
+            query_embeddings = [e[: configs.embedding_dims] for e in query_embeddings]
         results = await collection.query(
-            query_embeddings=get_embedding_function(configs)(query_chunks),
+            query_embeddings=query_embeddings,
             n_results=num_query,
             include=[
                 IncludeEnum.metadatas,
diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py
@@ -146,12 +146,21 @@ async def chunked_add(
             async with collection_lock:
                 for idx in range(0, len(chunks), max_batch_size):
                     inserted_chunks = chunks[idx : idx + max_batch_size]
+                    embeddings = embedding_function(
+                        list(str(c) for c in inserted_chunks)
+                    )
+                    if (
+                        isinstance(configs.embedding_dims, int)
+                        and configs.embedding_dims > 0
+                    ):
+                        logger.debug(
+                            f"Truncating embeddings to {configs.embedding_dims} dimensions."
+                        )
+                        embeddings = [e[: configs.embedding_dims] for e in embeddings]
                     await collection.add(
                         ids=[get_uuid() for _ in inserted_chunks],
                         documents=[str(i) for i in inserted_chunks],
-                        embeddings=embedding_function(
-                            list(str(c) for c in inserted_chunks)
-                        ),
+                        embeddings=embeddings,
                         metadatas=metas,
                     )
     except (UnicodeDecodeError, UnicodeError):  # pragma: nocover
diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py
@@ -327,14 +327,11 @@ async def test_get_query_result_files_chunking(mock_collection, mock_config):
 async def test_get_query_result_files_multiple_queries(mock_collection, mock_config):
     # Set multiple query terms
     mock_config.query = ["term1", "term2", "term3"]
-    mock_embedding_function = MagicMock()
+    mock_config.embedding_dims = 10
+
     with (
         patch("vectorcode.subcommands.query.StringChunker") as MockChunker,
         patch("vectorcode.subcommands.query.reranker.NaiveReranker") as MockReranker,
-        patch(
-            "vectorcode.subcommands.query.get_embedding_function",
-            return_value=mock_embedding_function,
-        ),
     ):
         # Set up MockChunker to return the query terms as is
         mock_chunker_instance = MagicMock()
@@ -354,7 +351,7 @@ async def test_get_query_result_files_multiple_queries(mock_collection, mock_con
         # Check query was called with all query terms
         mock_collection.query.assert_called_once()
         _, kwargs = mock_collection.query.call_args
-        mock_embedding_function.assert_called_once_with(["term1", "term2", "term3"])
+        assert all(len(i) == 10 for i in kwargs["query_embeddings"])
 
         # Check the result
         assert result == ["file1.py", "file2.py"]
diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py
@@ -103,6 +103,44 @@ async def test_chunked_add():
     assert collection.add.call_count == 1
 
 
+@pytest.mark.asyncio
+async def test_chunked_add_truncated():
+    file_path = "test_file.py"
+    collection = AsyncMock()
+    collection_lock = asyncio.Lock()
+    stats = VectoriseStats()
+    stats_lock = asyncio.Lock()
+    configs = Config(
+        chunk_size=100, overlap_ratio=0.2, project_root=".", embedding_dims=10
+    )
+    max_batch_size = 50
+    semaphore = asyncio.Semaphore(1)
+
+    with (
+        patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk,
+        patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file,
+    ):
+        mock_hash_file.return_value = "hash1"
+        mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"]
+        await chunked_add(
+            file_path,
+            collection,
+            collection_lock,
+            stats,
+            stats_lock,
+            configs,
+            max_batch_size,
+            semaphore,
+        )
+
+    assert stats.add == 1
+    assert stats.update == 0
+    collection.add.assert_called()
+    assert collection.add.call_count == 1
+
+    assert all(len(i) == 10 for i in collection.add.call_args.kwargs["embeddings"])
+
+
 @pytest.mark.asyncio
 async def test_chunked_add_with_existing():
     file_path = "test_file.py"