feat(cli): finished CLI mode (mostly).

Davidyz · Davidyz · commit d855ad76cb80 · 2025-03-30T18:55:02.000+01:00
diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py
@@ -235,10 +235,11 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
         """
         assert os.path.isfile(data)
         with open(data) as fin:
-            content = fin.read()
-        if self.config.chunk_size < 0:
-            yield content
-            return
+            lines = fin.readlines()
+            content = "".join(lines)
+            if self.config.chunk_size < 0 and content:
+                yield Chunk(content, Point(1, 0), Point(len(lines), len(lines[-1]) - 1))
+                return
         parser = None
         language = None
         lexer = self.__guess_type(data, content)
diff --git a/src/vectorcode/lsp_main.py b/src/vectorcode/lsp_main.py
@@ -24,7 +24,7 @@
 )
 from vectorcode.common import get_client, get_collection, try_server
 from vectorcode.subcommands.ls import get_collection_list
-from vectorcode.subcommands.query import get_query_result_files
+from vectorcode.subcommands.query import build_query_results
 
 cached_project_configs: dict[str, Config] = {}
 DEFAULT_PROJECT_ROOT: str | None = None
@@ -108,20 +108,9 @@ async def execute_command(ls: LanguageServer, args: list[str]):
             )
             final_results = []
             try:
-                for path in await get_query_result_files(
-                    collection=collection,
-                    configs=final_configs,
-                ):
-                    if os.path.isfile(path):
-                        with open(path) as fin:
-                            output_path = path
-                            if not final_configs.use_absolute_path:
-                                output_path = os.path.relpath(
-                                    path, final_configs.project_root
-                                )
-                            final_results.append(
-                                {"path": output_path, "document": fin.read()}
-                            )
+                final_results.extend(
+                    await build_query_results(collection, final_configs)
+                )
             finally:
                 ls.progress.end(
                     progress_token,
diff --git a/src/vectorcode/subcommands/chunks.py b/src/vectorcode/subcommands/chunks.py
@@ -9,5 +9,5 @@ async def chunks(configs: Config) -> int:
     result = []
     for file_path in configs.files:
         result.append(list(chunker.chunk(str(file_path))))
-    print(json.dumps(result))
+    print(json.dumps(str(result)))
     return 0
diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py
@@ -2,6 +2,7 @@
 import os
 import sys
 
+from chromadb import GetResult
 from chromadb.api.models.AsyncCollection import AsyncCollection
 from chromadb.api.types import IncludeEnum
 from chromadb.errors import InvalidCollectionException, InvalidDimensionException
@@ -33,18 +34,20 @@ async def get_query_result_files(
         print("Empty collection!", file=sys.stderr)
         return []
     try:
+        if len(configs.query_exclude):
+            filtered_files: dict[str, dict] = {"path": {"$nin": configs.query_exclude}}
+        else:
+            filtered_files = {}
         num_query = configs.n_result
-        if QueryInclude.chunk not in configs.include:
+        if QueryInclude.chunk in configs.include:
+            filtered_files["start"] = {"$gte": 0}
+        else:
             num_query = await collection.count()
             if configs.query_multiplier > 0:
                 num_query = min(
                     int(configs.n_result * configs.query_multiplier),
                     await collection.count(),
                 )
-        if len(configs.query_exclude):
-            filtered_files = {"path": {"$nin": configs.query_exclude}}
-        else:
-            filtered_files = None
         results = await collection.query(
             query_texts=query_chunks,
             n_results=num_query,
@@ -72,6 +75,64 @@ async def get_query_result_files(
     return aggregated_results
 
 
+async def build_query_results(
+    collection: AsyncCollection, configs: Config
+) -> list[dict[str, str | int]]:
+    """Build the structured output for the results of a query.
+
+    Identifiers that are existing files yield the keys in ``configs.include``.
+    In chunk mode, any other identifier is looked up in ``collection`` as a
+    chunk ID. Unusable identifiers are reported on stderr and skipped.
+    """
+    stale = "{} is no longer a valid file! Please re-run vectorcode vectorise to refresh the database."
+    structured_result = []
+    for identifier in await get_query_result_files(collection, configs):
+        if os.path.isfile(identifier):
+            if configs.use_absolute_path:
+                output_path = os.path.abspath(identifier)
+            else:
+                output_path = os.path.relpath(identifier, configs.project_root)
+            full_result = {"path": output_path}
+            # Only read the file when its content was actually requested.
+            if QueryInclude.document in configs.include:
+                with open(identifier) as fin:
+                    full_result["document"] = fin.read()
+            structured_result.append(
+                {str(key): full_result[str(key)] for key in configs.include}
+            )
+        elif QueryInclude.chunk in configs.include:
+            chunk: GetResult = await collection.get(
+                identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents]
+            )
+            # Treat a missing metadata list or a None entry as empty metadata.
+            meta = (chunk.get("metadatas") or [{}])[0] or {}
+            if meta.get("start") is None or meta.get("end") is None:
+                print(
+                    "This collection doesn't support chunk-mode output because it lacks the necessary metadata. Please re-vectorise it.",
+                    file=sys.stderr,
+                )
+                continue
+            path = str(meta.get("path"))
+            if not os.path.isfile(path):
+                # The stored line range is meaningless once the file is gone.
+                print(stale.format(path), file=sys.stderr)
+                continue
+            start: int = meta["start"]
+            end: int = meta["end"]
+            with open(path) as fin:
+                # NOTE(review): rows are used as 0-based inclusive line indices;
+                # confirm every chunker stores them with that convention.
+                text = "".join(fin.readlines()[start : end + 1])
+            if not configs.use_absolute_path:
+                path = os.path.relpath(path, str(configs.project_root))
+            structured_result.append(
+                {"chunk": text, "start_line": start, "end_line": end, "path": path}
+            )
+        else:
+            print(stale.format(identifier), file=sys.stderr)
+    return structured_result
+
+
 async def query(configs: Config) -> int:
     if (
         QueryInclude.chunk in configs.include
@@ -108,28 +169,7 @@ async def query(configs: Config) -> int:
     if not configs.pipe:
         print("Starting querying...")
 
-    structured_result = []
-
-    for path in await get_query_result_files(collection, configs):
-        if os.path.isfile(path):
-            if configs.use_absolute_path:
-                output_path = os.path.abspath(path)
-            else:
-                output_path = os.path.relpath(path, configs.project_root)
-            full_result = {"path": output_path}
-            if QueryInclude.document in configs.include:
-                with open(path) as fin:
-                    document = fin.read()
-                    full_result["document"] = document
-
-            structured_result.append(
-                {str(key): full_result[str(key)] for key in configs.include}
-            )
-        else:
-            print(
-                f"{path} is no longer a valid file! Please re-run vectorcode vectorise to refresh the database.",
-                file=sys.stderr,
-            )
+    structured_result = await build_query_results(collection, configs)
 
     if configs.pipe:
         print(json.dumps(structured_result))
diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py
@@ -65,8 +65,9 @@ async def chunked_add(
             for chunk in chunks:
                 meta: dict[str, str | dict[str, int]] = {"path": full_path_str}
                 if isinstance(chunk, Chunk):
-                    meta["start"] = {"row": chunk.start.row, "col": chunk.start.column}
-                    meta["end"] = {"row": chunk.end.row, "col": chunk.end.column}
+                    meta["start"] = chunk.start.row
+                    meta["end"] = chunk.end.row
+
                 metas.append(meta)
             async with collection_lock:
                 for idx in range(0, len(chunks), max_batch_size):
diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py
@@ -5,7 +5,7 @@
 from chromadb.api.types import IncludeEnum
 from chromadb.errors import InvalidCollectionException, InvalidDimensionException
 
-from vectorcode.cli_utils import Config, QueryInclude
+from vectorcode.cli_utils import CliAction, Config, QueryInclude
 from vectorcode.subcommands.query import get_query_result_files, query
 
 
@@ -71,7 +71,7 @@ async def test_get_query_result_files(mock_collection, mock_config):
         assert IncludeEnum.metadatas in kwargs["include"]
         assert IncludeEnum.distances in kwargs["include"]
         assert IncludeEnum.documents in kwargs["include"]
-        assert kwargs["where"] is None  # Since query_exclude is empty
+        assert not kwargs["where"]  # Since query_exclude is empty
 
         # Check reranker was used correctly
         MockReranker.assert_called_once_with(mock_config)
@@ -444,3 +444,11 @@ async def test_query_invalid_ef(mock_config):
 
         # Verify the function returns error code
         assert result == 1
+
+
+@pytest.mark.asyncio
+async def test_query_invalid_include():
+    # chunk and document are requested together; query() must return non-zero.
+    conflicting = [QueryInclude.chunk, QueryInclude.document]
+    faulty_config = Config(action=CliAction.query, include=conflicting)
+    assert await query(faulty_config) != 0
diff --git a/tests/test_lsp.py b/tests/test_lsp.py
@@ -4,7 +4,7 @@
 from pygls.server import LanguageServer
 
 from vectorcode import __version__
-from vectorcode.cli_utils import CliAction, Config
+from vectorcode.cli_utils import CliAction, Config, QueryInclude
 from vectorcode.lsp_main import (
     execute_command,
     lsp_start,
@@ -23,13 +23,18 @@ def mock_language_server():
 
 @pytest.fixture
 def mock_config():
-    config = MagicMock(spec=Config)
+    """Provide a real Config (not a MagicMock) preset for the LSP query tests."""
+    config = Config()
     config.host = "localhost"
     config.port = 8000
     config.action = CliAction.query
     config.project_root = "/test/project"
     config.use_absolute_path = True
     config.pipe = False
+    config.overlap_ratio = 0.2
+    config.query_exclude = []
+    config.include = [QueryInclude.path]
+    config.query_multiplier = 10
     return config