|
2 | 2 | import os |
3 | 3 | import sys |
4 | 4 |
|
| 5 | +from chromadb import GetResult |
5 | 6 | from chromadb.api.models.AsyncCollection import AsyncCollection |
6 | 7 | from chromadb.api.types import IncludeEnum |
7 | 8 | from chromadb.errors import InvalidCollectionException, InvalidDimensionException |
@@ -33,18 +34,20 @@ async def get_query_result_files( |
33 | 34 | print("Empty collection!", file=sys.stderr) |
34 | 35 | return [] |
35 | 36 | try: |
| 37 | + if len(configs.query_exclude): |
| 38 | + filtered_files: dict[str, dict] = {"path": {"$nin": configs.query_exclude}} |
| 39 | + else: |
| 40 | + filtered_files = {} |
36 | 41 | num_query = configs.n_result |
37 | | - if QueryInclude.chunk not in configs.include: |
| 42 | + if QueryInclude.chunk in configs.include: |
| 43 | + filtered_files["start"] = {"$gte": 0} |
| 44 | + else: |
38 | 45 | num_query = await collection.count() |
39 | 46 | if configs.query_multiplier > 0: |
40 | 47 | num_query = min( |
41 | 48 | int(configs.n_result * configs.query_multiplier), |
42 | 49 | await collection.count(), |
43 | 50 | ) |
44 | | - if len(configs.query_exclude): |
45 | | - filtered_files = {"path": {"$nin": configs.query_exclude}} |
46 | | - else: |
47 | | - filtered_files = None |
48 | 51 | results = await collection.query( |
49 | 52 | query_texts=query_chunks, |
50 | 53 | n_results=num_query, |
@@ -72,6 +75,64 @@ async def get_query_result_files( |
72 | 75 | return aggregated_results |
73 | 76 |
|
74 | 77 |
|
async def build_query_results(
    collection: AsyncCollection, configs: Config
) -> list[dict[str, str | int]]:
    """Convert the identifiers produced by the query into output records.

    Each identifier yielded by ``get_query_result_files`` is handled as:

    * an existing file path -> a record holding the keys listed in
      ``configs.include`` (``path`` and, when requested, ``document``);
    * otherwise, when ``QueryInclude.chunk`` is requested, an id that is
      looked up with ``collection.get`` -> a record with ``chunk`` and, if
      the stored metadata carries ``start``/``end``/``path``, the keys
      ``start_line``, ``end_line`` and ``path``;
    * otherwise a warning is printed to stderr and the identifier is skipped.

    Args:
        collection: Collection that is queried and, in chunk mode, read back.
        configs: Query configuration; this function reads ``include``,
            ``use_absolute_path`` and ``project_root``.

    Returns:
        One dict per identifier that could be resolved, in query order.
    """
    # Both membership tests are loop-invariant, so evaluate them once.
    chunk_mode = QueryInclude.chunk in configs.include
    want_document = QueryInclude.document in configs.include

    structured_result: list[dict[str, str | int]] = []
    for identifier in await get_query_result_files(collection, configs):
        if os.path.isfile(identifier):
            if configs.use_absolute_path:
                output_path = os.path.abspath(identifier)
            else:
                output_path = os.path.relpath(identifier, configs.project_root)
            file_result: dict[str, str | int] = {"path": output_path}
            # Only read the file when its content is part of the output;
            # this avoids needless I/O (and decode errors) for path-only queries.
            if want_document:
                with open(identifier) as fin:
                    file_result["document"] = fin.read()

            structured_result.append(
                {str(key): file_result[str(key)] for key in configs.include}
            )
            continue

        if not chunk_mode:
            print(
                f"{identifier} is no longer a valid file! Please re-run vectorcode vectorise to refresh the database.",
                file=sys.stderr,
            )
            continue

        chunk: GetResult = await collection.get(
            identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents]
        )
        meta = chunk.get("metadatas")
        if not meta:
            print(
                "This collection doesn't support chunk-mode output because it lacks the necessary metadata. Please re-vectorise it.",
                file=sys.stderr,
            )
            continue

        # "documents" may be absent, None or empty; fall back to an empty chunk.
        documents = chunk.get("documents") or [""]
        chunk_result: dict[str, str | int] = {"chunk": str(documents[0])}

        chunk_meta = meta[0]
        start = chunk_meta.get("start")
        end = chunk_meta.get("end")
        raw_path = chunk_meta.get("path")
        if start is not None and end is not None and raw_path is not None:
            path = str(raw_path)
            if os.path.isfile(path):
                # Prefer the current on-disk lines over the stored text.
                with open(path) as fin:
                    chunk_result["chunk"] = "".join(fin.readlines()[start : end + 1])
            else:
                # The source file vanished: keep the stored chunk text instead
                # of aborting the whole query with FileNotFoundError.
                print(
                    f"{path} is no longer a valid file! Please re-run vectorcode vectorise to refresh the database.",
                    file=sys.stderr,
                )
            chunk_result["start_line"] = start
            chunk_result["end_line"] = end
            # Normalise with abspath so both branches report absolute paths
            # the same way.
            chunk_result["path"] = (
                os.path.abspath(path)
                if configs.use_absolute_path
                else os.path.relpath(path, str(configs.project_root))
            )

        structured_result.append(chunk_result)
    return structured_result
| 134 | + |
| 135 | + |
75 | 136 | async def query(configs: Config) -> int: |
76 | 137 | if ( |
77 | 138 | QueryInclude.chunk in configs.include |
@@ -108,28 +169,7 @@ async def query(configs: Config) -> int: |
108 | 169 | if not configs.pipe: |
109 | 170 | print("Starting querying...") |
110 | 171 |
|
111 | | - structured_result = [] |
112 | | - |
113 | | - for path in await get_query_result_files(collection, configs): |
114 | | - if os.path.isfile(path): |
115 | | - if configs.use_absolute_path: |
116 | | - output_path = os.path.abspath(path) |
117 | | - else: |
118 | | - output_path = os.path.relpath(path, configs.project_root) |
119 | | - full_result = {"path": output_path} |
120 | | - if QueryInclude.document in configs.include: |
121 | | - with open(path) as fin: |
122 | | - document = fin.read() |
123 | | - full_result["document"] = document |
124 | | - |
125 | | - structured_result.append( |
126 | | - {str(key): full_result[str(key)] for key in configs.include} |
127 | | - ) |
128 | | - else: |
129 | | - print( |
130 | | - f"{path} is no longer a valid file! Please re-run vectorcode vectorise to refresh the database.", |
131 | | - file=sys.stderr, |
132 | | - ) |
| 172 | + structured_result = await build_query_results(collection, configs) |
133 | 173 |
|
134 | 174 | if configs.pipe: |
135 | 175 | print(json.dumps(structured_result)) |
|
0 commit comments