Skip to content

Commit 708f0eb

Browse files
authored
feat(cli): Support loading exclude specs from nested repo/submodules (#244)
* feat(cli): Support recursive .gitignore resolution * feat(cli): support recursive gitignores * tests(cli): tests for the new spec resolver * feat(cli): make spec resolver "scoped" * refactor(cli): rewrote the `match` method to avoid making new lists. * Auto generate docs * tests(cli): Add recursive `.gitignore` resolution tests * chore(cli): doc string update and mark unimportant branch as nocover --------- Co-authored-by: Davidyz <[email protected]>
1 parent bb52bcd commit 708f0eb

File tree

6 files changed

+337
-168
lines changed

6 files changed

+337
-168
lines changed

doc/VectorCode-cli.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,10 @@ certain conditions. See the wiki
459459
<https://github.com/Davidyz/VectorCode/wiki/Tips-and-Tricks#git-hooks> for an
460460
example to use it with git hooks.
461461

462+
If you’re working with nested repos, you can pass `--recursive`/`-r` so that
463+
the `vectorise` command will honour the `.gitignore`s and `vectorcode.exclude`s
464+
in the nested repos.
465+
462466

463467
MAKING A QUERY ~
464468

docs/cli.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,10 @@ on certain conditions. See
426426
[the wiki](https://github.com/Davidyz/VectorCode/wiki/Tips-and-Tricks#git-hooks)
427427
for an example to use it with git hooks.
428428

429+
If you're working with nested repos, you can pass `--recursive`/`-r` so that
430+
the `vectorise` command will honour the `.gitignore`s and `vectorcode.exclude`s
431+
in the nested repos.
432+
429433
### Making a Query
430434
431435
To retrieve a list of documents from the database, you can use the following command:

src/vectorcode/cli_utils.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
from datetime import datetime
99
from enum import Enum, StrEnum
1010
from pathlib import Path
11-
from typing import Any, Optional, Sequence, Union
11+
from typing import Any, Generator, Iterable, Optional, Sequence, Union
1212

1313
import json5
1414
import shtab
1515
from filelock import AsyncFileLock
16+
from pathspec import GitIgnoreSpec
1617

1718
from vectorcode import __version__
1819

@@ -671,3 +672,58 @@ def get_lock(self, path: str | os.PathLike) -> AsyncFileLock:
671672
if self.__locks.get(path) is None:
672673
self.__locks[path] = AsyncFileLock(path) # pyright: ignore[reportArgumentType]
673674
return self.__locks[path]
675+
676+
677+
class SpecResolver:
    """
    Wrapper around a `pathspec.GitIgnoreSpec` that matches paths relative to
    the directory the spec applies to (`base_dir`) instead of the cwd.

    Paths that are not located under `base_dir` are considered out of scope
    for the spec and are always passed through by `match`.
    """

    @classmethod
    def from_path(
        cls, spec_path: str, project_root: Optional[str] = None
    ) -> "SpecResolver":
        """
        Build a resolver from a spec file, inferring the appropriate `base_dir`.

        Only supports `.gitignore` and `.vectorcode/vectorcode.{include,exclude}`.

        - `.gitignore`: `base_dir` is the directory containing the file.
        - `<dir>/.vectorcode/vectorcode.{include,exclude}` (project config):
          `base_dir` is `<dir>`.
        - any other `vectorcode.{include,exclude}` (assumed to be the global
          config): `base_dir` is `project_root`, falling back to `"."`.

        Raises `ValueError` if the spec path is not one of them.
        """
        path_obj = Path(spec_path)
        if spec_path.endswith(".gitignore"):
            # Use the parent directory rather than deleting the substring
            # ".gitignore" from the path: the substring may also occur in a
            # directory name, in which case a textual replace would mangle it.
            base_dir = str(path_obj.parent)
        elif path_obj.name in {"vectorcode.include", "vectorcode.exclude"}:
            if path_obj.parent.name == ".vectorcode":
                # project config
                base_dir = str(path_obj.parent.parent)
            else:
                # assume to be global config
                base_dir = project_root or "."
        else:  # pragma: nocover
            raise ValueError(f"Unsupported spec path: {spec_path}")
        return cls(spec_path, base_dir)

    def __init__(self, spec: str | GitIgnoreSpec, base_dir: str = "."):
        """
        `spec` is either the path to a spec file (parsed as gitignore-style
        patterns) or an already constructed `GitIgnoreSpec`.
        `base_dir` is the directory that the patterns are relative to.
        """
        if isinstance(spec, str):
            with open(spec) as fin:
                # Only drop the line terminators. Stripping all surrounding
                # whitespace would turn an escaped trailing space (`foo\ `)
                # into a dangling backslash and corrupt the pattern.
                self.spec = GitIgnoreSpec.from_lines(fin.read().splitlines())
        else:
            self.spec = spec
        self.base_dir = base_dir

    def match(
        self, paths: Iterable[str], negated: bool = False
    ) -> Generator[str, None, None]:
        """
        Lazily filter `paths` against the spec.

        For paths under `base_dir`, yield those matched by the spec, or those
        NOT matched when `negated` is `True`. Paths outside of `base_dir` are
        yielded unchanged regardless of `negated`.
        """
        base = Path(self.base_dir).resolve()
        for p in paths:
            if base not in Path(p).resolve().parents:
                # out of scope for this spec
                yield p
                continue
            # the spec is evaluated on the path relative to `base_dir`
            matched = self.spec.match_file(os.path.relpath(p, self.base_dir))
            if matched != negated:
                yield p

src/vectorcode/subcommands/vectorise.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import asyncio
2+
import glob
23
import hashlib
34
import json
45
import logging
@@ -20,6 +21,7 @@
2021
GLOBAL_EXCLUDE_SPEC,
2122
GLOBAL_INCLUDE_SPEC,
2223
Config,
24+
SpecResolver,
2325
expand_globs,
2426
expand_path,
2527
)
@@ -187,22 +189,13 @@ def show_stats(configs: Config, stats: VectoriseStats):
187189

188190

189191
def exclude_paths_by_spec(
    paths: Iterable[str], spec_path: str, project_root: Optional[str] = None
) -> list[str]:
    """
    Return `paths` without the files matched by the spec file at `spec_path`.

    The spec is loaded through `SpecResolver.from_path`, which receives
    `project_root` to work out the directory the patterns are relative to.
    """
    resolver = SpecResolver.from_path(spec_path, project_root)
    # negated match: keep the paths that the spec does NOT match.
    kept = resolver.match(paths, True)
    return list(kept)
206199

207200

208201
def load_files_from_include(project_root: str) -> list[str]:
@@ -235,17 +228,25 @@ def find_exclude_specs(configs: Config) -> list[str]:
235228
Load a list of paths to exclude specs.
236229
Can be `.gitignore` or local/global `vectorcode.exclude`
237230
"""
238-
gitignore_path = os.path.join(str(configs.project_root), ".gitignore")
239-
specs = [
240-
gitignore_path,
241-
]
231+
if configs.recursive:
232+
specs = glob.glob(
233+
os.path.join(str(configs.project_root), "**", ".gitignore"), recursive=True
234+
) + glob.glob(
235+
os.path.join(str(configs.project_root), "**", "vectorcode.exclude"),
236+
recursive=True,
237+
)
238+
else:
239+
specs = [os.path.join(str(configs.project_root), ".gitignore")]
240+
242241
exclude_spec_path = os.path.join(
243242
str(configs.project_root), ".vectorcode", "vectorcode.exclude"
244243
)
245244
if os.path.isfile(exclude_spec_path):
246245
specs.append(exclude_spec_path)
247246
elif os.path.isfile(GLOBAL_EXCLUDE_SPEC):
248247
specs.append(GLOBAL_EXCLUDE_SPEC)
248+
specs = [i for i in specs if os.path.isfile(i)]
249+
logger.debug(f"Loaded exclude specs: {specs}")
249250
return specs
250251

251252

@@ -272,7 +273,10 @@ async def vectorise(configs: Config) -> int:
272273
for spec_path in find_exclude_specs(configs):
273274
if os.path.isfile(spec_path):
274275
logger.info(f"Loading ignore specs from {spec_path}.")
275-
files = exclude_paths_by_spec((str(i) for i in files), spec_path)
276+
files = exclude_paths_by_spec(
277+
(str(i) for i in files), spec_path, str(configs.project_root)
278+
)
279+
logger.debug(f"Files after excluding: {files}")
276280
else: # pragma: nocover
277281
logger.info("Ignoring exclude specs.")
278282

0 commit comments

Comments
 (0)