Skip to content

Commit 5fc056b

Browse files
committed
DOP-2430: Harden the getfiles() implementation to avoid crawling above snooty.toml
1 parent da23a6c commit 5fc056b

File tree

11 files changed

+63
-12
lines changed

11 files changed

+63
-12
lines changed

snooty/parser.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1406,7 +1406,9 @@ def build(
14061406
pool = multiprocessing.Pool(max_workers)
14071407
with util.PerformanceLogger.singleton().start("parse rst"):
14081408
try:
1409-
paths = util.get_files(self.config.source_path, RST_EXTENSIONS)
1409+
paths = util.get_files(
1410+
self.config.source_path, RST_EXTENSIONS, self.config.root
1411+
)
14101412
logger.debug("Processing rst files")
14111413
results = pool.imap_unordered(partial(parse_rst, self.parser), paths)
14121414
for sequence in results:
@@ -1421,7 +1423,9 @@ def build(
14211423
# Categorize our YAML files
14221424
logger.debug("Categorizing YAML files")
14231425
categorized: Dict[str, List[Path]] = collections.defaultdict(list)
1424-
for path in util.get_files(self.config.source_path, (".yaml",)):
1426+
for path in util.get_files(
1427+
self.config.source_path, (".yaml",), self.config.root
1428+
):
14251429
prefix = get_giza_category(path)
14261430
if prefix in self.yaml_mapping:
14271431
categorized[prefix].append(path)

snooty/test_util.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,22 @@ def test_option_flag() -> None:
5050

5151
def test_get_files() -> None:
5252
# The test_data/getfiles path tests how we handle symbolic links. To wit,
53-
# we ensure that we don't fail on loops; files with the same inode only
54-
# report once; and that we enter symlinks.
53+
# we ensure that we don't fail on loops; files with the same resolved path
54+
# only report once; and that we enter symlinks.
5555

56-
assert set(util.get_files(Path("test_data/getfiles/files1"), (".toml",))) == {
56+
reference_set = {
5757
Path("test_data/getfiles/files1/1.toml"),
5858
Path("test_data/getfiles/files1/2.toml"),
59-
Path("test_data/getfiles/files1/files2/subdirectory/4.toml"),
60-
Path("test_data/getfiles/files1/files2/3.toml"),
59+
Path("test_data/getfiles/files1/loop1/loop1.toml"),
60+
Path("test_data/getfiles/files1/loop2/loop2.toml"),
6161
}
6262

63+
# Either subdirectory or dup are acceptable, but not both
64+
assert set(util.get_files(Path("test_data/getfiles/files1"), (".toml",))) in [
65+
reference_set.union({Path("test_data/getfiles/files1/subdirectory/5.toml")}),
66+
reference_set.union({Path("test_data/getfiles/files1/dup/5.toml")}),
67+
]
68+
6369

6470
def test_add_doc_target_ext() -> None:
6571
# Set up target filenames

snooty/util.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,23 +61,59 @@ def reroot_path(
6161
return rel_fn, project_root.joinpath(rel_fn).resolve()
6262

6363

64-
def get_files(root: Path, extensions: Container[str]) -> Iterator[Path]:
64+
def is_relative_to(a: Path, b: Path) -> bool:
65+
try:
66+
a.relative_to(b)
67+
return True
68+
except ValueError:
69+
return False
70+
71+
72+
def get_files(
73+
root: Path, extensions: Container[str], must_be_relative_to: Optional[Path] = None
74+
) -> Iterator[Path]:
6575
"""Recursively iterate over files underneath the given root, yielding
6676
only filenames with the given extensions. Symlinks are followed, but
67-
any given concrete directory is only scanned once."""
77+
any given concrete directory is only scanned once.
78+
79+
By default, directories above the given root in the filesystem are not
80+
scanned, but this can be overridden with the must_be_relative_to parameter."""
81+
root_resolved = root.resolve()
6882
seen: Set[Path] = set()
6983

84+
if must_be_relative_to is None:
85+
must_be_relative_to = root_resolved
86+
7087
for base, dirs, files in os.walk(root, followlinks=True):
71-
dirs_set = set(Path(d).resolve() for d in dirs)
72-
dirs[:] = [d.name for d in (dirs_set - seen)]
88+
base_resolved = Path(base).resolve()
89+
if not is_relative_to(base_resolved, must_be_relative_to):
90+
# Prevent a race between our checking if a symlink is valid, and our
91+
# actually entering it.
92+
continue
93+
94+
# Preserve both the actual resolved path and the directory name
95+
dirs_set = dict(((base_resolved.joinpath(d).resolve(), d) for d in dirs))
96+
97+
# Only recurse into directories which are within our prefix
98+
dirs[:] = [
99+
d_name
100+
for d_path, d_name in ((k, v) for k, v in dirs_set.items() if k not in seen)
101+
if is_relative_to(
102+
base_resolved.joinpath(d_path).resolve(), must_be_relative_to
103+
)
104+
]
105+
73106
seen.update(dirs_set)
74107

75108
for name in files:
76109
ext = os.path.splitext(name)[1]
77110
if ext not in extensions:
78111
continue
79112

80-
yield Path(os.path.join(base, name))
113+
path = Path(os.path.join(base, name))
114+
# Detect and ignore symlinks outside of our jail
115+
if is_relative_to(path.resolve(), must_be_relative_to):
116+
yield path
81117

82118

83119
def get_line(node: docutils.nodes.Node) -> int:

test_data/getfiles/files1/3.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../files2/3.toml

test_data/getfiles/files1/dup

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
subdirectory

test_data/getfiles/files1/escape

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../

test_data/getfiles/files1/loop1/loop1.toml

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../loop2
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../loop1

test_data/getfiles/files1/loop2/loop2.toml

Whitespace-only changes.

0 commit comments

Comments
 (0)