Skip to content

Commit 3ee0037

Browse files
committed
update pydra.engine.helpers_file.hash_dir to return single sha string
1 parent 6dc5616 commit 3ee0037

File tree

2 files changed

+87
-28
lines changed

2 files changed

+87
-28
lines changed

pydra/engine/helpers_file.py

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,27 +87,60 @@ def hash_file(afile, chunk_len=8192, crypto=sha256, raise_notfound=True):
8787
return crypto_obj.hexdigest()
8888

8989

90-
def hash_dir(dirpath, raise_notfound=True):
90+
def hash_dir(
91+
dirpath, ignore_hidden_files=False, ignore_hidden_dirs=False, raise_notfound=True
92+
):
93+
"""Compute hash of directory contents.
94+
95+
This function computes the hash of every file in directory `dirpath` and then
96+
computes the hash of that list of hashes to return a single hash value. The
97+
directory is traversed recursively.
98+
99+
Parameters
100+
----------
101+
dirpath : :obj:`str`
102+
Path to directory.
103+
ignore_hidden_files : :obj:`bool`
104+
If `True`, ignore filenames that begin with `.`.
105+
ignore_hidden_dirs : :obj:`bool`
106+
If `True`, ignore files in directories whose names begin with `.`.
107+
raise_notfound : :obj:`bool`
108+
If `True` and `dirpath` does not exist, raise `FileNotFoundError`. If
109+
`False` and `dirpath` does not exist, return `None`.
110+
111+
Returns
112+
-------
113+
hash : :obj:`str`
114+
Hash of the directory contents.
115+
"""
91116
from .specs import LazyField
92117

93118
if dirpath is None or isinstance(dirpath, LazyField) or isinstance(dirpath, list):
94119
return None
95120
if not Path(dirpath).is_dir():
96121
if raise_notfound:
97-
raise RuntimeError(f"Directory {dirpath} not found.")
122+
raise FileNotFoundError(f"Directory {dirpath} not found.")
98123
return None
99124

100-
def search_dir(path):
101-
path = Path(path)
102-
file_list = []
103-
for el in path.iterdir():
104-
if el.is_file():
105-
file_list.append(hash_file(el))
106-
else:
107-
file_list.append(search_dir(path / el))
108-
return file_list
125+
file_hashes = []
126+
for dpath, dirnames, filenames in os.walk(dirpath):
127+
# Sort in-place to guarantee order.
128+
dirnames.sort()
129+
filenames.sort()
130+
dpath = Path(dpath)
131+
if ignore_hidden_dirs and dpath.name.startswith(".") and str(dpath) != dirpath:
132+
continue
133+
for filename in filenames:
134+
if ignore_hidden_files and filename.startswith("."):
135+
continue
136+
this_hash = hash_file(dpath / filename)
137+
file_hashes.append(this_hash)
138+
139+
sha = sha256()
140+
for h in file_hashes:
141+
sha.update(h.encode())
109142

110-
return search_dir(dirpath)
143+
return sha.hexdigest()
111144

112145

113146
def _parse_mount_table(exit_code, output):

pydra/engine/tests/test_helpers.py

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import hashlib
12
from pathlib import Path
3+
import random
24

35
import pytest
46
import cloudpickle as cp
@@ -135,25 +137,49 @@ def test_hash_value_dir(tmpdir):
135137
with open(file_2, "w") as f:
136138
f.write("hi")
137139

138-
assert sorted(hash_value(tmpdir, tp=Directory)) == sorted(
139-
hash_value([file_1, file_2], tp=File)
140-
)
141-
assert hash_value(tmpdir, tp=Directory) == helpers_file.hash_dir(tmpdir)
140+
test_sha = hashlib.sha256()
141+
for fx in [file_1, file_2]:
142+
test_sha.update(helpers_file.hash_file(fx).encode())
143+
144+
bad_sha = hashlib.sha256()
145+
for fx in [file_2, file_1]:
146+
bad_sha.update(helpers_file.hash_file(fx).encode())
147+
148+
orig_hash = helpers_file.hash_dir(tmpdir)
149+
150+
assert orig_hash == test_sha.hexdigest()
151+
assert orig_hash != bad_sha.hexdigest()
152+
assert orig_hash == hash_value(tmpdir, tp=Directory)
142153

143154

144155
def test_hash_value_nested(tmpdir):
156+
hidden = tmpdir.mkdir(".hidden")
145157
nested = tmpdir.mkdir("nested")
146158
file_1 = tmpdir.join("file_1.txt")
147-
file_2 = nested.join("file_2.txt")
148-
file_3 = nested.join("file_3.txt")
149-
with open(file_1, "w") as f:
150-
f.write("hello")
151-
with open(file_2, "w") as f:
152-
f.write("hi")
153-
with open(file_3, "w") as f:
154-
f.write("hola")
159+
file_2 = hidden.join("file_2.txt")
160+
file_3 = nested.join(".file_3.txt")
161+
file_4 = nested.join("file_4.txt")
155162

156-
assert hash_value(tmpdir, tp=Directory) == hash_value(
157-
[file_1, [file_2, file_3]], tp=File
158-
)
159-
assert hash_value(tmpdir, tp=Directory) == helpers_file.hash_dir(tmpdir)
163+
test_sha = hashlib.sha256()
164+
for fx in [file_1, file_2, file_3, file_4]:
165+
with open(fx, "w") as f:
166+
f.write(str(random.randint(0, 1000)))
167+
test_sha.update(helpers_file.hash_file(fx).encode())
168+
169+
orig_hash = helpers_file.hash_dir(tmpdir)
170+
171+
assert orig_hash == test_sha.hexdigest()
172+
assert orig_hash == hash_value(tmpdir, tp=Directory)
173+
174+
nohidden_hash = helpers_file.hash_dir(tmpdir, ignore_hidden_dirs=True, ignore_hidden_files=True)
175+
nohiddendirs_hash = helpers_file.hash_dir(tmpdir, ignore_hidden_dirs=True)
176+
nohiddenfiles_hash = helpers_file.hash_dir(tmpdir, ignore_hidden_files=True)
177+
178+
assert orig_hash != nohidden_hash
179+
assert orig_hash != nohiddendirs_hash
180+
assert orig_hash != nohiddenfiles_hash
181+
182+
file_3.remove()
183+
assert helpers_file.hash_dir(tmpdir) == nohiddenfiles_hash
184+
hidden.remove()
185+
assert helpers_file.hash_dir(tmpdir) == nohidden_hash

0 commit comments

Comments
 (0)