Skip to content

Commit 811dc45

Browse files
authored
Merge pull request #700 from tclose/local-cache-ids
Caching of file-set hashes by local path and mtimes
2 parents ff01e4c + 921979c commit 811dc45

File tree

8 files changed

+433
-56
lines changed

8 files changed

+433
-56
lines changed

pydra/engine/specs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def _compute_hashes(self) -> ty.Tuple[bytes, ty.Dict[str, bytes]]:
102102
if "container_path" in field.metadata:
103103
continue
104104
inp_dict[field.name] = getattr(self, field.name)
105-
hash_cache = Cache({})
105+
hash_cache = Cache()
106106
field_hashes = {
107107
k: hash_function(v, cache=hash_cache) for k, v in inp_dict.items()
108108
}

pydra/engine/submitter.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .workers import Worker, WORKERS
88
from .core import is_workflow
99
from .helpers import get_open_loop, load_and_run_async
10+
from ..utils.hash import PersistentCache
1011

1112
import logging
1213

@@ -54,6 +55,7 @@ def __call__(self, runnable, cache_locations=None, rerun=False, environment=None
5455
self.loop.run_until_complete(
5556
self.submit_from_call(runnable, rerun, environment)
5657
)
58+
PersistentCache().clean_up()
5759
return runnable.result()
5860

5961
async def submit_from_call(self, runnable, rerun, environment):

pydra/engine/tests/test_node_task.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
11
import os
22
import shutil
33
import attr
4+
import typing as ty
45
import numpy as np
6+
import time
7+
from unittest import mock
8+
from pathlib import Path
59
import pytest
10+
import time
11+
from fileformats.generic import File
12+
import pydra.mark
613

714
from .utils import (
815
fun_addtwo,
@@ -306,6 +313,7 @@ def test_task_init_7(tmp_path):
306313
output_dir1 = nn1.output_dir
307314

308315
# changing the content of the file
316+
time.sleep(2) # need the mtime to be different
309317
file2 = tmp_path / "file2.txt"
310318
with open(file2, "w") as f:
311319
f.write("from pydra")
@@ -1560,3 +1568,98 @@ def test_task_state_cachelocations_updated(plugin, tmp_path):
15601568
# both workflows should be run
15611569
assert all([dir.exists() for dir in nn.output_dir])
15621570
assert all([dir.exists() for dir in nn2.output_dir])
1571+
1572+
1573+
def test_task_files_cachelocations(plugin_dask_opt, tmp_path):
1574+
"""
1575+
Two identical tasks with provided cache_dir that use file as an input;
1576+
the second task has cache_locations and should not recompute the results
1577+
"""
1578+
cache_dir = tmp_path / "test_task_nostate"
1579+
cache_dir.mkdir()
1580+
cache_dir2 = tmp_path / "test_task_nostate2"
1581+
cache_dir2.mkdir()
1582+
input_dir = tmp_path / "input"
1583+
input_dir.mkdir()
1584+
1585+
input1 = input_dir / "input1.txt"
1586+
input1.write_text("test")
1587+
input2 = input_dir / "input2.txt"
1588+
input2.write_text("test")
1589+
1590+
nn = fun_file(name="NA", filename=input1, cache_dir=cache_dir)
1591+
with Submitter(plugin=plugin_dask_opt) as sub:
1592+
sub(nn)
1593+
1594+
nn2 = fun_file(
1595+
name="NA", filename=input2, cache_dir=cache_dir2, cache_locations=cache_dir
1596+
)
1597+
with Submitter(plugin=plugin_dask_opt) as sub:
1598+
sub(nn2)
1599+
1600+
# checking the results
1601+
results2 = nn2.result()
1602+
assert results2.output.out == "test"
1603+
1604+
# checking if the second task didn't run the interface again
1605+
assert nn.output_dir.exists()
1606+
assert not nn2.output_dir.exists()
1607+
1608+
1609+
class OverriddenContentsFile(File):
1610+
"""A class for testing purposes, to that enables you to override the contents
1611+
of the file to allow you to check whether the persistent cache is used."""
1612+
1613+
def __init__(
1614+
self,
1615+
fspaths: ty.Iterator[Path],
1616+
contents: ty.Optional[bytes] = None,
1617+
metadata: ty.Dict[str, ty.Any] = None,
1618+
):
1619+
super().__init__(fspaths, metadata=metadata)
1620+
self._contents = contents
1621+
1622+
def byte_chunks(self, **kwargs) -> ty.Generator[ty.Tuple[str, bytes], None, None]:
1623+
if self._contents is not None:
1624+
yield (str(self.fspath), iter([self._contents]))
1625+
else:
1626+
yield from super().byte_chunks(**kwargs)
1627+
1628+
@property
1629+
def contents(self):
1630+
if self._contents is not None:
1631+
return self._contents
1632+
return super().contents
1633+
1634+
1635+
def test_task_files_persistentcache(tmp_path):
1636+
"""
1637+
Two identical tasks with provided cache_dir that use file as an input;
1638+
the second task has cache_locations and should not recompute the results
1639+
"""
1640+
test_file_path = tmp_path / "test_file.txt"
1641+
test_file_path.write_bytes(b"foo")
1642+
cache_dir = tmp_path / "cache-dir"
1643+
cache_dir.mkdir()
1644+
test_file = OverriddenContentsFile(test_file_path)
1645+
1646+
@pydra.mark.task
1647+
def read_contents(x: OverriddenContentsFile) -> bytes:
1648+
return x.contents
1649+
1650+
assert (
1651+
read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out
1652+
== b"foo"
1653+
)
1654+
test_file._contents = b"bar"
1655+
# should return result from the first run using the persistent cache
1656+
assert (
1657+
read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out
1658+
== b"foo"
1659+
)
1660+
time.sleep(2) # Windows has a 2-second resolution for mtime
1661+
test_file_path.touch() # update the mtime to invalidate the persistent cache value
1662+
assert (
1663+
read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out
1664+
== b"bar"
1665+
) # returns the overridden value

pydra/engine/tests/test_specs.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import attrs
55
from copy import deepcopy
6+
import time
67

78
from ..specs import (
89
BaseSpec,
@@ -163,6 +164,7 @@ def test_input_file_hash_2(tmp_path):
163164
assert hash1 == hash2
164165

165166
# checking if different content (the same name) affects the hash
167+
time.sleep(2) # ensure mtime is different
166168
file_diffcontent = tmp_path / "in_file_1.txt"
167169
with open(file_diffcontent, "w") as f:
168170
f.write("hi")
@@ -193,6 +195,7 @@ def test_input_file_hash_2a(tmp_path):
193195
assert hash1 == hash2
194196

195197
# checking if different content (the same name) affects the hash
198+
time.sleep(2) # ensure mtime is different
196199
file_diffcontent = tmp_path / "in_file_1.txt"
197200
with open(file_diffcontent, "w") as f:
198201
f.write("hi")
@@ -234,6 +237,7 @@ def test_input_file_hash_3(tmp_path):
234237
# assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename])
235238

236239
# recreating the file
240+
time.sleep(2) # ensure mtime is different
237241
with open(file, "w") as f:
238242
f.write("hello")
239243

@@ -288,6 +292,7 @@ def test_input_file_hash_4(tmp_path):
288292
assert hash1 == hash2
289293

290294
# checking if different content (the same name) affects the hash
295+
time.sleep(2) # need the mtime to be different
291296
file_diffcontent = tmp_path / "in_file_1.txt"
292297
with open(file_diffcontent, "w") as f:
293298
f.write("hi")
@@ -324,6 +329,7 @@ def test_input_file_hash_5(tmp_path):
324329
assert hash1 == hash2
325330

326331
# checking if different content (the same name) affects the hash
332+
time.sleep(2) # ensure mtime is different
327333
file_diffcontent = tmp_path / "in_file_1.txt"
328334
with open(file_diffcontent, "w") as f:
329335
f.write("hi")

pydra/utils/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from pathlib import Path
2+
import platformdirs
3+
from pydra._version import __version__
4+
5+
user_cache_dir = Path(
6+
platformdirs.user_cache_dir(
7+
appname="pydra",
8+
appauthor="nipype",
9+
version=__version__,
10+
)
11+
)

0 commit comments

Comments
 (0)