Skip to content

Commit 5712be8

Browse files
committed
Use os.walk to speed up reading from DirectoryStore
Using os.walk for the tree (os.scandir for folders) is faster than listdir, as it avoids many stat calls. This should make the DirectoryStore faster.
1 parent bb6b905 commit 5712be8

File tree

3 files changed

+51
-9
lines changed

3 files changed

+51
-9
lines changed

docs/release.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ Release notes
55
Next release
66
------------
77

8+
* `DirectoryStore` now uses `os.scandir`, which should make listing large stores
9+
faster, :issue:`563`
810
* Fix minor bug in `N5Store`.
911
By :user:`gsakkis`, :issue:`550`.
1012

zarr/storage.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -845,15 +845,27 @@ def __eq__(self, other):
845845

846846
def keys(self):
847847
if os.path.exists(self.path):
848-
directories = [(self.path, '')]
849-
while directories:
850-
dir_name, prefix = directories.pop()
851-
for name in os.listdir(dir_name):
852-
path = os.path.join(dir_name, name)
853-
if os.path.isfile(path):
854-
yield prefix + name
855-
elif os.path.isdir(path):
856-
directories.append((path, prefix + name + '/'))
848+
yield from self._keys_fast(self.path)
849+
850+
@staticmethod
851+
def _keys_fast(path, walker=os.walk):
852+
"""
853+
854+
Faster logic on platforms where the separator is `/`, using
855+
`os.walk()` to decrease the number of stat calls.
856+
857+
"""
858+
it = iter(walker(path))
859+
d0, dirnames, filenames = next(it)
860+
if d0.endswith('/'):
861+
root_len = len(d0)
862+
else:
863+
root_len = len(d0)+1
864+
for f in filenames:
865+
yield f
866+
for dirpath, _, filenames in it:
867+
for f in filenames:
868+
yield dirpath[root_len:].replace('\\', '/')+'/'+f
857869

858870
def __iter__(self):
859871
return self.keys()

zarr/tests/test_storage.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -827,6 +827,34 @@ def test_normalize_keys(self):
827827
assert 'FOO' in store
828828
assert 'foo' in store
829829

830+
def test_listing_keys_slash(self):
831+
832+
def mock_walker_slash(_path):
833+
yield from [
834+
# trailing slash in first key
835+
('root_with_slash/', ['d1', 'g1'], ['.zgroup']),
836+
('root_with_slash/d1', [], ['.zarray']),
837+
('root_with_slash/g1', [], ['.zgroup'])
838+
]
839+
840+
res = set(DirectoryStore._keys_fast('root_with_slash/', walker=mock_walker_slash))
841+
assert res == {'.zgroup', 'g1/.zgroup', 'd1/.zarray'}
842+
843+
def test_listing_keys_no_slash(self):
844+
845+
def mock_walker_no_slash(_path):
846+
yield from [
847+
# no trailing slash in first key
848+
('root_with_no_slash', ['d1', 'g1'], ['.zgroup']),
849+
('root_with_no_slash/d1', [], ['.zarray']),
850+
('root_with_no_slash/g1', [], ['.zgroup'])
851+
]
852+
853+
res = set(
854+
DirectoryStore._keys_fast('root_with_no_slash', mock_walker_no_slash)
855+
)
856+
assert res == {'.zgroup', 'g1/.zgroup', 'd1/.zarray'}
857+
830858

831859
@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec")
832860
class TestFSStore(StoreTests, unittest.TestCase):

0 commit comments

Comments
 (0)