Skip to content

Commit 58541c1

Browse files
authored
repo.py: use fstatat(2) in FastPackageChecker (spack#51710)
* Use `fstatat(2)` so the kernel does not have to resolve the (long) absolute path, instead it just has to resolve `<pkg name>/package.py` relative to the packages dir. * Avoid `os.path.join` in the inner loop, which does all sorts of redundant defensive checks. All we need is append `/package.py`. * Store only mtime, not `stat_result` object, as the rest is unused. On Linux, this reduces runtime by 25% on an NVME disk, from 46.6ms to 35.8ms. Signed-off-by: Harmen Stoppels <me@harmenstoppels.nl>
1 parent 15cdbdb commit 58541c1

File tree

1 file changed

+47
-30
lines changed

1 file changed

+47
-30
lines changed

lib/spack/spack/repo.py

Lines changed: 47 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -343,17 +343,27 @@ def __getattr__(self, name):
343343
return getattr(self, name)
344344

345345

346-
class FastPackageChecker(Mapping[str, os.stat_result]):
347-
"""Cache that maps package names to the stats obtained on the
348-
``package.py`` files associated with them.
346+
@contextlib.contextmanager
347+
def _directory_fd(path: str) -> Generator[Optional[int], None, None]:
348+
if sys.platform == "win32":
349+
yield None
350+
return
351+
352+
fd = os.open(path, os.O_RDONLY)
353+
try:
354+
yield fd
355+
finally:
356+
os.close(fd)
349357

350-
For each repository a cache is maintained at class level, and shared among
351-
all instances referring to it. Update of the global cache is done lazily
352-
during instance initialization.
353-
"""
358+
359+
class FastPackageChecker(Mapping[str, float]):
360+
"""Cache that maps package names to the modification times of their ``package.py`` files.
361+
362+
For each repository a cache is maintained at class level, and shared among all instances
363+
referring to it. Update of the global cache is done lazily during instance initialization."""
354364

355365
#: Global cache, reused by every instance
356-
_paths_cache: Dict[str, Dict[str, os.stat_result]] = {}
366+
_paths_cache: Dict[str, Dict[str, float]] = {}
357367

358368
def __init__(self, packages_path: str, package_api: Tuple[int, int]) -> None:
359369
# The path of the repository managed by this instance
@@ -365,39 +375,45 @@ def __init__(self, packages_path: str, package_api: Tuple[int, int]) -> None:
365375
self._paths_cache[packages_path] = self._create_new_cache()
366376

367377
#: Reference to the appropriate entry in the global cache
368-
self._packages_to_stats = self._paths_cache[packages_path]
378+
self._packages_to_mtime = self._paths_cache[packages_path]
369379

370380
def invalidate(self) -> None:
371381
"""Regenerate cache for this checker."""
372382
self._paths_cache[self.packages_path] = self._create_new_cache()
373-
self._packages_to_stats = self._paths_cache[self.packages_path]
383+
self._packages_to_mtime = self._paths_cache[self.packages_path]
374384

375-
def _create_new_cache(self) -> Dict[str, os.stat_result]:
385+
def _create_new_cache(self) -> Dict[str, float]:
376386
"""Create a new cache for packages in a repo.
377387
378-
The implementation here should try to minimize filesystem
379-
calls. At the moment, it is O(number of packages) and makes
380-
about one stat call per package. This is reasonably fast, and
381-
avoids actually importing packages in Spack, which is slow.
382-
"""
388+
The implementation here should try to minimize filesystem calls. At the moment, it makes
389+
one stat call per package. This is reasonably fast, and avoids actually importing packages
390+
in Spack, which is slow."""
383391
# Create a dictionary that will store the mapping between a
384-
# package name and its stat info
385-
cache: Dict[str, os.stat_result] = {}
386-
with os.scandir(self.packages_path) as entries:
392+
# package name and its mtime
393+
cache: Dict[str, float] = {}
394+
# Don't use os.path.join in the loop cause it's slow and redundant.
395+
package_py_suffix = f"{os.path.sep}{package_file_name}"
396+
397+
# Use a file descriptor for the packages directory to avoid repeated path resolution.
398+
with _directory_fd(self.packages_path) as fd, os.scandir(self.packages_path) as entries:
387399
for entry in entries:
388400
# Construct the file name from the directory
389-
pkg_file = os.path.join(entry.path, package_file_name)
401+
if sys.platform == "win32":
402+
pkg_file = f"{entry.path}{package_py_suffix}"
403+
else:
404+
pkg_file = f"{entry.name}{package_py_suffix}"
390405

391406
try:
392-
sinfo = os.stat(pkg_file)
407+
sinfo = os.stat(pkg_file, dir_fd=fd)
393408
except OSError as e:
394409
if e.errno in (errno.ENOENT, errno.ENOTDIR):
395410
# No package.py file here.
396411
continue
397412
elif e.errno == errno.EACCES:
413+
pkg_file = os.path.join(self.packages_path, entry.name, package_file_name)
398414
tty.warn(f"Can't read package file {pkg_file}.")
399415
continue
400-
raise e
416+
raise
401417

402418
# If it's not a file, skip it.
403419
if not stat.S_ISREG(sinfo.st_mode):
@@ -407,31 +423,32 @@ def _create_new_cache(self) -> Dict[str, os.stat_result]:
407423
# the current package API
408424
if not nm.valid_module_name(entry.name, self.package_api):
409425
x, y = self.package_api
426+
pkg_file = os.path.join(self.packages_path, entry.name, package_file_name)
410427
tty.warn(
411428
f"Package {pkg_file} cannot be used because `{entry.name}` is not a valid "
412429
f"Spack package module name for Package API v{x}.{y}."
413430
)
414431
continue
415432

416-
# Store the stat info by package name.
417-
cache[nm.pkg_dir_to_pkg_name(entry.name, self.package_api)] = sinfo
433+
# Store the mtime by package name.
434+
cache[nm.pkg_dir_to_pkg_name(entry.name, self.package_api)] = sinfo.st_mtime
418435

419436
return cache
420437

421438
def last_mtime(self) -> float:
422-
return max(sinfo.st_mtime for sinfo in self._packages_to_stats.values())
439+
return max(self._packages_to_mtime.values())
423440

424441
def modified_since(self, since: float) -> List[str]:
425-
return [name for name, sinfo in self._packages_to_stats.items() if sinfo.st_mtime > since]
442+
return [name for name, mtime in self._packages_to_mtime.items() if mtime > since]
426443

427-
def __getitem__(self, item: str) -> os.stat_result:
428-
return self._packages_to_stats[item]
444+
def __getitem__(self, item: str) -> float:
445+
return self._packages_to_mtime[item]
429446

430447
def __iter__(self) -> Iterator[str]:
431-
return iter(self._packages_to_stats)
448+
return iter(self._packages_to_mtime)
432449

433450
def __len__(self) -> int:
434-
return len(self._packages_to_stats)
451+
return len(self._packages_to_mtime)
435452

436453

437454
class Indexer(metaclass=abc.ABCMeta):

0 commit comments

Comments
 (0)