Skip to content

Commit 9b223ca

Browse files
committed
GH-125413: pathlib: use scandir() to speed up copy()
Use the new `PathBase.scandir()` method in `PathBase.copy()`, which greatly reduces the number of `PathBase.stat()` calls needed when copying. This also speeds up `Path.copy()`, which inherits the superclass implementation. Under the hood, we use directory entries to distinguish between files, directories and symlinks, and to retrieve a `stat_result` when reading metadata. This logic is extracted into a new `pathlib._abc.CopierBase` class, which helps reduce the number of underscore-prefixed support methods in the path interface.
1 parent 260843d commit 9b223ca

File tree

5 files changed: +170 −144 lines changed

Lib/pathlib/_abc.py

Lines changed: 115 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,110 @@ def isabs(self, path):
8787
raise UnsupportedOperation(self._unsupported_msg('isabs()'))
8888

8989

90+
class CopierBase:
91+
"""Base class for path copiers, which transfer files and directories from
92+
one path object to another.
93+
94+
A reference to this class is available as PathBase._copier. When
95+
PathBase.copy() is called, it uses the copier type of the *target* path to
96+
perform the copy; this allows writing of data and metadata to occur
97+
together (or in a particular order) where supported or required by the
98+
path type.
99+
"""
100+
__slots__ = ('follow_symlinks', 'dirs_exist_ok', 'preserve_metadata')
101+
102+
def __init__(self, follow_symlinks=True, dirs_exist_ok=False,
103+
preserve_metadata=False):
104+
self.follow_symlinks = follow_symlinks
105+
self.dirs_exist_ok = dirs_exist_ok
106+
self.preserve_metadata = preserve_metadata
107+
108+
@classmethod
109+
def ensure_different_files(cls, source, target):
110+
"""Raise OSError(EINVAL) if both paths refer to the same file."""
111+
try:
112+
if not target.samefile(source):
113+
return
114+
except (OSError, ValueError):
115+
return
116+
err = OSError(EINVAL, "Source and target are the same file")
117+
err.filename = str(source)
118+
err.filename2 = str(target)
119+
raise err
120+
121+
@classmethod
122+
def ensure_distinct_paths(cls, source, target):
123+
"""Raise OSError(EINVAL) if the target is within the source path."""
124+
# Note: there is no straightforward, foolproof algorithm to determine
125+
# if one directory is within another (a particularly perverse example
126+
# would be a single network share mounted in one location via NFS, and
127+
# in another location via CIFS), so we simply check whether the
128+
# target path is lexically equal to, or within, the source path.
129+
if source == target:
130+
err = OSError(EINVAL, "Source and target are the same path")
131+
elif source in target.parents:
132+
err = OSError(EINVAL, "Source path is a parent of target path")
133+
else:
134+
return
135+
err.filename = str(source)
136+
err.filename2 = str(target)
137+
raise err
138+
139+
def copy(self, source, target):
140+
"""Copy the given file or directory tree to the given target."""
141+
self.ensure_distinct_paths(source, target)
142+
if self.preserve_metadata:
143+
metadata_keys = source._readable_metadata & target._writable_metadata
144+
else:
145+
metadata_keys = frozenset()
146+
if not self.follow_symlinks and source.is_symlink():
147+
self.copy_symlink(source, target, metadata_keys)
148+
elif source.is_dir():
149+
self.copy_dir(source, target, metadata_keys)
150+
else:
151+
self.copy_file(source, target, metadata_keys)
152+
153+
def copy_dir(self, source, target, metadata_keys, dir_entry=None):
154+
"""Copy the given directory to the given target."""
155+
metadata = source._read_metadata(metadata_keys, dir_entry=dir_entry)
156+
with source.scandir() as entries:
157+
target.mkdir(exist_ok=self.dirs_exist_ok)
158+
for entry in entries:
159+
src = source.joinpath(entry.name)
160+
dst = target.joinpath(entry.name)
161+
if not self.follow_symlinks and entry.is_symlink():
162+
self.copy_symlink(src, dst, metadata_keys, entry)
163+
elif entry.is_dir():
164+
self.copy_dir(src, dst, metadata_keys, entry)
165+
else:
166+
self.copy_file(src, dst, metadata_keys, entry)
167+
target._write_metadata(metadata)
168+
169+
def copy_file(self, source, target, metadata_keys, dir_entry=None):
170+
"""Copy the given file to the given target."""
171+
self.ensure_different_files(source, target)
172+
metadata = source._read_metadata(metadata_keys, dir_entry=dir_entry)
173+
with source.open('rb') as source_f:
174+
try:
175+
with target.open('wb') as target_f:
176+
copyfileobj(source_f, target_f)
177+
except IsADirectoryError as e:
178+
if not target.exists():
179+
# Raise a less confusing exception.
180+
raise FileNotFoundError(
181+
f'Directory does not exist: {target}') from e
182+
else:
183+
raise
184+
target._write_metadata(metadata)
185+
186+
def copy_symlink(self, source, target, metadata_keys, dir_entry=None):
187+
"""Copy the given symlink to the given target."""
188+
metadata = source._read_metadata(
189+
metadata_keys, follow_symlinks=False, dir_entry=dir_entry)
190+
target.symlink_to(source.readlink())
191+
target._write_metadata(metadata, follow_symlinks=False)
192+
193+
90194
class PathGlobber(_GlobberBase):
91195
"""
92196
Class providing shell-style globbing for path objects.
@@ -425,6 +529,9 @@ class PathBase(PurePathBase):
425529

426530
# Maximum number of symlinks to follow in resolve()
427531
_max_symlinks = 40
532+
_copier = CopierBase
533+
_readable_metadata = frozenset()
534+
_writable_metadata = frozenset()
428535

429536
@classmethod
430537
def _unsupported_msg(cls, attribute):
@@ -565,39 +672,6 @@ def samefile(self, other_path):
565672
return (st.st_ino == other_st.st_ino and
566673
st.st_dev == other_st.st_dev)
567674

568-
def _ensure_different_file(self, other_path):
569-
"""
570-
Raise OSError(EINVAL) if both paths refer to the same file.
571-
"""
572-
try:
573-
if not self.samefile(other_path):
574-
return
575-
except (OSError, ValueError):
576-
return
577-
err = OSError(EINVAL, "Source and target are the same file")
578-
err.filename = str(self)
579-
err.filename2 = str(other_path)
580-
raise err
581-
582-
def _ensure_distinct_path(self, other_path):
583-
"""
584-
Raise OSError(EINVAL) if the other path is within this path.
585-
"""
586-
# Note: there is no straightforward, foolproof algorithm to determine
587-
# if one directory is within another (a particularly perverse example
588-
# would be a single network share mounted in one location via NFS, and
589-
# in another location via CIFS), so we simply checks whether the
590-
# other path is lexically equal to, or within, this path.
591-
if self == other_path:
592-
err = OSError(EINVAL, "Source and target are the same path")
593-
elif self in other_path.parents:
594-
err = OSError(EINVAL, "Source path is a parent of target path")
595-
else:
596-
return
597-
err.filename = str(self)
598-
err.filename2 = str(other_path)
599-
raise err
600-
601675
def open(self, mode='r', buffering=-1, encoding=None,
602676
errors=None, newline=None):
603677
"""
@@ -805,13 +879,6 @@ def symlink_to(self, target, target_is_directory=False):
805879
"""
806880
raise UnsupportedOperation(self._unsupported_msg('symlink_to()'))
807881

808-
def _symlink_to_target_of(self, link):
809-
"""
810-
Make this path a symlink with the same target as the given link. This
811-
is used by copy().
812-
"""
813-
self.symlink_to(link.readlink())
814-
815882
def hardlink_to(self, target):
816883
"""
817884
Make this path a hard link pointing to the same file as *target*.
@@ -832,74 +899,31 @@ def mkdir(self, mode=0o777, parents=False, exist_ok=False):
832899
"""
833900
raise UnsupportedOperation(self._unsupported_msg('mkdir()'))
834901

835-
# Metadata keys supported by this path type.
836-
_readable_metadata = _writable_metadata = frozenset()
837-
838-
def _read_metadata(self, keys=None, *, follow_symlinks=True):
902+
def _read_metadata(self, metadata_keys, *, follow_symlinks=True, dir_entry=None):
839903
"""
840904
Returns path metadata as a dict with string keys.
841905
"""
906+
if not metadata_keys:
907+
return {}
842908
raise UnsupportedOperation(self._unsupported_msg('_read_metadata()'))
843909

844910
def _write_metadata(self, metadata, *, follow_symlinks=True):
845911
"""
846912
Sets path metadata from the given dict with string keys.
847913
"""
914+
if not metadata:
915+
return
848916
raise UnsupportedOperation(self._unsupported_msg('_write_metadata()'))
849917

850-
def _copy_metadata(self, target, *, follow_symlinks=True):
851-
"""
852-
Copies metadata (permissions, timestamps, etc) from this path to target.
853-
"""
854-
# Metadata types supported by both source and target.
855-
keys = self._readable_metadata & target._writable_metadata
856-
if keys:
857-
metadata = self._read_metadata(keys, follow_symlinks=follow_symlinks)
858-
target._write_metadata(metadata, follow_symlinks=follow_symlinks)
859-
860-
def _copy_file(self, target):
861-
"""
862-
Copy the contents of this file to the given target.
863-
"""
864-
self._ensure_different_file(target)
865-
with self.open('rb') as source_f:
866-
try:
867-
with target.open('wb') as target_f:
868-
copyfileobj(source_f, target_f)
869-
except IsADirectoryError as e:
870-
if not target.exists():
871-
# Raise a less confusing exception.
872-
raise FileNotFoundError(
873-
f'Directory does not exist: {target}') from e
874-
else:
875-
raise
876-
877918
def copy(self, target, *, follow_symlinks=True, dirs_exist_ok=False,
878919
preserve_metadata=False):
879920
"""
880921
Recursively copy this file or directory tree to the given destination.
881922
"""
882923
if not isinstance(target, PathBase):
883924
target = self.with_segments(target)
884-
self._ensure_distinct_path(target)
885-
stack = [(self, target)]
886-
while stack:
887-
src, dst = stack.pop()
888-
if not follow_symlinks and src.is_symlink():
889-
dst._symlink_to_target_of(src)
890-
if preserve_metadata:
891-
src._copy_metadata(dst, follow_symlinks=False)
892-
elif src.is_dir():
893-
children = src.iterdir()
894-
dst.mkdir(exist_ok=dirs_exist_ok)
895-
stack.extend((child, dst.joinpath(child.name))
896-
for child in children)
897-
if preserve_metadata:
898-
src._copy_metadata(dst)
899-
else:
900-
src._copy_file(dst)
901-
if preserve_metadata:
902-
src._copy_metadata(dst)
925+
copier = target._copier(follow_symlinks, dirs_exist_ok, preserve_metadata)
926+
copier.copy(self, target)
903927
return target
904928

905929
def copy_into(self, target_dir, *, follow_symlinks=True,
@@ -946,7 +970,7 @@ def move(self, target):
946970
"""
947971
Recursively move this file or directory tree to the given destination.
948972
"""
949-
self._ensure_different_file(target)
973+
target._copier.ensure_different_files(self, target)
950974
try:
951975
return self.replace(target)
952976
except UnsupportedOperation:

Lib/pathlib/_local.py

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
from pathlib._os import (copyfile, file_metadata_keys, read_file_metadata,
2121
write_file_metadata)
22-
from pathlib._abc import UnsupportedOperation, PurePathBase, PathBase
22+
from pathlib._abc import UnsupportedOperation, PurePathBase, PathBase, CopierBase
2323

2424

2525
__all__ = [
@@ -57,6 +57,33 @@ def __repr__(self):
5757
return "<{}.parents>".format(type(self._path).__name__)
5858

5959

60+
class _Copier(CopierBase):
61+
"""Copier class that uses fast OS copy routine where possible, and ensures
62+
symlinks' target_is_directory argument is properly set on Windows.
63+
"""
64+
__slots__ = ()
65+
66+
if copyfile:
67+
def copy_file(self, source, target, metadata_keys, dir_entry=None):
68+
"""Copy the given file to the given target."""
69+
try:
70+
source = os.fspath(source)
71+
except TypeError:
72+
if not isinstance(source, PathBase):
73+
raise
74+
CopierBase.copy_file(self, source, target, metadata_keys, dir_entry)
75+
else:
76+
copyfile(source, os.fspath(target))
77+
78+
if os.name == 'nt':
79+
def copy_symlink(self, source, target, metadata_keys, dir_entry=None):
80+
"""Copy the given symlink to the given target."""
81+
metadata = source._read_metadata(
82+
metadata_keys, follow_symlinks=False, dir_entry=dir_entry)
83+
target.symlink_to(source.readlink(), (dir_entry or source).is_dir())
84+
target._write_metadata(metadata, follow_symlinks=False)
85+
86+
6087
class PurePath(PurePathBase):
6188
"""Base class for manipulating paths without I/O.
6289
@@ -512,6 +539,11 @@ class Path(PathBase, PurePath):
512539
but cannot instantiate a WindowsPath on a POSIX system or vice versa.
513540
"""
514541
__slots__ = ()
542+
_copier = _Copier
543+
_readable_metadata = file_metadata_keys
544+
_writable_metadata = file_metadata_keys
545+
_read_metadata = read_file_metadata
546+
_write_metadata = write_file_metadata
515547
as_uri = PurePath.as_uri
516548

517549
@classmethod
@@ -789,24 +821,6 @@ def mkdir(self, mode=0o777, parents=False, exist_ok=False):
789821
if not exist_ok or not self.is_dir():
790822
raise
791823

792-
_readable_metadata = _writable_metadata = file_metadata_keys
793-
_read_metadata = read_file_metadata
794-
_write_metadata = write_file_metadata
795-
796-
if copyfile:
797-
def _copy_file(self, target):
798-
"""
799-
Copy the contents of this file to the given target.
800-
"""
801-
try:
802-
target = os.fspath(target)
803-
except TypeError:
804-
if not isinstance(target, PathBase):
805-
raise
806-
PathBase._copy_file(self, target)
807-
else:
808-
copyfile(os.fspath(self), target)
809-
810824
def chmod(self, mode, *, follow_symlinks=True):
811825
"""
812826
Change the permissions of the path, like os.chmod().
@@ -869,14 +883,6 @@ def symlink_to(self, target, target_is_directory=False):
869883
"""
870884
os.symlink(target, self, target_is_directory)
871885

872-
if os.name == 'nt':
873-
def _symlink_to_target_of(self, link):
874-
"""
875-
Make this path a symlink with the same target as the given link.
876-
This is used by copy().
877-
"""
878-
self.symlink_to(link.readlink(), link.is_dir())
879-
880886
if hasattr(os, "link"):
881887
def hardlink_to(self, target):
882888
"""

0 commit comments

Comments
 (0)