Skip to content

Commit 3fb2637

Browse files
committed
Optimize _FlatDirectorySource to only scan each path once
1 parent d1f0981 commit 3fb2637

File tree

2 files changed

+75
-11
lines changed

2 files changed

+75
-11
lines changed

src/pip/_internal/index/collector.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,7 @@ def collect_sources(
473473
page_validator=self.session.is_secure_origin,
474474
expand_dir=False,
475475
cache_link_parsing=False,
476+
project_name=project_name,
476477
)
477478
for loc in self.search_scope.get_index_urls_locations(project_name)
478479
).values()
@@ -483,6 +484,7 @@ def collect_sources(
483484
page_validator=self.session.is_secure_origin,
484485
expand_dir=True,
485486
cache_link_parsing=True,
487+
project_name=project_name,
486488
)
487489
for loc in self.find_links
488490
).values()

src/pip/_internal/index/sources.py

Lines changed: 73 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
11
import logging
22
import mimetypes
33
import os
4-
import pathlib
5-
from typing import Callable, Iterable, Optional, Tuple
4+
from collections import defaultdict
5+
from typing import Callable, Dict, Iterable, List, Optional, Tuple
6+
7+
from pip._vendor.packaging.utils import (
8+
InvalidSdistFilename,
9+
InvalidVersion,
10+
InvalidWheelFilename,
11+
canonicalize_name,
12+
parse_sdist_filename,
13+
parse_wheel_filename,
14+
)
615

716
from pip._internal.models.candidate import InstallationCandidate
817
from pip._internal.models.link import Link
@@ -36,6 +45,53 @@ def _is_html_file(file_url: str) -> bool:
3645
return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"
3746

3847

48+
class _FlatDirectoryToUrls:
    """Scan a directory once and cache the URLs found in it.

    The scan is lazy: it runs the first time ``page_candidates`` or
    ``project_name_to_urls`` is read, and a single pass over the directory
    fills both caches.
    """

    def __init__(self, path: str) -> None:
        self._path = path
        # URLs of directory entries that look like HTML pages.
        self._page_candidates: List[str] = []
        # Project name parsed from a wheel/sdist filename -> URLs of its files.
        self._project_name_to_urls: Dict[str, List[str]] = defaultdict(list)
        self._scanned_directory = False

    def _scan_directory(self) -> None:
        """Scan the directory once, filling both caches in the same pass.

        Results are gathered into locals and published only after the whole
        directory has been read, so a scan that fails part-way leaves the
        caches empty and a later retry cannot record the same URL twice.
        """
        page_candidates: List[str] = []
        project_name_to_urls: Dict[str, List[str]] = defaultdict(list)

        # The context manager closes the directory handle even when the
        # loop is left early by an exception.
        with os.scandir(self._path) as entries:
            for entry in entries:
                url = path_to_url(entry.path)
                if _is_html_file(url):
                    page_candidates.append(url)
                    continue

                # File must have a valid wheel or sdist name,
                # otherwise not worth considering as a package
                try:
                    project_filename = parse_wheel_filename(entry.name)[0]
                except (InvalidWheelFilename, InvalidVersion):
                    try:
                        project_filename = parse_sdist_filename(entry.name)[0]
                    except (InvalidSdistFilename, InvalidVersion):
                        continue

                project_name_to_urls[project_filename].append(url)

        self._page_candidates = page_candidates
        self._project_name_to_urls = project_name_to_urls
        self._scanned_directory = True

    @property
    def page_candidates(self) -> List[str]:
        """URLs of the HTML files in the directory (scans on first use)."""
        if not self._scanned_directory:
            self._scan_directory()

        return self._page_candidates

    @property
    def project_name_to_urls(self) -> Dict[str, List[str]]:
        """Archive URLs grouped by parsed project name (scans on first use)."""
        if not self._scanned_directory:
            self._scan_directory()

        return self._project_name_to_urls
93+
94+
3995
class _FlatDirectorySource(LinkSource):
4096
"""Link source specified by ``--find-links=<path-to-dir>``.
4197
@@ -45,30 +101,34 @@ class _FlatDirectorySource(LinkSource):
45101
* ``file_candidates``: Archives in the directory.
46102
"""
47103

104+
_paths_to_urls: Dict[str, _FlatDirectoryToUrls] = {}
105+
48106
def __init__(
49107
self,
50108
candidates_from_page: CandidatesFromPage,
51109
path: str,
110+
project_name: str,
52111
) -> None:
53112
self._candidates_from_page = candidates_from_page
54-
self._path = pathlib.Path(os.path.realpath(path))
113+
self._project_name = canonicalize_name(project_name)
114+
115+
# Get existing instance of _FlatDirectoryToUrls if it exists
116+
if path in self._paths_to_urls:
117+
self._path_to_urls = self._paths_to_urls[path]
118+
else:
119+
self._path_to_urls = _FlatDirectoryToUrls(path=path)
120+
self._paths_to_urls[path] = self._path_to_urls
55121

56122
@property
57123
def link(self) -> Optional[Link]:
58124
return None
59125

60126
def page_candidates(self) -> FoundCandidates:
61-
for path in self._path.iterdir():
62-
url = path_to_url(str(path))
63-
if not _is_html_file(url):
64-
continue
127+
for url in self._path_to_urls.page_candidates:
65128
yield from self._candidates_from_page(Link(url))
66129

67130
def file_links(self) -> FoundLinks:
68-
for path in self._path.iterdir():
69-
url = path_to_url(str(path))
70-
if _is_html_file(url):
71-
continue
131+
for url in self._path_to_urls.project_name_to_urls[self._project_name]:
72132
yield Link(url)
73133

74134

@@ -170,6 +230,7 @@ def build_source(
170230
page_validator: PageValidator,
171231
expand_dir: bool,
172232
cache_link_parsing: bool,
233+
project_name: str,
173234
) -> Tuple[Optional[str], Optional[LinkSource]]:
174235
path: Optional[str] = None
175236
url: Optional[str] = None
@@ -203,6 +264,7 @@ def build_source(
203264
source = _FlatDirectorySource(
204265
candidates_from_page=candidates_from_page,
205266
path=path,
267+
project_name=project_name,
206268
)
207269
else:
208270
source = _IndexDirectorySource(

0 commit comments

Comments
 (0)