From 3fb263701829541db79529369da3a129264dd5bc Mon Sep 17 00:00:00 2001 From: Damian Shaw Date: Wed, 18 Oct 2023 18:58:38 -0400 Subject: [PATCH 1/3] Optimize _FlatDirectorySource to only scan each path once --- src/pip/_internal/index/collector.py | 2 + src/pip/_internal/index/sources.py | 84 ++++++++++++++++++++++++---- 2 files changed, 75 insertions(+), 11 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index b3e293ea3..08c8bddcb 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -473,6 +473,7 @@ class LinkCollector: page_validator=self.session.is_secure_origin, expand_dir=False, cache_link_parsing=False, + project_name=project_name, ) for loc in self.search_scope.get_index_urls_locations(project_name) ).values() @@ -483,6 +484,7 @@ class LinkCollector: page_validator=self.session.is_secure_origin, expand_dir=True, cache_link_parsing=True, + project_name=project_name, ) for loc in self.find_links ).values() diff --git a/src/pip/_internal/index/sources.py b/src/pip/_internal/index/sources.py index cd9cb8d40..f4626d71a 100644 --- a/src/pip/_internal/index/sources.py +++ b/src/pip/_internal/index/sources.py @@ -1,8 +1,17 @@ import logging import mimetypes import os -import pathlib -from typing import Callable, Iterable, Optional, Tuple +from collections import defaultdict +from typing import Callable, Dict, Iterable, List, Optional, Tuple + +from pip._vendor.packaging.utils import ( + InvalidSdistFilename, + InvalidVersion, + InvalidWheelFilename, + canonicalize_name, + parse_sdist_filename, + parse_wheel_filename, +) from pip._internal.models.candidate import InstallationCandidate from pip._internal.models.link import Link @@ -36,6 +45,53 @@ def _is_html_file(file_url: str) -> bool: return mimetypes.guess_type(file_url, strict=False)[0] == "text/html" +class _FlatDirectoryToUrls: + """Scans directory and caches results""" + + def __init__(self, path: str) -> None: + self._path = path + self._page_candidates: List[str] = [] + self._project_name_to_urls: Dict[str, List[str]] = defaultdict(list) + self._scanned_directory = False + + def _scan_directory(self) -> None: + """Scans directory once and populates both page_candidates + and project_name_to_urls at the same time + """ + for entry in os.scandir(self._path): + url = path_to_url(entry.path) + if _is_html_file(url): + self._page_candidates.append(url) + continue + + # File must have a valid wheel or sdist name, + # otherwise not worth considering as a package + try: + project_filename = parse_wheel_filename(entry.name)[0] + except (InvalidWheelFilename, InvalidVersion): + try: + project_filename = parse_sdist_filename(entry.name)[0] + except (InvalidSdistFilename, InvalidVersion): + continue + + self._project_name_to_urls[project_filename].append(url) + self._scanned_directory = True + + @property + def page_candidates(self) -> List[str]: + if not self._scanned_directory: + self._scan_directory() + + return self._page_candidates + + @property + def project_name_to_urls(self) -> Dict[str, List[str]]: + if not self._scanned_directory: + self._scan_directory() + + return self._project_name_to_urls + + class _FlatDirectorySource(LinkSource): """Link source specified by ``--find-links=``. @@ -45,30 +101,34 @@ class _FlatDirectorySource(LinkSource): * ``file_candidates``: Archives in the directory. """ + _paths_to_urls: Dict[str, _FlatDirectoryToUrls] = {} + def __init__( self, candidates_from_page: CandidatesFromPage, path: str, + project_name: str, ) -> None: self._candidates_from_page = candidates_from_page - self._path = pathlib.Path(os.path.realpath(path)) + self._project_name = canonicalize_name(project_name) + + # Get existing instance of _FlatDirectoryToUrls if it exists + if path in self._paths_to_urls: + self._path_to_urls = self._paths_to_urls[path] + else: + self._path_to_urls = _FlatDirectoryToUrls(path=path) + self._paths_to_urls[path] = self._path_to_urls @property def link(self) -> Optional[Link]: return None def page_candidates(self) -> FoundCandidates: - for path in self._path.iterdir(): - url = path_to_url(str(path)) - if not _is_html_file(url): - continue + for url in self._path_to_urls.page_candidates: yield from self._candidates_from_page(Link(url)) def file_links(self) -> FoundLinks: - for path in self._path.iterdir(): - url = path_to_url(str(path)) - if _is_html_file(url): - continue + for url in self._path_to_urls.project_name_to_urls[self._project_name]: yield Link(url) @@ -170,6 +230,7 @@ def build_source( page_validator: PageValidator, expand_dir: bool, cache_link_parsing: bool, + project_name: str, ) -> Tuple[Optional[str], Optional[LinkSource]]: path: Optional[str] = None url: Optional[str] = None @@ -203,6 +264,7 @@ def build_source( source = _FlatDirectorySource( candidates_from_page=candidates_from_page, path=path, + project_name=project_name, ) else: source = _IndexDirectorySource( From 125ce542cc77d20a65f39a05f828fa38aff1d094 Mon Sep 17 00:00:00 2001 From: Damian Shaw Date: Wed, 18 Oct 2023 18:58:53 -0400 Subject: [PATCH 2/3] Fix and add tests for optimized _FlatDirectorySource --- tests/functional/test_install_config.py | 6 --- tests/unit/test_collector.py | 55 ++++++++++++++++++++++--- tests/unit/test_finder.py | 5 ++- 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/tests/functional/test_install_config.py b/tests/functional/test_install_config.py index ecaf2f705..5c7b23ed7 100644 --- a/tests/functional/test_install_config.py +++ b/tests/functional/test_install_config.py @@ -125,9 +125,6 @@ def test_command_line_append_flags( "Fetching project page and analyzing links: https://test.pypi.org" in result.stdout ) - assert ( - f"Skipping link: not a file: {data.find_links}" in result.stdout - ), f"stdout: {result.stdout}" @pytest.mark.network @@ -151,9 +148,6 @@ def test_command_line_appends_correctly( "Fetching project page and analyzing links: https://test.pypi.org" in result.stdout ), result.stdout - assert ( - f"Skipping link: not a file: {data.find_links}" in result.stdout - ), f"stdout: {result.stdout}" def test_config_file_override_stack( diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 3c8b81de4..1ad431e39 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -862,7 +862,7 @@ def test_collect_sources__file_expand_dir(data: TestData) -> None: ) sources = collector.collect_sources( # Shouldn't be used. - project_name=None, # type: ignore[arg-type] + project_name="", candidates_from_page=None, # type: ignore[arg-type] ) assert ( @@ -960,7 +960,7 @@ class TestLinkCollector: session=link_collector.session, ) - def test_collect_sources( + def test_collect_page_sources( self, caplog: pytest.LogCaptureFixture, data: TestData ) -> None: caplog.set_level(logging.DEBUG) @@ -993,9 +993,8 @@ class TestLinkCollector: files = list(files_it) pages = list(pages_it) - # Spot-check the returned sources. - assert len(files) > 20 - check_links_include(files, names=["simple-1.0.tar.gz"]) + # Only "twine" should return from collecting sources + assert len(files) == 1 assert [page.link for page in pages] == [Link("https://pypi.org/simple/twine/")] # Check that index URLs are marked as *un*cacheable. @@ -1010,6 +1009,52 @@ class TestLinkCollector: ("pip._internal.index.collector", logging.DEBUG, expected_message), ] + def test_collect_file_sources( + self, caplog: pytest.LogCaptureFixture, data: TestData + ) -> None: + caplog.set_level(logging.DEBUG) + + link_collector = make_test_link_collector( + find_links=[data.find_links], + # Include two copies of the URL to check that the second one + # is skipped. + index_urls=[PyPI.simple_url, PyPI.simple_url], + ) + collected_sources = link_collector.collect_sources( + "singlemodule", + candidates_from_page=lambda link: [ + InstallationCandidate("singlemodule", "0.0.1", link) + ], + ) + + files_it = itertools.chain.from_iterable( + source.file_links() + for sources in collected_sources + for source in sources + if source is not None + ) + pages_it = itertools.chain.from_iterable( + source.page_candidates() + for sources in collected_sources + for source in sources + if source is not None + ) + files = list(files_it) + _ = list(pages_it) + + # singlemodule should return files + assert len(files) > 0 + check_links_include(files, names=["singlemodule-0.0.1.tar.gz"]) + + expected_message = dedent( + """\ + 1 location(s) to search for versions of singlemodule: + * https://pypi.org/simple/singlemodule/""" + ) + assert caplog.record_tuples == [ + ("pip._internal.index.collector", logging.DEBUG, expected_message), + ] + @pytest.mark.parametrize( "find_links, no_index, suppress_no_index, expected", diff --git a/tests/unit/test_finder.py b/tests/unit/test_finder.py index 3404d1498..35c7e89b7 100644 --- a/tests/unit/test_finder.py +++ b/tests/unit/test_finder.py @@ -128,7 +128,10 @@ class TestWheel: with pytest.raises(DistributionNotFound): finder.find_requirement(req, True) - assert "Skipping link: invalid wheel filename:" in caplog.text + assert ( + "Could not find a version that satisfies the requirement invalid" + " (from versions:" in caplog.text + ) def test_not_find_wheel_not_supported(self, data: TestData) -> None: """ From dc0b3138e89aa6ec8a8a8361f62482d2e41c1856 Mon Sep 17 00:00:00 2001 From: Damian Shaw Date: Wed, 18 Oct 2023 18:59:02 -0400 Subject: [PATCH 3/3] Add news entry --- news/12327.bugfix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 news/12327.bugfix.rst diff --git a/news/12327.bugfix.rst b/news/12327.bugfix.rst new file mode 100644 index 000000000..b07ef130a --- /dev/null +++ b/news/12327.bugfix.rst @@ -0,0 +1 @@ +Optimized usage of ``--find-links=``, by only scanning the relevant directory once, only considering file names that are valid wheel or sdist names, and only considering files in the directory that are related to the install.