This commit is contained in:
Damian Shaw 2023-11-28 19:25:11 -05:00 committed by GitHub
commit ba3b272c06
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 130 additions and 23 deletions

1
news/12327.bugfix.rst Normal file
View File

@@ -0,0 +1 @@
Optimized usage of ``--find-links=<path-to-dir>``, by only scanning the relevant directory once, only considering file names that are valid wheel or sdist names, and only considering files in the directory that are related to the install.

View File

@@ -473,6 +473,7 @@ class LinkCollector:
page_validator=self.session.is_secure_origin,
expand_dir=False,
cache_link_parsing=False,
project_name=project_name,
)
for loc in self.search_scope.get_index_urls_locations(project_name)
).values()
@@ -483,6 +484,7 @@ class LinkCollector:
page_validator=self.session.is_secure_origin,
expand_dir=True,
cache_link_parsing=True,
project_name=project_name,
)
for loc in self.find_links
).values()

View File

@@ -1,8 +1,17 @@
import logging
import mimetypes
import os
import pathlib
from typing import Callable, Iterable, Optional, Tuple
from collections import defaultdict
from typing import Callable, Dict, Iterable, List, Optional, Tuple
from pip._vendor.packaging.utils import (
InvalidSdistFilename,
InvalidVersion,
InvalidWheelFilename,
canonicalize_name,
parse_sdist_filename,
parse_wheel_filename,
)
from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.link import Link
@@ -36,6 +45,53 @@ def _is_html_file(file_url: str) -> bool:
return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"
class _FlatDirectoryToUrls:
    """Scans a ``--find-links`` directory once and caches the results.

    One pass over the directory partitions entries into HTML pages
    (``page_candidates``) and package files, the latter keyed by parsed
    project name so per-project lookups are O(1).
    """

    def __init__(self, path: str) -> None:
        self._path = path
        self._page_candidates: List[str] = []
        self._project_name_to_urls: Dict[str, List[str]] = defaultdict(list)
        self._scanned_directory = False

    def _scan_directory(self) -> None:
        """Scan the directory once, populating both caches in a single pass."""
        # Use the context-manager form so the scandir iterator (and its
        # file descriptor) is closed promptly instead of waiting for GC.
        with os.scandir(self._path) as entries:
            for entry in entries:
                url = path_to_url(entry.path)
                if _is_html_file(url):
                    self._page_candidates.append(url)
                    continue

                # An entry is only worth considering as a package if its
                # name parses as a valid wheel or sdist filename.
                try:
                    project_name = parse_wheel_filename(entry.name)[0]
                except (InvalidWheelFilename, InvalidVersion):
                    try:
                        project_name = parse_sdist_filename(entry.name)[0]
                    except (InvalidSdistFilename, InvalidVersion):
                        continue

                self._project_name_to_urls[project_name].append(url)
        self._scanned_directory = True

    @property
    def page_candidates(self) -> List[str]:
        # Scan lazily on first access so unused sources cost nothing.
        if not self._scanned_directory:
            self._scan_directory()
        return self._page_candidates

    @property
    def project_name_to_urls(self) -> Dict[str, List[str]]:
        # Scan lazily on first access so unused sources cost nothing.
        if not self._scanned_directory:
            self._scan_directory()
        return self._project_name_to_urls
class _FlatDirectorySource(LinkSource):
"""Link source specified by ``--find-links=<path-to-dir>``.
@@ -45,30 +101,34 @@ class _FlatDirectorySource(LinkSource):
* ``file_candidates``: Archives in the directory.
"""
_paths_to_urls: Dict[str, _FlatDirectoryToUrls] = {}
def __init__(
    self,
    candidates_from_page: CandidatesFromPage,
    path: str,
    project_name: str,
) -> None:
    """Store the page callback and attach the shared per-directory scanner."""
    self._candidates_from_page = candidates_from_page
    self._path = pathlib.Path(os.path.realpath(path))
    self._project_name = canonicalize_name(project_name)

    # Reuse any existing _FlatDirectoryToUrls for this path so a directory
    # shared between several sources is only ever scanned once.
    shared = self._paths_to_urls.get(path)
    if shared is None:
        shared = _FlatDirectoryToUrls(path=path)
        self._paths_to_urls[path] = shared
    self._path_to_urls = shared
@property
def link(self) -> Optional[Link]:
    # A flat directory has no single originating page, so there is no link.
    return None
def page_candidates(self) -> FoundCandidates:
    """Yield candidates parsed from the HTML files in the directory.

    The cached directory scan has already filtered entries down to HTML
    pages, so no per-call filesystem walk (the old ``iterdir`` loop left
    merged into this span was dead code and has been removed).
    """
    for url in self._path_to_urls.page_candidates:
        yield from self._candidates_from_page(Link(url))
def file_links(self) -> FoundLinks:
    """Yield links for archives in the directory matching this project.

    Uses the cached scan's project-name index, so only files whose names
    parsed as wheels/sdists for this project are returned (the old
    ``iterdir`` loop left merged into this span was dead code and has
    been removed).
    """
    for url in self._path_to_urls.project_name_to_urls[self._project_name]:
        yield Link(url)
@@ -170,6 +230,7 @@ def build_source(
page_validator: PageValidator,
expand_dir: bool,
cache_link_parsing: bool,
project_name: str,
) -> Tuple[Optional[str], Optional[LinkSource]]:
path: Optional[str] = None
url: Optional[str] = None
@@ -203,6 +264,7 @@ def build_source(
source = _FlatDirectorySource(
candidates_from_page=candidates_from_page,
path=path,
project_name=project_name,
)
else:
source = _IndexDirectorySource(

View File

@@ -125,9 +125,6 @@ def test_command_line_append_flags(
"Fetching project page and analyzing links: https://test.pypi.org"
in result.stdout
)
assert (
f"Skipping link: not a file: {data.find_links}" in result.stdout
), f"stdout: {result.stdout}"
@pytest.mark.network
@@ -151,9 +148,6 @@ def test_command_line_appends_correctly(
"Fetching project page and analyzing links: https://test.pypi.org"
in result.stdout
), result.stdout
assert (
f"Skipping link: not a file: {data.find_links}" in result.stdout
), f"stdout: {result.stdout}"
def test_config_file_override_stack(

View File

@@ -862,7 +862,7 @@ def test_collect_sources__file_expand_dir(data: TestData) -> None:
)
sources = collector.collect_sources(
# Shouldn't be used.
project_name=None, # type: ignore[arg-type]
project_name="",
candidates_from_page=None, # type: ignore[arg-type]
)
assert (
@@ -960,7 +960,7 @@ class TestLinkCollector:
session=link_collector.session,
)
def test_collect_sources(
def test_collect_page_sources(
self, caplog: pytest.LogCaptureFixture, data: TestData
) -> None:
caplog.set_level(logging.DEBUG)
@@ -993,9 +993,8 @@ class TestLinkCollector:
files = list(files_it)
pages = list(pages_it)
# Spot-check the returned sources.
assert len(files) > 20
check_links_include(files, names=["simple-1.0.tar.gz"])
# Only "twine" should return from collecting sources
assert len(files) == 1
assert [page.link for page in pages] == [Link("https://pypi.org/simple/twine/")]
# Check that index URLs are marked as *un*cacheable.
@@ -1010,6 +1009,52 @@ class TestLinkCollector:
("pip._internal.index.collector", logging.DEBUG, expected_message),
]
def test_collect_file_sources(
    self, caplog: pytest.LogCaptureFixture, data: TestData
) -> None:
    """collect_sources() should yield file links for the requested project
    from a --find-links directory and log the index locations searched."""
    caplog.set_level(logging.DEBUG)

    link_collector = make_test_link_collector(
        find_links=[data.find_links],
        # Include two copies of the URL to check that the second one
        # is skipped.
        index_urls=[PyPI.simple_url, PyPI.simple_url],
    )
    collected_sources = link_collector.collect_sources(
        "singlemodule",
        candidates_from_page=lambda link: [
            InstallationCandidate("singlemodule", "0.0.1", link)
        ],
    )

    # Flatten file links across every non-None source.
    files_it = itertools.chain.from_iterable(
        source.file_links()
        for sources in collected_sources
        for source in sources
        if source is not None
    )
    pages_it = itertools.chain.from_iterable(
        source.page_candidates()
        for sources in collected_sources
        for source in sources
        if source is not None
    )
    files = list(files_it)
    # Pages are exhausted only for their logging side effects here.
    _ = list(pages_it)

    # singlemodule should return files
    assert len(files) > 0
    check_links_include(files, names=["singlemodule-0.0.1.tar.gz"])

    expected_message = dedent(
        """\
        1 location(s) to search for versions of singlemodule:
        * https://pypi.org/simple/singlemodule/"""
    )
    assert caplog.record_tuples == [
        ("pip._internal.index.collector", logging.DEBUG, expected_message),
    ]
@pytest.mark.parametrize(
"find_links, no_index, suppress_no_index, expected",

View File

@@ -128,7 +128,10 @@ class TestWheel:
with pytest.raises(DistributionNotFound):
finder.find_requirement(req, True)
assert "Skipping link: invalid wheel filename:" in caplog.text
assert (
"Could not find a version that satisfies the requirement invalid"
" (from versions:" in caplog.text
)
def test_not_find_wheel_not_supported(self, data: TestData) -> None:
"""