Rewrite collect_links

This introduces a collect_sources() method that does the same job, but instead of flattening links eagerly, it returns each repository entry separately (and returns None for an invalid repository option), so subsequent code can better distinguish which link comes from which repository.
2023-12-13 21:30:23 +01:00 · 2021-03-23 22:34:20 +08:00 · 2021-03-23 22:34:20 +08:00 · a912c5530d
parent a0f604164e
commit a912c5530d
5 changed files with 389 additions and 218 deletions
--- a/docs/html/development/architecture/package-finding.rst
+++ b/docs/html/development/architecture/package-finding.rst
@ -101,7 +101,7 @@ One of ``PackageFinder``'s main top-level methods is
 1. Calls its ``find_all_candidates()`` method, which gathers all
   possible package links by reading and parsing the index URLs and
   locations provided by the user (the :ref:`LinkCollector
-   <link-collector-class>` class's ``collect_links()`` method), constructs a
+   <link-collector-class>` class's ``collect_sources()`` method), constructs a
   :ref:`LinkEvaluator <link-evaluator-class>` object to filter out some of
   those links, and then returns a list of ``InstallationCandidates`` (aka
   candidates for install). This corresponds to steps 1-3 of the
@ -131,7 +131,7 @@ responsible for collecting the raw list of "links" to package files
 The ``LinkCollector`` class takes into account the user's :ref:`--find-links
 <install_--find-links>`, :ref:`--extra-index-url <install_--extra-index-url>`,
 and related options when deciding which locations to collect links from. The
-class's main method is the ``collect_links()`` method. The :ref:`PackageFinder
+class's main method is the ``collect_sources()`` method. The :ref:`PackageFinder
 <package-finder-class>` class invokes this method as the first step of its
 ``find_all_candidates()`` method.
--- a/src/pip/_internal/index/collector.py
+++ b/src/pip/_internal/index/collector.py
@ -1,28 +1,27 @@
 """
-The main purpose of this module is to expose LinkCollector.collect_links().
+The main purpose of this module is to expose LinkCollector.collect_sources().
 """
 import cgi
 import collections
 import functools
 import html
 import itertools
 import logging
 import mimetypes
 import os
 import re
 import urllib.parse
 import urllib.request
 import xml.etree.ElementTree
 from collections import OrderedDict
 from optparse import Values
 from typing import (
    Callable,
    Iterable,
    List,
    MutableMapping,
    NamedTuple,
    Optional,
    Sequence,
    Tuple,
    Union,
 )
@ -37,8 +36,9 @@ from pip._internal.network.session import PipSession
 from pip._internal.network.utils import raise_for_status
 from pip._internal.utils.filetypes import is_archive_file
 from pip._internal.utils.misc import pairwise, redact_auth_from_url
-from pip._internal.utils.urls import path_to_url, url_to_path
+from pip._internal.vcs import vcs
-from pip._internal.vcs import is_url, vcs
+
 from .sources import CandidatesFromPage, LinkSource, build_source
 logger = logging.getLogger(__name__)
@ -449,107 +449,9 @@ def _get_html_page(link, session=None):
    return None
-def _remove_duplicate_links(links):
+class CollectedSources(NamedTuple):
-    # type: (Iterable[Link]) -> List[Link]
+    find_links: Sequence[Optional[LinkSource]]
-    """
+    index_urls: Sequence[Optional[LinkSource]]
    Return a list of links, with duplicates removed and ordering preserved.
    """
    # We preserve the ordering when removing duplicates because we can.
    return list(OrderedDict.fromkeys(links))
 def group_locations(locations, expand_dir=False):
    # type: (Sequence[str], bool) -> Tuple[List[str], List[str]]
    """
    Split locations into archive file URLs and page URLs.
    Local paths and ``file:`` URLs are inspected on disk. A directory is
    listed entry by entry only when ``expand_dir`` is true; otherwise it is
    kept as a URL if it was given as a ``file:`` URL, and ignored with a
    warning if it was given as a plain path. Anything else is kept only
    when ``is_url()`` accepts it.
    :param locations: Paths or URLs to classify.
    :param expand_dir: Whether directories are expanded into their entries.
    :return: A pair of lists (files, urls).
    """
    archive_urls = []  # type: List[str]
    page_urls = []  # type: List[str]
    def _classify(local_path):
        # type: (str) -> None
        # A path whose guessed MIME type is HTML counts as a page; every
        # other path counts as an archive file.
        as_url = path_to_url(local_path)
        guessed_type, _ = mimetypes.guess_type(as_url, strict=False)
        bucket = page_urls if guessed_type == 'text/html' else archive_urls
        bucket.append(as_url)
    for location in locations:
        exists_locally = os.path.exists(location)
        has_file_scheme = location.startswith('file:')
        if not (exists_locally or has_file_scheme):
            if is_url(location):
                # Only add url with clear scheme
                page_urls.append(location)
            else:
                logger.warning(
                    "Url '%s' is ignored. It is either a non-existing "
                    "path or lacks a specific scheme.", location,
                )
            continue
        local_path = location if exists_locally else url_to_path(location)
        if os.path.isdir(local_path):
            if expand_dir:
                resolved_dir = os.path.realpath(local_path)
                for entry in os.listdir(resolved_dir):
                    _classify(os.path.join(resolved_dir, entry))
            elif has_file_scheme:
                page_urls.append(location)
            else:
                logger.warning(
                    "Path '%s' is ignored: it is a directory.", local_path,
                )
        elif os.path.isfile(local_path):
            _classify(local_path)
        else:
            logger.warning(
                "Url '%s' is ignored: it is neither a file "
                "nor a directory.", location,
            )
    return archive_urls, page_urls
 class CollectedLinks:
    """
    Value object returned by LinkCollector.collect_links().
    It carries both URLs of project pages that contain package links and
    individual package Link objects gathered from other sources, kept in
    three separate attributes:
    * ``files``: links from the configured file locations,
    * ``find_links``: links from the configured find_links, and
    * ``project_urls``: urls to HTML project pages, as described by the
      PEP 503 simple repository API.
    """
    def __init__(self, files, find_links, project_urls):
        # type: (List[Link], List[Link], List[Link]) -> None
        """
        Store the three groups of links unchanged.
        :param files: Links from file locations.
        :param find_links: Links from find_links.
        :param project_urls: URLs to HTML project pages, as described by
            the PEP 503 simple repository API.
        """
        self.files, self.find_links, self.project_urls = (
            files, find_links, project_urls,
        )
 class LinkCollector:
@ -558,7 +460,7 @@ class LinkCollector:
    Responsible for collecting Link objects from all configured locations,
    making network requests as needed.
-    The class's main method is its collect_links() method.
+    The class's main method is its collect_sources() method.
    """
    def __init__(
@ -609,51 +511,46 @@ class LinkCollector:
        """
        return _get_html_page(location, session=self.session)
-    def collect_links(self, project_name):
+    def collect_sources(
-        # type: (str) -> CollectedLinks
+        self,
-        """Find all available links for the given project name.
+        project_name: str,
-
+        candidates_from_page: CandidatesFromPage,
-        :return: All the Link objects (unfiltered), as a CollectedLinks object.
+    ) -> CollectedSources:
-        """
+        # The OrderedDict calls deduplicate sources by URL.
-        search_scope = self.search_scope
+        index_url_sources = collections.OrderedDict(
-        index_locations = search_scope.get_index_urls_locations(project_name)
+            build_source(
-        index_file_loc, index_url_loc = group_locations(index_locations)
+                loc,
-        fl_file_loc, fl_url_loc = group_locations(
+                candidates_from_page=candidates_from_page,
-            self.find_links, expand_dir=True,
+                page_validator=self.session.is_secure_origin,
-        )
+                expand_dir=False,
-
+                cache_link_parsing=False,
        file_links = [
            Link(url) for url in itertools.chain(index_file_loc, fl_file_loc)
        ]
        # We trust every directly linked archive in find_links
        find_link_links = [Link(url, '-f') for url in self.find_links]
        # We trust every url that the user has given us whether it was given
        # via --index-url or --find-links.
        # We want to filter out anything that does not have a secure origin.
        url_locations = [
            link for link in itertools.chain(
                # Mark PyPI indices as "cache_link_parsing == False" -- this
                # will avoid caching the result of parsing the page for links.
                (Link(url, cache_link_parsing=False) for url in index_url_loc),
                (Link(url) for url in fl_url_loc),
            )
-            if self.session.is_secure_origin(link)
+            for loc in self.search_scope.get_index_urls_locations(project_name)
-        ]
+        ).values()
        find_links_sources = collections.OrderedDict(
            build_source(
                loc,
                candidates_from_page=candidates_from_page,
                page_validator=self.session.is_secure_origin,
                expand_dir=True,
                cache_link_parsing=True,
            )
            for loc in self.find_links
        ).values()
-        url_locations = _remove_duplicate_links(url_locations)
+        if logger.isEnabledFor(logging.DEBUG):
-        lines = [
+            lines = [
-            '{} location(s) to search for versions of {}:'.format(
+                f"* {s.link}"
-                len(url_locations), project_name,
+                for s in itertools.chain(find_links_sources, index_url_sources)
-            ),
+                if s is not None and s.link is not None
-        ]
+            ]
-        for link in url_locations:
+            lines = [
-            lines.append(f'* {link}')
+                f"{len(lines)} location(s) to search "
-        logger.debug('\n'.join(lines))
+                f"for versions of {project_name}:"
            ] + lines
            logger.debug("\n".join(lines))
-        return CollectedLinks(
+        return CollectedSources(
-            files=file_links,
+            find_links=list(find_links_sources),
-            find_links=find_link_links,
+            index_urls=list(index_url_sources),
            project_urls=url_locations,
        )
--- a/src/pip/_internal/index/package_finder.py
+++ b/src/pip/_internal/index/package_finder.py
@ -4,6 +4,7 @@
 # mypy: strict-optional=False
 import functools
 import itertools
 import logging
 import re
 from typing import FrozenSet, Iterable, List, Optional, Set, Tuple, Union
@ -804,38 +805,41 @@ class PackageFinder:
        See LinkEvaluator.evaluate_link() for details on which files
        are accepted.
        """
        collected_links = self._link_collector.collect_links(project_name)
        link_evaluator = self.make_link_evaluator(project_name)
-        find_links_versions = self.evaluate_links(
+        collected_sources = self._link_collector.collect_sources(
-            link_evaluator,
+            project_name=project_name,
-            links=collected_links.find_links,
+            candidates_from_page=functools.partial(
                self.process_project_url,
                link_evaluator=link_evaluator,
            ),
        )
-        page_versions = []
+        page_candidates_it = itertools.chain.from_iterable(
-        for project_url in collected_links.project_urls:
+            source.page_candidates()
-            package_links = self.process_project_url(
+            for sources in collected_sources
-                project_url, link_evaluator=link_evaluator,
+            for source in sources
-            )
+            if source is not None
            page_versions.extend(package_links)
        file_versions = self.evaluate_links(
            link_evaluator,
            links=collected_links.files,
        )
-        if file_versions:
+        page_candidates = list(page_candidates_it)
-            file_versions.sort(reverse=True)
+
-            logger.debug(
+        file_links_it = itertools.chain.from_iterable(
-                'Local files found: %s',
+            source.file_links()
-                ', '.join([
+            for sources in collected_sources
-                    url_to_path(candidate.link.url)
+            for source in sources
-                    for candidate in file_versions
+            if source is not None
-                ])
+        )
-            )
+        file_candidates = self.evaluate_links(
            link_evaluator,
            sorted(file_links_it, reverse=True),
        )
        if logger.isEnabledFor(logging.DEBUG) and file_candidates:
            paths = [url_to_path(c.link.url) for c in file_candidates]
            logger.debug("Local files found: %s", ", ".join(paths))
        # This is an intentional priority ordering
-        return file_versions + find_links_versions + page_versions
+        return file_candidates + page_candidates
    def make_candidate_evaluator(
        self,
--- a/src/pip/_internal/index/sources.py
+++ b/src/pip/_internal/index/sources.py
@ -0,0 +1,224 @@
 import logging
 import mimetypes
 import os
 import pathlib
 from typing import Callable, Iterable, Optional, Tuple
 from pip._internal.models.candidate import InstallationCandidate
 from pip._internal.models.link import Link
 from pip._internal.utils.urls import path_to_url, url_to_path
 from pip._internal.vcs import is_url
 # Module-level logger; build_source() uses it to warn about ignored locations.
 logger = logging.getLogger(__name__)
 # What LinkSource.page_candidates() produces.
 FoundCandidates = Iterable[InstallationCandidate]
 # What LinkSource.file_links() produces.
 FoundLinks = Iterable[Link]
 # Callback that turns the link of a page into the candidates found on it.
 CandidatesFromPage = Callable[[Link], Iterable[InstallationCandidate]]
 # Predicate on a link; _RemoteFileSource yields no page candidates when it
 # returns False for the source's link.
 PageValidator = Callable[[Link], bool]
 class LinkSource:
    """Interface shared by every kind of link source in this module.
    The members defined here only raise ``NotImplementedError``; each
    concrete source overrides all three.
    """
    @property
    def link(self) -> Optional[Link]:
        """The link this source is based on, or ``None`` if it has none."""
        raise NotImplementedError
    def page_candidates(self) -> FoundCandidates:
        """Candidates obtained by parsing an HTML page that lists archives."""
        raise NotImplementedError
    def file_links(self) -> FoundLinks:
        """Links to archives that were specified directly."""
        raise NotImplementedError
 def _is_html_file(file_url: str) -> bool:
    return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"
 class _FlatDirectorySource(LinkSource):
    """Link source for a local directory given to ``--find-links``.
    The directory is listed each time one of the methods is iterated:
    * ``page_candidates``: candidates from each HTML file in the directory.
    * ``file_links``: one link per non-HTML entry in the directory.
    """
    def __init__(
        self,
        candidates_from_page: CandidatesFromPage,
        path: str,
    ) -> None:
        # The real path is resolved once, at construction time.
        self._path = pathlib.Path(os.path.realpath(path))
        self._candidates_from_page = candidates_from_page
    @property
    def link(self) -> Optional[Link]:
        """Always ``None``: a flat directory has no single link."""
        return None
    def _entry_urls(self):
        """Yield ``(url, is_html)`` for every entry of the directory."""
        for entry in self._path.iterdir():
            entry_url = path_to_url(str(entry))
            yield entry_url, _is_html_file(entry_url)
    def page_candidates(self) -> FoundCandidates:
        """Yield the candidates of every HTML file in the directory."""
        for entry_url, is_html in self._entry_urls():
            if is_html:
                yield from self._candidates_from_page(Link(entry_url))
    def file_links(self) -> FoundLinks:
        """Yield a link for every non-HTML entry in the directory."""
        for entry_url, is_html in self._entry_urls():
            if not is_html:
                yield Link(entry_url)
 class _LocalFileSource(LinkSource):
    """Link source for a single local file.
    ``build_source()`` creates it for a location (a path, or a ``file:`` URL)
    that resolves to an existing file. Exactly one method yields anything:
    * ``page_candidates``: candidates from the file, when it is an HTML file.
    * ``file_links``: the file's own link, when it is not an HTML file.
    """
    def __init__(
        self,
        candidates_from_page: CandidatesFromPage,
        link: Link,
    ) -> None:
        self._link = link
        self._candidates_from_page = candidates_from_page
    @property
    def link(self) -> Optional[Link]:
        """The link of the local file."""
        return self._link
    def page_candidates(self) -> FoundCandidates:
        """Yield candidates parsed from the file, if it is an HTML file."""
        if _is_html_file(self._link.url):
            yield from self._candidates_from_page(self._link)
    def file_links(self) -> FoundLinks:
        """Yield the file's link, unless it is an HTML file."""
        if not _is_html_file(self._link.url):
            yield self._link
 class _RemoteFileSource(LinkSource):
    """Link source for a URL that does not map to a local path.
    ``build_source()`` creates it for a location that ``is_url()`` accepts
    and that is neither an existing path nor a ``file:`` URL.
    * ``page_candidates``: candidates from the page, but only when the page
      validator accepts the link.
    * ``file_links``: the link itself, always.
    """
    def __init__(
        self,
        candidates_from_page: CandidatesFromPage,
        page_validator: PageValidator,
        link: Link,
    ) -> None:
        self._link = link
        self._page_validator = page_validator
        self._candidates_from_page = candidates_from_page
    @property
    def link(self) -> Optional[Link]:
        """The link of the remote location."""
        return self._link
    def page_candidates(self) -> FoundCandidates:
        """Yield candidates from the page if the validator accepts the link."""
        if self._page_validator(self._link):
            yield from self._candidates_from_page(self._link)
    def file_links(self) -> FoundLinks:
        """Yield the link itself."""
        yield self._link
 class _IndexDirectorySource(LinkSource):
    """Link source for a local directory given to ``--[extra-]index-url``.
    The directory is handled like a remote page: its link goes straight to
    ``candidates_from_page``, which is expected to deal with directories by
    appending ``index.html`` to the link. No file links are produced.
    """
    def __init__(
        self,
        candidates_from_page: CandidatesFromPage,
        link: Link,
    ) -> None:
        self._link = link
        self._candidates_from_page = candidates_from_page
    @property
    def link(self) -> Optional[Link]:
        """The link of the index directory."""
        return self._link
    def page_candidates(self) -> FoundCandidates:
        """Yield the candidates found through the directory's link."""
        for candidate in self._candidates_from_page(self._link):
            yield candidate
    def file_links(self) -> FoundLinks:
        """Return an empty tuple: an index directory exposes no archives."""
        return ()
 def build_source(
    location: str,
    *,
    candidates_from_page: CandidatesFromPage,
    page_validator: PageValidator,
    expand_dir: bool,
    cache_link_parsing: bool,
 ) -> Tuple[Optional[str], Optional[LinkSource]]:
    """Build the link source that matches ``location``.
    :param location: A local path, a ``file:`` URL, or another URL.
    :param candidates_from_page: Callback handed to the created source.
    :param page_validator: Predicate handed to a remote source.
    :param expand_dir: For a directory, choose a flat directory source
        (true) or an index directory source (false).
    :param cache_link_parsing: Passed to every ``Link`` created here.
    :return: A ``(url, source)`` pair. Both are ``None`` when the location
        is neither an existing path nor a recognised URL; only the source
        is ``None`` when the local path is neither a file nor a directory.
        A warning is logged in both cases.
    """
    url: Optional[str]
    path: Optional[str]
    if os.path.exists(location):  # Is a local path.
        url, path = path_to_url(location), location
    elif location.startswith("file:"):  # A file: URL.
        url, path = location, url_to_path(location)
    elif is_url(location):
        url, path = location, None
    else:
        msg = (
            "Location '%s' is ignored: "
            "it is either a non-existing path or lacks a specific scheme."
        )
        logger.warning(msg, location)
        return (None, None)
    source: Optional[LinkSource]
    if path is None:
        # No local path behind the URL: treat it as remote.
        source = _RemoteFileSource(
            candidates_from_page=candidates_from_page,
            page_validator=page_validator,
            link=Link(url, cache_link_parsing=cache_link_parsing),
        )
    elif os.path.isdir(path):
        if expand_dir:
            source = _FlatDirectorySource(
                candidates_from_page=candidates_from_page,
                path=path,
            )
        else:
            source = _IndexDirectorySource(
                candidates_from_page=candidates_from_page,
                link=Link(url, cache_link_parsing=cache_link_parsing),
            )
    elif os.path.isfile(path):
        source = _LocalFileSource(
            candidates_from_page=candidates_from_page,
            link=Link(url, cache_link_parsing=cache_link_parsing),
        )
    else:
        logger.warning(
            "Location '%s' is ignored: it is neither a file nor a directory.",
            location,
        )
        source = None
    return (url, source)
--- a/tests/unit/test_collector.py
+++ b/tests/unit/test_collector.py
@ -1,3 +1,4 @@
 import itertools
 import logging
 import os.path
 import re
@ -23,10 +24,9 @@ from pip._internal.index.collector import (
    _make_html_page,
    _NotHTML,
    _NotHTTP,
    _remove_duplicate_links,
    group_locations,
    parse_links,
 )
 from pip._internal.index.sources import _FlatDirectorySource, _IndexDirectorySource
 from pip._internal.models.index import PyPI
 from pip._internal.models.link import Link
 from pip._internal.network.session import PipSession
@ -587,46 +587,79 @@ def test_get_html_page_directory_append_index(tmpdir):
        assert actual.url == expected_url
-def test_remove_duplicate_links():
+def test_collect_sources__file_expand_dir(data):
    links = [
        # We choose Links that will test that ordering is preserved.
        Link('https://example.com/2'),
        Link('https://example.com/1'),
        Link('https://example.com/2'),
    ]
    actual = _remove_duplicate_links(links)
    assert actual == [
        Link('https://example.com/2'),
        Link('https://example.com/1'),
    ]
 def test_group_locations__file_expand_dir(data):
    """
-    Test that a file:// dir gets listdir run with expand_dir
+    Test that a file:// dir from --find-links becomes _FlatDirectorySource
    """
-    files, urls = group_locations([data.find_links], expand_dir=True)
+    collector = LinkCollector.create(
-    assert files and not urls, (
+        session=pretend.stub(is_secure_origin=None),  # Shouldn't be used.
-        "files and not urls should have been found "
+        options=pretend.stub(
            index_url="ignored-by-no-index",
            extra_index_urls=[],
            no_index=True,
            find_links=[data.find_links],
        ),
    )
    sources = collector.collect_sources(
        project_name=None,  # Shouldn't be used.
        candidates_from_page=None,  # Shouldn't be used.
    )
    assert (
        not sources.index_urls
        and len(sources.find_links) == 1
        and isinstance(sources.find_links[0], _FlatDirectorySource)
    ), (
        "Directory source should have been found "
        f"at find-links url: {data.find_links}"
    )
-def test_group_locations__file_not_find_link(data):
+def test_collect_sources__file_not_find_link(data):
    """
-    Test that a file:// url dir that's not a find-link, doesn't get a listdir
+    Test that a file:// dir from --index-url doesn't become _FlatDirectorySource
    run
    """
-    files, urls = group_locations([data.index_url("empty_with_pkg")])
+    collector = LinkCollector.create(
-    assert urls and not files, "urls, but not files should have been found"
+        session=pretend.stub(is_secure_origin=None),  # Shouldn't be used.
        options=pretend.stub(
            index_url=data.index_url("empty_with_pkg"),
            extra_index_urls=[],
            no_index=False,
            find_links=[],
        ),
    )
    sources = collector.collect_sources(
        project_name="",
        candidates_from_page=None,  # Shouldn't be used.
    )
    assert (
        not sources.find_links
        and len(sources.index_urls) == 1
        and isinstance(sources.index_urls[0], _IndexDirectorySource)
    ), "Directory specified as index should be treated as a page"
-def test_group_locations__non_existing_path():
+def test_collect_sources__non_existing_path():
    """
    Test that a non-existing path is ignored.
    """
-    files, urls = group_locations([os.path.join('this', 'doesnt', 'exist')])
+    collector = LinkCollector.create(
-    assert not urls and not files, "nothing should have been found"
+        session=pretend.stub(is_secure_origin=None),  # Shouldn't be used.
        options=pretend.stub(
            index_url="ignored-by-no-index",
            extra_index_urls=[],
            no_index=True,
            find_links=[os.path.join("this", "doesnt", "exist")],
        ),
    )
    sources = collector.collect_sources(
        project_name=None,  # Shouldn't be used.
        candidates_from_page=None,  # Shouldn't be used.
    )
    assert (
        not sources.index_urls
        and sources.find_links == [None]
    ), "Nothing should have been found"
 def check_links_include(links, names):
@ -664,7 +697,7 @@ class TestLinkCollector:
            url, session=link_collector.session,
        )
-    def test_collect_links(self, caplog, data):
+    def test_collect_sources(self, caplog, data):
        caplog.set_level(logging.DEBUG)
        link_collector = make_test_link_collector(
@ -673,20 +706,33 @@ class TestLinkCollector:
            # is skipped.
            index_urls=[PyPI.simple_url, PyPI.simple_url],
        )
-        actual = link_collector.collect_links('twine')
+        collected_sources = link_collector.collect_sources(
            "twine",
            candidates_from_page=lambda link: [link],
        )
-        # Spot-check the CollectedLinks return value.
+        files_it = itertools.chain.from_iterable(
-        assert len(actual.files) > 20
+            source.file_links()
-        check_links_include(actual.files, names=['simple-1.0.tar.gz'])
+            for sources in collected_sources
            for source in sources
            if source is not None
        )
        pages_it = itertools.chain.from_iterable(
            source.page_candidates()
            for sources in collected_sources
            for source in sources
            if source is not None
        )
        files = list(files_it)
        pages = list(pages_it)
-        assert len(actual.find_links) == 1
+        # Spot-check the returned sources.
-        check_links_include(actual.find_links, names=['packages'])
+        assert len(files) > 20
-        # Check that find-links URLs are marked as cacheable.
+        check_links_include(files, names=["simple-1.0.tar.gz"])
        assert actual.find_links[0].cache_link_parsing
-        assert actual.project_urls == [Link('https://pypi.org/simple/twine/')]
+        assert pages == [Link('https://pypi.org/simple/twine/')]
        # Check that index URLs are marked as *un*cacheable.
-        assert not actual.project_urls[0].cache_link_parsing
+        assert not pages[0].cache_link_parsing
        expected_message = dedent("""\
        1 location(s) to search for versions of twine: