Rewrite collect_links

This introduces a collect_sources() method to do the same thing, but
instead of flattening links eagerly, it returns each repository entry
separately (and None for an invalid repository option), so subsequent
code can better distinguish which link comes from which repository.
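
In other words, where collect_links() returned flattened lists of links,
collect_sources() returns one entry per configured repository, using None
for an invalid option, so a link can always be traced back to its
repository. A toy before/after (plain strings stand in for the Link and
LinkSource objects in the diff below):

    # Before: origins are flattened away.
    links = ["pkg-1.0.tar.gz", "pkg-2.0.tar.gz"]

    # After: one entry per repository; None marks an invalid option.
    sources = {
        "./wheelhouse": ["pkg-1.0.tar.gz"],
        "https://pypi.org/simple/pkg/": ["pkg-2.0.tar.gz"],
        "no-such-location": None,
    }
    for repo, found in sources.items():
        if found is None:
            print(f"skipping invalid location: {repo}")
        else:
            print(f"{repo} -> {found}")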
Tzu-ping Chung 2021-03-23 22:34:20 +08:00
parent a0f604164e
commit a912c5530d
5 changed files with 389 additions and 218 deletions

docs/html/development/architecture/package-finding.rst

@@ -101,7 +101,7 @@ One of ``PackageFinder``'s main top-level methods is
1. Calls its ``find_all_candidates()`` method, which gathers all
   possible package links by reading and parsing the index URLs and
   locations provided by the user (the :ref:`LinkCollector
-  <link-collector-class>` class's ``collect_links()`` method), constructs a
+  <link-collector-class>` class's ``collect_sources()`` method), constructs a
   :ref:`LinkEvaluator <link-evaluator-class>` object to filter out some of
   those links, and then returns a list of ``InstallationCandidates`` (aka
   candidates for install). This corresponds to steps 1-3 of the
@@ -131,7 +131,7 @@ responsible for collecting the raw list of "links" to package files
The ``LinkCollector`` class takes into account the user's :ref:`--find-links
<install_--find-links>`, :ref:`--extra-index-url <install_--extra-index-url>`,
and related options when deciding which locations to collect links from. The
-class's main method is the ``collect_links()`` method. The :ref:`PackageFinder
+class's main method is the ``collect_sources()`` method. The :ref:`PackageFinder
<package-finder-class>` class invokes this method as the first step of its
``find_all_candidates()`` method.
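
As a toy illustration of those three steps (stand-in functions, not pip's
API; the real implementation is in the package_finder.py hunks below):

    def collect_sources(project):  # step 1: one entry per repository
        return [f"https://pypi.org/simple/{project}/", None]

    def evaluate(link):  # step 2: LinkEvaluator stand-in
        return link is not None

    def find_all_candidates(project):  # step 3: candidates for install
        return [link for link in collect_sources(project) if evaluate(link)]

    print(find_all_candidates("twine"))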

src/pip/_internal/index/collector.py

@@ -1,28 +1,27 @@
"""
-The main purpose of this module is to expose LinkCollector.collect_links().
+The main purpose of this module is to expose LinkCollector.collect_sources().
"""

import cgi
+import collections
import functools
import html
import itertools
import logging
-import mimetypes
import os
import re
import urllib.parse
import urllib.request
import xml.etree.ElementTree
-from collections import OrderedDict
from optparse import Values
from typing import (
    Callable,
    Iterable,
    List,
    MutableMapping,
+    NamedTuple,
    Optional,
    Sequence,
    Tuple,
    Union,
)
@@ -37,8 +36,9 @@ from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.utils.urls import path_to_url, url_to_path
-from pip._internal.vcs import is_url, vcs
+from pip._internal.vcs import vcs
+
+from .sources import CandidatesFromPage, LinkSource, build_source

logger = logging.getLogger(__name__)
@@ -449,107 +449,9 @@ def _get_html_page(link, session=None):
    return None


-def _remove_duplicate_links(links):
-    # type: (Iterable[Link]) -> List[Link]
-    """
-    Return a list of links, with duplicates removed and ordering preserved.
-    """
-    # We preserve the ordering when removing duplicates because we can.
-    return list(OrderedDict.fromkeys(links))
-
-
-def group_locations(locations, expand_dir=False):
-    # type: (Sequence[str], bool) -> Tuple[List[str], List[str]]
-    """
-    Divide a list of locations into two groups: "files" (archives) and "urls."
-
-    :return: A pair of lists (files, urls).
-    """
-    files = []
-    urls = []
-
-    # puts the url for the given file path into the appropriate list
-    def sort_path(path):
-        # type: (str) -> None
-        url = path_to_url(path)
-        if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
-            urls.append(url)
-        else:
-            files.append(url)
-
-    for url in locations:
-        is_local_path = os.path.exists(url)
-        is_file_url = url.startswith('file:')
-
-        if is_local_path or is_file_url:
-            if is_local_path:
-                path = url
-            else:
-                path = url_to_path(url)
-            if os.path.isdir(path):
-                if expand_dir:
-                    path = os.path.realpath(path)
-                    for item in os.listdir(path):
-                        sort_path(os.path.join(path, item))
-                elif is_file_url:
-                    urls.append(url)
-                else:
-                    logger.warning(
-                        "Path '%s' is ignored: it is a directory.", path,
-                    )
-            elif os.path.isfile(path):
-                sort_path(path)
-            else:
-                logger.warning(
-                    "Url '%s' is ignored: it is neither a file "
-                    "nor a directory.", url,
-                )
-        elif is_url(url):
-            # Only add url with clear scheme
-            urls.append(url)
-        else:
-            logger.warning(
-                "Url '%s' is ignored. It is either a non-existing "
-                "path or lacks a specific scheme.", url,
-            )
-
-    return files, urls
-
-
-class CollectedLinks:
-
-    """
-    Encapsulates the return value of a call to LinkCollector.collect_links().
-
-    The return value includes both URLs to project pages containing package
-    links, as well as individual package Link objects collected from other
-    sources.
-
-    This info is stored separately as:
-
-    (1) links from the configured file locations,
-    (2) links from the configured find_links, and
-    (3) urls to HTML project pages, as described by the PEP 503 simple
-        repository API.
-    """
-
-    def __init__(
-        self,
-        files,         # type: List[Link]
-        find_links,    # type: List[Link]
-        project_urls,  # type: List[Link]
-    ):
-        # type: (...) -> None
-        """
-        :param files: Links from file locations.
-        :param find_links: Links from find_links.
-        :param project_urls: URLs to HTML project pages, as described by
-            the PEP 503 simple repository API.
-        """
-        self.files = files
-        self.find_links = find_links
-        self.project_urls = project_urls
+class CollectedSources(NamedTuple):
+    find_links: Sequence[Optional[LinkSource]]
+    index_urls: Sequence[Optional[LinkSource]]


class LinkCollector:
@@ -558,7 +460,7 @@ class LinkCollector:
    Responsible for collecting Link objects from all configured locations,
    making network requests as needed.

-    The class's main method is its collect_links() method.
+    The class's main method is its collect_sources() method.
    """

    def __init__(
@@ -609,51 +511,46 @@ class LinkCollector:
        """
        return _get_html_page(location, session=self.session)

-    def collect_links(self, project_name):
-        # type: (str) -> CollectedLinks
-        """Find all available links for the given project name.
-
-        :return: All the Link objects (unfiltered), as a CollectedLinks object.
-        """
-        search_scope = self.search_scope
-        index_locations = search_scope.get_index_urls_locations(project_name)
-        index_file_loc, index_url_loc = group_locations(index_locations)
-        fl_file_loc, fl_url_loc = group_locations(
-            self.find_links, expand_dir=True,
-        )
-
-        file_links = [
-            Link(url) for url in itertools.chain(index_file_loc, fl_file_loc)
-        ]
-
-        # We trust every directly linked archive in find_links
-        find_link_links = [Link(url, '-f') for url in self.find_links]
-
-        # We trust every url that the user has given us whether it was given
-        # via --index-url or --find-links.
-        # We want to filter out anything that does not have a secure origin.
-        url_locations = [
-            link for link in itertools.chain(
-                # Mark PyPI indices as "cache_link_parsing == False" -- this
-                # will avoid caching the result of parsing the page for links.
-                (Link(url, cache_link_parsing=False) for url in index_url_loc),
-                (Link(url) for url in fl_url_loc),
+    def collect_sources(
+        self,
+        project_name: str,
+        candidates_from_page: CandidatesFromPage,
+    ) -> CollectedSources:
+        # The OrderedDict calls deduplicate sources by URL.
+        index_url_sources = collections.OrderedDict(
+            build_source(
+                loc,
+                candidates_from_page=candidates_from_page,
+                page_validator=self.session.is_secure_origin,
+                expand_dir=False,
+                cache_link_parsing=False,
            )
-            if self.session.is_secure_origin(link)
-        ]
+            for loc in self.search_scope.get_index_urls_locations(project_name)
+        ).values()
+        find_links_sources = collections.OrderedDict(
+            build_source(
+                loc,
+                candidates_from_page=candidates_from_page,
+                page_validator=self.session.is_secure_origin,
+                expand_dir=True,
+                cache_link_parsing=True,
+            )
+            for loc in self.find_links
+        ).values()

-        url_locations = _remove_duplicate_links(url_locations)
-        lines = [
-            '{} location(s) to search for versions of {}:'.format(
-                len(url_locations), project_name,
-            ),
-        ]
-        for link in url_locations:
-            lines.append(f'* {link}')
-        logger.debug('\n'.join(lines))
+        if logger.isEnabledFor(logging.DEBUG):
+            lines = [
+                f"* {s.link}"
+                for s in itertools.chain(find_links_sources, index_url_sources)
+                if s is not None and s.link is not None
+            ]
+            lines = [
+                f"{len(lines)} location(s) to search "
+                f"for versions of {project_name}:"
+            ] + lines
+            logger.debug("\n".join(lines))

-        return CollectedLinks(
-            files=file_links,
-            find_links=find_link_links,
-            project_urls=url_locations,
+        return CollectedSources(
+            find_links=list(find_links_sources),
+            index_urls=list(index_url_sources),
        )
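
The OrderedDict trick above works because build_source() returns
(url, source) pairs: building a dict from them collapses duplicate URLs
(a later duplicate overwrites the value but keeps the first position),
and .values() yields the surviving sources in order. A standalone
illustration with toy data:

    import collections

    pairs = [
        ("https://pypi.org/simple/", "source-a"),
        ("https://example.com/simple/", "source-b"),
        ("https://pypi.org/simple/", "source-c"),  # duplicate URL
    ]
    deduped = list(collections.OrderedDict(pairs).values())
    print(deduped)  # ['source-c', 'source-b'] -- one source per URL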

src/pip/_internal/index/package_finder.py

@@ -4,6 +4,7 @@
# mypy: strict-optional=False

+import functools
import itertools
import logging
import re
from typing import FrozenSet, Iterable, List, Optional, Set, Tuple, Union
@@ -804,38 +805,41 @@ class PackageFinder:
        See LinkEvaluator.evaluate_link() for details on which files
        are accepted.
        """
-        collected_links = self._link_collector.collect_links(project_name)
        link_evaluator = self.make_link_evaluator(project_name)
-        find_links_versions = self.evaluate_links(
-            link_evaluator,
-            links=collected_links.find_links,
+        collected_sources = self._link_collector.collect_sources(
+            project_name=project_name,
+            candidates_from_page=functools.partial(
+                self.process_project_url,
+                link_evaluator=link_evaluator,
+            ),
        )

-        page_versions = []
-        for project_url in collected_links.project_urls:
-            package_links = self.process_project_url(
-                project_url, link_evaluator=link_evaluator,
-            )
-            page_versions.extend(package_links)
-
-        file_versions = self.evaluate_links(
-            link_evaluator,
-            links=collected_links.files,
+        page_candidates_it = itertools.chain.from_iterable(
+            source.page_candidates()
+            for sources in collected_sources
+            for source in sources
+            if source is not None
        )
-        if file_versions:
-            file_versions.sort(reverse=True)
-            logger.debug(
-                'Local files found: %s',
-                ', '.join([
-                    url_to_path(candidate.link.url)
-                    for candidate in file_versions
-                ])
-            )
+        page_candidates = list(page_candidates_it)
+
+        file_links_it = itertools.chain.from_iterable(
+            source.file_links()
+            for sources in collected_sources
+            for source in sources
+            if source is not None
+        )
+        file_candidates = self.evaluate_links(
+            link_evaluator,
+            sorted(file_links_it, reverse=True),
+        )
+        if logger.isEnabledFor(logging.DEBUG) and file_candidates:
+            paths = [url_to_path(c.link.url) for c in file_candidates]
+            logger.debug("Local files found: %s", ", ".join(paths))

        # This is an intentional priority ordering
-        return file_versions + find_links_versions + page_versions
+        return file_candidates + page_candidates

    def make_candidate_evaluator(
        self,

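Because CollectedSources is a NamedTuple of two sequences, iterating over
it yields the sequences themselves; that is why the comprehensions above
need two for clauses plus a None guard. A runnable toy equivalent
(names are illustrative):

    from typing import NamedTuple, Optional, Sequence

    class Collected(NamedTuple):  # stand-in for CollectedSources
        find_links: Sequence[Optional[str]]
        index_urls: Sequence[Optional[str]]

    collected = Collected(find_links=["fl", None], index_urls=["idx"])
    flat = [
        source
        for sources in collected  # the two fields, in order
        for source in sources     # each field's entries
        if source is not None     # skip invalid locations
    ]
    print(flat)  # ['fl', 'idx']
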
src/pip/_internal/index/sources.py

@@ -0,0 +1,224 @@
+import logging
+import mimetypes
+import os
+import pathlib
+from typing import Callable, Iterable, Optional, Tuple
+
+from pip._internal.models.candidate import InstallationCandidate
+from pip._internal.models.link import Link
+from pip._internal.utils.urls import path_to_url, url_to_path
+from pip._internal.vcs import is_url
+
+logger = logging.getLogger(__name__)
+
+FoundCandidates = Iterable[InstallationCandidate]
+FoundLinks = Iterable[Link]
+CandidatesFromPage = Callable[[Link], Iterable[InstallationCandidate]]
+PageValidator = Callable[[Link], bool]
+
+
+class LinkSource:
+    @property
+    def link(self) -> Optional[Link]:
+        """Returns the underlying link, if there's one."""
+        raise NotImplementedError()
+
+    def page_candidates(self) -> FoundCandidates:
+        """Candidates found by parsing an archive listing HTML file."""
+        raise NotImplementedError()
+
+    def file_links(self) -> FoundLinks:
+        """Links found by specifying archives directly."""
+        raise NotImplementedError()
+
+
+def _is_html_file(file_url: str) -> bool:
+    return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"
+
+
+class _FlatDirectorySource(LinkSource):
+    """Link source specified by ``--find-links=<path-to-dir>``.
+
+    This looks at the content of the directory, and returns:
+
+    * ``page_candidates``: Links listed on each HTML file in the directory.
+    * ``file_links``: Archives in the directory.
+    """
+
+    def __init__(
+        self,
+        candidates_from_page: CandidatesFromPage,
+        path: str,
+    ) -> None:
+        self._candidates_from_page = candidates_from_page
+        self._path = pathlib.Path(os.path.realpath(path))
+
+    @property
+    def link(self) -> Optional[Link]:
+        return None
+
+    def page_candidates(self) -> FoundCandidates:
+        for path in self._path.iterdir():
+            url = path_to_url(str(path))
+            if not _is_html_file(url):
+                continue
+            yield from self._candidates_from_page(Link(url))
+
+    def file_links(self) -> FoundLinks:
+        for path in self._path.iterdir():
+            url = path_to_url(str(path))
+            if _is_html_file(url):
+                continue
+            yield Link(url)
+
+
+class _LocalFileSource(LinkSource):
+    """``--find-links=<path-or-url>`` or ``--[extra-]index-url=<path-or-url>``.
+
+    If a URL is supplied, it must be a ``file:`` URL. If a path is supplied to
+    the option, it is converted to a URL first. This returns:
+
+    * ``page_candidates``: Links listed on an HTML file.
+    * ``file_links``: The non-HTML file.
+    """
+
+    def __init__(
+        self,
+        candidates_from_page: CandidatesFromPage,
+        link: Link,
+    ) -> None:
+        self._candidates_from_page = candidates_from_page
+        self._link = link
+
+    @property
+    def link(self) -> Optional[Link]:
+        return self._link
+
+    def page_candidates(self) -> FoundCandidates:
+        if not _is_html_file(self._link.url):
+            return
+        yield from self._candidates_from_page(self._link)
+
+    def file_links(self) -> FoundLinks:
+        if _is_html_file(self._link.url):
+            return
+        yield self._link
+
+
+class _RemoteFileSource(LinkSource):
+    """``--find-links=<url>`` or ``--[extra-]index-url=<url>``.
+
+    This returns:
+
+    * ``page_candidates``: Links listed on an HTML file.
+    * ``file_links``: The non-HTML file.
+    """
+
+    def __init__(
+        self,
+        candidates_from_page: CandidatesFromPage,
+        page_validator: PageValidator,
+        link: Link,
+    ) -> None:
+        self._candidates_from_page = candidates_from_page
+        self._page_validator = page_validator
+        self._link = link
+
+    @property
+    def link(self) -> Optional[Link]:
+        return self._link
+
+    def page_candidates(self) -> FoundCandidates:
+        if not self._page_validator(self._link):
+            return
+        yield from self._candidates_from_page(self._link)
+
+    def file_links(self) -> FoundLinks:
+        yield self._link
+
+
+class _IndexDirectorySource(LinkSource):
+    """``--[extra-]index-url=<path-to-directory>``.
+
+    This is treated like a remote URL; ``candidates_from_page`` contains logic
+    for this by appending ``index.html`` to the link.
+    """
+
+    def __init__(
+        self,
+        candidates_from_page: CandidatesFromPage,
+        link: Link,
+    ) -> None:
+        self._candidates_from_page = candidates_from_page
+        self._link = link
+
+    @property
+    def link(self) -> Optional[Link]:
+        return self._link
+
+    def page_candidates(self) -> FoundCandidates:
+        yield from self._candidates_from_page(self._link)
+
+    def file_links(self) -> FoundLinks:
+        return ()
+
+
+def build_source(
+    location: str,
+    *,
+    candidates_from_page: CandidatesFromPage,
+    page_validator: PageValidator,
+    expand_dir: bool,
+    cache_link_parsing: bool,
+) -> Tuple[Optional[str], Optional[LinkSource]]:
+
+    path: Optional[str] = None
+    url: Optional[str] = None
+    if os.path.exists(location):  # Is a local path.
+        url = path_to_url(location)
+        path = location
+    elif location.startswith("file:"):  # A file: URL.
+        url = location
+        path = url_to_path(location)
+    elif is_url(location):
+        url = location
+
+    if url is None:
+        msg = (
+            "Location '%s' is ignored: "
+            "it is either a non-existing path or lacks a specific scheme."
+        )
+        logger.warning(msg, location)
+        return (None, None)
+
+    if path is None:
+        source: LinkSource = _RemoteFileSource(
+            candidates_from_page=candidates_from_page,
+            page_validator=page_validator,
+            link=Link(url, cache_link_parsing=cache_link_parsing),
+        )
+        return (url, source)
+
+    if os.path.isdir(path):
+        if expand_dir:
+            source = _FlatDirectorySource(
+                candidates_from_page=candidates_from_page,
+                path=path,
+            )
+        else:
+            source = _IndexDirectorySource(
+                candidates_from_page=candidates_from_page,
+                link=Link(url, cache_link_parsing=cache_link_parsing),
+            )
+        return (url, source)
+    elif os.path.isfile(path):
+        source = _LocalFileSource(
+            candidates_from_page=candidates_from_page,
+            link=Link(url, cache_link_parsing=cache_link_parsing),
+        )
+        return (url, source)
+    logger.warning(
+        "Location '%s' is ignored: it is neither a file nor a directory.",
+        location,
+    )
+    return (url, None)
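
A quick sketch of the dispatch above, with the callbacks stubbed out (the
URL is arbitrary; an existing directory would instead yield
_FlatDirectorySource when expand_dir=True, _IndexDirectorySource
otherwise, and an existing file _LocalFileSource):

    url, source = build_source(
        "https://pypi.org/simple/",
        candidates_from_page=lambda link: [],  # stub
        page_validator=lambda link: True,      # stub
        expand_dir=False,
        cache_link_parsing=False,
    )
    assert isinstance(source, _RemoteFileSource)

    # A bogus location comes back as (None, None) and is skipped upstream.
    assert build_source(
        "no-such-path-or-scheme",
        candidates_from_page=lambda link: [],
        page_validator=lambda link: True,
        expand_dir=False,
        cache_link_parsing=False,
    ) == (None, None)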

tests/unit/test_collector.py

@@ -1,3 +1,4 @@
+import itertools
import logging
import os.path
import re
@@ -23,10 +24,9 @@ from pip._internal.index.collector import (
    _make_html_page,
    _NotHTML,
    _NotHTTP,
-    _remove_duplicate_links,
-    group_locations,
    parse_links,
)
+from pip._internal.index.sources import _FlatDirectorySource, _IndexDirectorySource
from pip._internal.models.index import PyPI
from pip._internal.models.link import Link
from pip._internal.network.session import PipSession
@@ -587,46 +587,79 @@ def test_get_html_page_directory_append_index(tmpdir):
    assert actual.url == expected_url


-def test_remove_duplicate_links():
-    links = [
-        # We choose Links that will test that ordering is preserved.
-        Link('https://example.com/2'),
-        Link('https://example.com/1'),
-        Link('https://example.com/2'),
-    ]
-    actual = _remove_duplicate_links(links)
-    assert actual == [
-        Link('https://example.com/2'),
-        Link('https://example.com/1'),
-    ]
-
-
-def test_group_locations__file_expand_dir(data):
+def test_collect_sources__file_expand_dir(data):
    """
-    Test that a file:// dir gets listdir run with expand_dir
+    Test that a file:// dir from --find-links becomes _FlatDirectorySource
    """
-    files, urls = group_locations([data.find_links], expand_dir=True)
-    assert files and not urls, (
-        "files and not urls should have been found "
+    collector = LinkCollector.create(
+        session=pretend.stub(is_secure_origin=None),  # Shouldn't be used.
+        options=pretend.stub(
+            index_url="ignored-by-no-index",
+            extra_index_urls=[],
+            no_index=True,
+            find_links=[data.find_links],
+        ),
+    )
+    sources = collector.collect_sources(
+        project_name=None,  # Shouldn't be used.
+        candidates_from_page=None,  # Shouldn't be used.
+    )
+    assert (
+        not sources.index_urls
+        and len(sources.find_links) == 1
+        and isinstance(sources.find_links[0], _FlatDirectorySource)
+    ), (
+        "Directory source should have been found "
        f"at find-links url: {data.find_links}"
    )


-def test_group_locations__file_not_find_link(data):
+def test_collect_sources__file_not_find_link(data):
    """
-    Test that a file:// url dir that's not a find-link, doesn't get a listdir
-    run
+    Test that a file:// dir from --index-url doesn't become _FlatDirectorySource
    """
-    files, urls = group_locations([data.index_url("empty_with_pkg")])
-    assert urls and not files, "urls, but not files should have been found"
+    collector = LinkCollector.create(
+        session=pretend.stub(is_secure_origin=None),  # Shouldn't be used.
+        options=pretend.stub(
+            index_url=data.index_url("empty_with_pkg"),
+            extra_index_urls=[],
+            no_index=False,
+            find_links=[],
+        ),
+    )
+    sources = collector.collect_sources(
+        project_name="",
+        candidates_from_page=None,  # Shouldn't be used.
+    )
+    assert (
+        not sources.find_links
+        and len(sources.index_urls) == 1
+        and isinstance(sources.index_urls[0], _IndexDirectorySource)
+    ), "Directory specified as index should be treated as a page"


-def test_group_locations__non_existing_path():
+def test_collect_sources__non_existing_path():
    """
    Test that a non-existing path is ignored.
    """
-    files, urls = group_locations([os.path.join('this', 'doesnt', 'exist')])
-    assert not urls and not files, "nothing should have been found"
+    collector = LinkCollector.create(
+        session=pretend.stub(is_secure_origin=None),  # Shouldn't be used.
+        options=pretend.stub(
+            index_url="ignored-by-no-index",
+            extra_index_urls=[],
+            no_index=True,
+            find_links=[os.path.join("this", "doesnt", "exist")],
+        ),
+    )
+    sources = collector.collect_sources(
+        project_name=None,  # Shouldn't be used.
+        candidates_from_page=None,  # Shouldn't be used.
+    )
+    assert (
+        not sources.index_urls
+        and sources.find_links == [None]
+    ), "Nothing should have been found"


def check_links_include(links, names):
@@ -664,7 +697,7 @@ class TestLinkCollector:
            url, session=link_collector.session,
        )

-    def test_collect_links(self, caplog, data):
+    def test_collect_sources(self, caplog, data):
        caplog.set_level(logging.DEBUG)

        link_collector = make_test_link_collector(
@@ -673,20 +706,33 @@ class TestLinkCollector:
            # is skipped.
            index_urls=[PyPI.simple_url, PyPI.simple_url],
        )
-        actual = link_collector.collect_links('twine')
+        collected_sources = link_collector.collect_sources(
+            "twine",
+            candidates_from_page=lambda link: [link],
+        )

-        # Spot-check the CollectedLinks return value.
-        assert len(actual.files) > 20
-        check_links_include(actual.files, names=['simple-1.0.tar.gz'])
+        files_it = itertools.chain.from_iterable(
+            source.file_links()
+            for sources in collected_sources
+            for source in sources
+            if source is not None
+        )
+        pages_it = itertools.chain.from_iterable(
+            source.page_candidates()
+            for sources in collected_sources
+            for source in sources
+            if source is not None
+        )
+        files = list(files_it)
+        pages = list(pages_it)

-        assert len(actual.find_links) == 1
-        check_links_include(actual.find_links, names=['packages'])
-        # Check that find-links URLs are marked as cacheable.
-        assert actual.find_links[0].cache_link_parsing
+        # Spot-check the returned sources.
+        assert len(files) > 20
+        check_links_include(files, names=["simple-1.0.tar.gz"])

-        assert actual.project_urls == [Link('https://pypi.org/simple/twine/')]
+        assert pages == [Link('https://pypi.org/simple/twine/')]
        # Check that index URLs are marked as *un*cacheable.
-        assert not actual.project_urls[0].cache_link_parsing
+        assert not pages[0].cache_link_parsing

        expected_message = dedent("""\
            1 location(s) to search for versions of twine: