
Use data-dist-info-metadata (PEP 658) to decouple resolution from downloading (#11111)

Co-authored-by: Tzu-ping Chung <uranusjr@gmail.com>
Danny McClanahan, 2022-09-10 10:28:57 +00:00, committed by GitHub
parent a66406a409
commit bad03ef931
14 changed files with 834 additions and 180 deletions

news/11111.feature.rst (new file)

@ -0,0 +1 @@
Use the ``data-dist-info-metadata`` attribute from :pep:`658` to resolve distribution metadata without downloading the dist yet.
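
For context: a PEP 658-enabled index serves each file's core metadata at the file URL plus a ``.metadata`` suffix, and advertises it via the ``data-dist-info-metadata`` attribute on the file's anchor tag; the value is either ``true`` (the metadata file exists, no checksum given) or ``<hashname>=<hashvalue>`` naming a checksum of that file. A minimal sketch using the ``Link`` classmethods this commit adds in models/link.py below (URL and digest are illustrative placeholders):

from pip._internal.models.link import Link

link = Link.from_element(
    {
        "href": "/holygrail-1.0-py3-none-any.whl",
        "data-dist-info-metadata": "sha256=0123abcd",  # placeholder digest
    },
    page_url="https://index.example.com/holygrail/",
    base_url="https://index.example.com/holygrail/",
)
metadata_link = link.metadata_link()
# metadata_link.url == "https://index.example.com/holygrail-1.0-py3-none-any.whl.metadata"
# metadata_link.link_hash == LinkHash("sha256", "0123abcd"), checked on download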


@ -335,8 +335,8 @@ class MetadataInconsistent(InstallationError):
"""Built metadata contains inconsistent information.
This is raised when the metadata contains values (e.g. name and version)
that do not match the information previously obtained from sdist filename
or user-supplied ``#egg=`` value.
that do not match the information previously obtained from sdist filename,
user-supplied ``#egg=`` value, or an install requirement name.
"""
def __init__(
@ -348,11 +348,10 @@ class MetadataInconsistent(InstallationError):
self.m_val = m_val
def __str__(self) -> str:
template = (
"Requested {} has inconsistent {}: "
"filename has {!r}, but metadata has {!r}"
return (
f"Requested {self.ireq} has inconsistent {self.field}: "
f"expected {self.f_val!r}, but metadata has {self.m_val!r}"
)
return template.format(self.ireq, self.field, self.f_val, self.m_val)
class LegacyInstallFailure(DiagnosticPipError):
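
With the f-string rewrite, the message reports the expected value rather than assuming it came from a filename, which also fits the PEP 658 name-consistency check added in prepare.py below. Roughly (the first argument is normally an InstallRequirement; any object with a sensible str() shows the format):

from pip._internal.exceptions import MetadataInconsistent

err = MetadataInconsistent("simple==1.0", "Name", "simple", "other-name")
print(err)
# Requested simple==1.0 has inconsistent Name: expected 'simple', but metadata has 'other-name'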


@ -9,10 +9,8 @@ import itertools
import json
import logging
import os
import re
import urllib.parse
import urllib.request
import xml.etree.ElementTree
from html.parser import HTMLParser
from optparse import Values
from typing import (
@ -39,7 +37,7 @@ from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.utils.misc import redact_auth_from_url
from pip._internal.vcs import vcs
from .sources import CandidatesFromPage, LinkSource, build_source
@ -51,7 +49,6 @@ else:
logger = logging.getLogger(__name__)
HTMLElement = xml.etree.ElementTree.Element
ResponseHeaders = MutableMapping[str, str]
@ -191,94 +188,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
return None
def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
return urllib.parse.quote(urllib.parse.unquote(part))
def _clean_file_url_path(part: str) -> str:
"""
Clean the first part of a URL path that corresponds to a local
filesystem path (i.e. the first part after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
# Also, on Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
return urllib.request.pathname2url(urllib.request.url2pathname(part))
# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)
def _clean_url_path(path: str, is_local_path: bool) -> str:
"""
Clean the path portion of a URL.
"""
if is_local_path:
clean_func = _clean_file_url_path
else:
clean_func = _clean_url_path_part
# Split on the reserved characters prior to cleaning so that
# revision strings in VCS URLs are properly preserved.
parts = _reserved_chars_re.split(path)
cleaned_parts = []
for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
cleaned_parts.append(clean_func(to_clean))
# Normalize %xx escapes (e.g. %2f -> %2F)
cleaned_parts.append(reserved.upper())
return "".join(cleaned_parts)
def _clean_link(url: str) -> str:
"""
Make sure a link is fully quoted.
For example, if ' ' occurs in the URL, it will be replaced with "%20",
and without double-quoting other characters.
"""
# Split the URL into parts according to the general structure
# `scheme://netloc/path;parameters?query#fragment`.
result = urllib.parse.urlparse(url)
# If the netloc is empty, then the URL refers to a local filesystem path.
is_local_path = not result.netloc
path = _clean_url_path(result.path, is_local_path=is_local_path)
return urllib.parse.urlunparse(result._replace(path=path))
def _create_link_from_element(
element_attribs: Dict[str, Optional[str]],
page_url: str,
base_url: str,
) -> Optional[Link]:
"""
Convert an anchor element's attributes in a simple repository page to a Link.
"""
href = element_attribs.get("href")
if not href:
return None
url = _clean_link(urllib.parse.urljoin(base_url, href))
pyrequire = element_attribs.get("data-requires-python")
yanked_reason = element_attribs.get("data-yanked")
link = Link(
url,
comes_from=page_url,
requires_python=pyrequire,
yanked_reason=yanked_reason,
)
return link
class CacheablePageContent:
def __init__(self, page: "IndexContent") -> None:
assert page.cache_link_parsing
@ -326,25 +235,10 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
data = json.loads(page.content)
for file in data.get("files", []):
file_url = file.get("url")
if file_url is None:
link = Link.from_json(file, page.url)
if link is None:
continue
# The Link.yanked_reason expects an empty string instead of a boolean.
yanked_reason = file.get("yanked")
if yanked_reason and not isinstance(yanked_reason, str):
yanked_reason = ""
# The Link.yanked_reason expects None instead of False
elif not yanked_reason:
yanked_reason = None
yield Link(
_clean_link(urllib.parse.urljoin(page.url, file_url)),
comes_from=page.url,
requires_python=file.get("requires-python"),
yanked_reason=yanked_reason,
hashes=file.get("hashes", {}),
)
yield link
return
parser = HTMLLinkParser(page.url)
@ -354,11 +248,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
url = page.url
base_url = parser.base_url or url
for anchor in parser.anchors:
link = _create_link_from_element(
anchor,
page_url=url,
base_url=base_url,
)
link = Link.from_element(anchor, page_url=url, base_url=base_url)
if link is None:
continue
yield link
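
The quoting helper itself is unchanged apart from its new name and new home in models/link.py; per its docstring, for example (URL illustrative; the leading underscore marks it as internal):

from pip._internal.models.link import _ensure_quoted_url

_ensure_quoted_url("https://example.com/some dir/file 1.0.tar.gz")
# -> "https://example.com/some%20dir/file%201.0.tar.gz"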


@ -103,3 +103,25 @@ def get_wheel_distribution(wheel: Wheel, canonical_name: str) -> BaseDistributio
:param canonical_name: Normalized project name of the given wheel.
"""
return select_backend().Distribution.from_wheel(wheel, canonical_name)
def get_metadata_distribution(
metadata_contents: bytes,
filename: str,
canonical_name: str,
) -> BaseDistribution:
"""Get the dist representation of the specified METADATA file contents.
This returns a Distribution instance from the chosen backend sourced from the data
in `metadata_contents`.
:param metadata_contents: Contents of a METADATA file within a dist, or one served
via PEP 658.
:param filename: Filename for the dist this metadata represents.
:param canonical_name: Normalized project name of the given dist.
"""
return select_backend().Distribution.from_metadata_file_contents(
metadata_contents,
filename,
canonical_name,
)
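
A sketch of how the preparer (in prepare.py, further down) uses this entry point; the bytes here are a truncated stand-in for real METADATA contents:

from pip._internal.metadata import get_metadata_distribution

metadata_contents = b"Metadata-Version: 2.1\nName: simple\nVersion: 1.0\n"
dist = get_metadata_distribution(metadata_contents, "simple-1.0.tar.gz", "simple")
dist.raw_name  # -> "simple"; checked against the requirement name during preparation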


@ -113,6 +113,24 @@ class BaseDistribution(Protocol):
"""
raise NotImplementedError()
@classmethod
def from_metadata_file_contents(
cls,
metadata_contents: bytes,
filename: str,
project_name: str,
) -> "BaseDistribution":
"""Load the distribution from the contents of a METADATA file.
This is used to implement PEP 658 by generating a "shallow" dist object that can
be used for resolution without downloading or building the actual dist yet.
:param metadata_contents: The contents of a METADATA file.
:param filename: File name for the dist with this metadata.
:param project_name: Name of the project this dist represents.
"""
raise NotImplementedError()
@classmethod
def from_wheel(cls, wheel: "Wheel", name: str) -> "BaseDistribution":
"""Load the distribution from a given wheel.


@ -28,6 +28,7 @@ from pip._internal.metadata.base import (
)
from pip._internal.utils.misc import normalize_path
from pip._internal.utils.packaging import safe_extra
from pip._internal.utils.temp_dir import TempDirectory
from pip._internal.utils.wheel import parse_wheel, read_wheel_metadata_file
from ._compat import BasePath, get_dist_name
@ -109,6 +110,23 @@ class Distribution(BaseDistribution):
dist = importlib.metadata.Distribution.at(info_location)
return cls(dist, info_location, info_location.parent)
@classmethod
def from_metadata_file_contents(
cls,
metadata_contents: bytes,
filename: str,
project_name: str,
) -> BaseDistribution:
# Generate temp dir to contain the metadata file, and write the file contents.
temp_dir = pathlib.Path(
TempDirectory(kind="metadata", globally_managed=True).path
)
metadata_path = temp_dir / "METADATA"
metadata_path.write_bytes(metadata_contents)
# Construct dist pointing to the newly created directory.
dist = importlib.metadata.Distribution.at(metadata_path.parent)
return cls(dist, metadata_path.parent, None)
@classmethod
def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
try:


@ -33,7 +33,7 @@ class EntryPoint(NamedTuple):
group: str
class WheelMetadata:
class InMemoryMetadata:
"""IMetadataProvider that reads metadata files from a dictionary.
This also maps metadata decoding exceptions to our internal exception type.
@ -92,12 +92,29 @@ class Distribution(BaseDistribution):
dist = dist_cls(base_dir, project_name=dist_name, metadata=metadata)
return cls(dist)
@classmethod
def from_metadata_file_contents(
cls,
metadata_contents: bytes,
filename: str,
project_name: str,
) -> BaseDistribution:
metadata_dict = {
"METADATA": metadata_contents,
}
dist = pkg_resources.DistInfoDistribution(
location=filename,
metadata=InMemoryMetadata(metadata_dict, filename),
project_name=project_name,
)
return cls(dist)
@classmethod
def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
try:
with wheel.as_zipfile() as zf:
info_dir, _ = parse_wheel(zf, name)
metadata_text = {
metadata_dict = {
path.split("/", 1)[-1]: read_wheel_metadata_file(zf, path)
for path in zf.namelist()
if path.startswith(f"{info_dir}/")
@ -108,7 +125,7 @@ class Distribution(BaseDistribution):
raise UnsupportedWheel(f"{name} has an invalid wheel, {e}")
dist = pkg_resources.DistInfoDistribution(
location=wheel.location,
metadata=WheelMetadata(metadata_text, wheel.location),
metadata=InMemoryMetadata(metadata_dict, wheel.location),
project_name=name,
)
return cls(dist)


@ -1,11 +1,14 @@
import functools
import itertools
import logging
import os
import posixpath
import re
import urllib.parse
from dataclasses import dataclass
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Mapping,
@ -18,6 +21,7 @@ from typing import (
from pip._internal.utils.filetypes import WHEEL_EXTENSION
from pip._internal.utils.hashes import Hashes
from pip._internal.utils.misc import (
pairwise,
redact_auth_from_url,
split_auth_from_netloc,
splitext,
@ -36,6 +40,119 @@ logger = logging.getLogger(__name__)
_SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5")
@dataclass(frozen=True)
class LinkHash:
"""Links to content may have embedded hash values. This class parses those.
`name` must be a member of `_SUPPORTED_HASHES`.
This class can be converted to and from `ArchiveInfo`. While ArchiveInfo intends to
be JSON-serializable to conform to PEP 610, this class contains the logic for
parsing a hash name and value for correctness, and then checking whether that hash
conforms to a schema with `.is_hash_allowed()`."""
name: str
value: str
_hash_re = re.compile(
# NB: we do not validate that the second group (.*) is a valid hex
# digest. Instead, we simply keep that string in this class, and then check it
# against Hashes when hash-checking is needed. This is easier to debug than
# proactively discarding an invalid hex digest, as we handle incorrect hashes
# and malformed hashes in the same place.
r"({choices})=(.*)".format(
choices="|".join(re.escape(hash_name) for hash_name in _SUPPORTED_HASHES)
),
)
def __post_init__(self) -> None:
assert self._hash_re.match(f"{self.name}={self.value}")
@classmethod
@functools.lru_cache(maxsize=None)
def split_hash_name_and_value(cls, url: str) -> Optional["LinkHash"]:
"""Search a string for a checksum algorithm name and encoded output value."""
match = cls._hash_re.search(url)
if match is None:
return None
name, value = match.groups()
return cls(name=name, value=value)
def as_hashes(self) -> Hashes:
"""Return a Hashes instance which checks only for the current hash."""
return Hashes({self.name: [self.value]})
def is_hash_allowed(self, hashes: Optional[Hashes]) -> bool:
"""
Return True if the current hash is allowed by `hashes`.
"""
if hashes is None:
return False
return hashes.is_hash_allowed(self.name, hex_digest=self.value)
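
The parsing behaviour, as exercised by the new unit tests at the bottom of this diff:

from pip._internal.models.link import LinkHash

LinkHash.split_hash_name_and_value("https://pypi.org/pip-18.0.tar.gz#sha256=aa113592bbe")
# -> LinkHash(name="sha256", value="aa113592bbe")
LinkHash.split_hash_name_and_value("https://pypi.org/pip-18.0.tar.gz#sha500=aa113592bbe")
# -> None: "sha500" is not in _SUPPORTED_HASHES, so it is discarded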
def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
return urllib.parse.quote(urllib.parse.unquote(part))
def _clean_file_url_path(part: str) -> str:
"""
Clean the first part of a URL path that corresponds to a local
filesystem path (i.e. the first part after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
# Also, on Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
return urllib.request.pathname2url(urllib.request.url2pathname(part))
# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)
def _clean_url_path(path: str, is_local_path: bool) -> str:
"""
Clean the path portion of a URL.
"""
if is_local_path:
clean_func = _clean_file_url_path
else:
clean_func = _clean_url_path_part
# Split on the reserved characters prior to cleaning so that
# revision strings in VCS URLs are properly preserved.
parts = _reserved_chars_re.split(path)
cleaned_parts = []
for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
cleaned_parts.append(clean_func(to_clean))
# Normalize %xx escapes (e.g. %2f -> %2F)
cleaned_parts.append(reserved.upper())
return "".join(cleaned_parts)
def _ensure_quoted_url(url: str) -> str:
"""
Make sure a link is fully quoted.
For example, if ' ' occurs in the URL, it will be replaced with "%20",
and without double-quoting other characters.
"""
# Split the URL into parts according to the general structure
# `scheme://netloc/path;parameters?query#fragment`.
result = urllib.parse.urlparse(url)
# If the netloc is empty, then the URL refers to a local filesystem path.
is_local_path = not result.netloc
path = _clean_url_path(result.path, is_local_path=is_local_path)
return urllib.parse.urlunparse(result._replace(path=path))
class Link(KeyBasedCompareMixin):
"""Represents a parsed link from a Package Index's simple URL"""
@ -46,6 +163,8 @@ class Link(KeyBasedCompareMixin):
"comes_from",
"requires_python",
"yanked_reason",
"dist_info_metadata",
"link_hash",
"cache_link_parsing",
]
@ -55,6 +174,8 @@ class Link(KeyBasedCompareMixin):
comes_from: Optional[Union[str, "IndexContent"]] = None,
requires_python: Optional[str] = None,
yanked_reason: Optional[str] = None,
dist_info_metadata: Optional[str] = None,
link_hash: Optional[LinkHash] = None,
cache_link_parsing: bool = True,
hashes: Optional[Mapping[str, str]] = None,
) -> None:
@ -72,6 +193,14 @@ class Link(KeyBasedCompareMixin):
a simple repository HTML link. If the file has been yanked but
no reason was provided, this should be the empty string. See
PEP 592 for more information and the specification.
:param dist_info_metadata: the metadata attached to the file, or None if no such
metadata is provided. This is the value of the "data-dist-info-metadata"
attribute, if present, in a simple repository HTML link. This may be parsed
into its own `Link` by `self.metadata_link()`. See PEP 658 for more
information and the specification.
:param link_hash: a checksum for the content the link points to. If not
provided, this will be extracted from the link URL, if the URL has
any checksum.
:param cache_link_parsing: A flag that is used elsewhere to determine
whether resources retrieved from this link
should be cached. PyPI index urls should
@ -94,11 +223,75 @@ class Link(KeyBasedCompareMixin):
self.comes_from = comes_from
self.requires_python = requires_python if requires_python else None
self.yanked_reason = yanked_reason
self.dist_info_metadata = dist_info_metadata
self.link_hash = link_hash or LinkHash.split_hash_name_and_value(self._url)
super().__init__(key=url, defining_class=Link)
self.cache_link_parsing = cache_link_parsing
@classmethod
def from_json(
cls,
file_data: Dict[str, Any],
page_url: str,
) -> Optional["Link"]:
"""
Convert a PyPI JSON document from a simple repository page into a Link.
"""
file_url = file_data.get("url")
if file_url is None:
return None
url = _ensure_quoted_url(urllib.parse.urljoin(page_url, file_url))
pyrequire = file_data.get("requires-python")
yanked_reason = file_data.get("yanked")
dist_info_metadata = file_data.get("dist-info-metadata")
hashes = file_data.get("hashes", {})
# The Link.yanked_reason expects an empty string instead of a boolean.
if yanked_reason and not isinstance(yanked_reason, str):
yanked_reason = ""
# The Link.yanked_reason expects None instead of False.
elif not yanked_reason:
yanked_reason = None
return cls(
url,
comes_from=page_url,
requires_python=pyrequire,
yanked_reason=yanked_reason,
hashes=hashes,
dist_info_metadata=dist_info_metadata,
)
@classmethod
def from_element(
cls,
anchor_attribs: Dict[str, Optional[str]],
page_url: str,
base_url: str,
) -> Optional["Link"]:
"""
Convert an anchor element's attributes in a simple repository page to a Link.
"""
href = anchor_attribs.get("href")
if not href:
return None
url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href))
pyrequire = anchor_attribs.get("data-requires-python")
yanked_reason = anchor_attribs.get("data-yanked")
dist_info_metadata = anchor_attribs.get("data-dist-info-metadata")
return cls(
url,
comes_from=page_url,
requires_python=pyrequire,
yanked_reason=yanked_reason,
dist_info_metadata=dist_info_metadata,
)
def __str__(self) -> str:
if self.requires_python:
rp = f" (requires-python:{self.requires_python})"
@ -181,32 +374,36 @@ class Link(KeyBasedCompareMixin):
return None
return match.group(1)
_hash_re = re.compile(
r"({choices})=([a-f0-9]+)".format(choices="|".join(_SUPPORTED_HASHES))
)
def metadata_link(self) -> Optional["Link"]:
"""Implementation of PEP 658 parsing."""
# Note that Link.from_element() parsing the "data-dist-info-metadata" attribute
# from an HTML anchor tag is typically how the Link.dist_info_metadata attribute
# gets set.
if self.dist_info_metadata is None:
return None
metadata_url = f"{self.url_without_fragment}.metadata"
link_hash: Optional[LinkHash] = None
# If data-dist-info-metadata="true" is set, then the metadata file exists,
# but there is no information about its checksum or anything else.
if self.dist_info_metadata != "true":
link_hash = LinkHash.split_hash_name_and_value(self.dist_info_metadata)
return Link(metadata_url, link_hash=link_hash)
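
The JSON simple-index client path carries the same information under the "dist-info-metadata" key; a sketch mirroring the unit test near the end of this diff:

file_data = {
    "filename": "holygrail-1.0-py3-none-any.whl",
    "url": "/files/holygrail-1.0-py3-none-any.whl",
    "hashes": {"sha256": "sha256 hash"},
    "dist-info-metadata": "sha512=aabdd41",
}
link = Link.from_json(file_data, page_url="https://example.com/simple/")
metadata_link = link.metadata_link()
# metadata_link.url == "https://example.com/files/holygrail-1.0-py3-none-any.whl.metadata"
# metadata_link.link_hash == LinkHash("sha512", "aabdd41"), so the fetch is hash-checked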
def as_hashes(self) -> Optional[Hashes]:
if self.link_hash is not None:
return self.link_hash.as_hashes()
return None
@property
def hash(self) -> Optional[str]:
for hashname in _SUPPORTED_HASHES:
if hashname in self._hashes:
return self._hashes[hashname]
match = self._hash_re.search(self._url)
if match:
return match.group(2)
if self.link_hash is not None:
return self.link_hash.value
return None
@property
def hash_name(self) -> Optional[str]:
for hashname in _SUPPORTED_HASHES:
if hashname in self._hashes:
return hashname
match = self._hash_re.search(self._url)
if match:
return match.group(1)
if self.link_hash is not None:
return self.link_hash.name
return None
@property
@ -236,19 +433,15 @@ class Link(KeyBasedCompareMixin):
@property
def has_hash(self) -> bool:
return self.hash_name is not None
return self.link_hash is not None
def is_hash_allowed(self, hashes: Optional[Hashes]) -> bool:
"""
Return True if the link has a hash and it is allowed.
Return True if the link has a hash and it is allowed by `hashes`.
"""
if hashes is None or not self.has_hash:
if self.link_hash is None:
return False
# Assert non-None so mypy knows self.hash_name and self.hash are str.
assert self.hash_name is not None
assert self.hash is not None
return hashes.is_hash_allowed(self.hash_name, hex_digest=self.hash)
return self.link_hash.is_hash_allowed(hashes)
class _CleanResult(NamedTuple):


@ -19,12 +19,13 @@ from pip._internal.exceptions import (
HashMismatch,
HashUnpinned,
InstallationError,
MetadataInconsistent,
NetworkConnectionError,
PreviousBuildDirError,
VcsHashUnsupported,
)
from pip._internal.index.package_finder import PackageFinder
from pip._internal.metadata import BaseDistribution
from pip._internal.metadata import BaseDistribution, get_metadata_distribution
from pip._internal.models.direct_url import ArchiveInfo
from pip._internal.models.link import Link
from pip._internal.models.wheel import Wheel
@ -346,19 +347,72 @@ class RequirementPreparer:
# showing the user what the hash should be.
return req.hashes(trust_internet=False) or MissingHashes()
def _fetch_metadata_only(
self,
req: InstallRequirement,
) -> Optional[BaseDistribution]:
if self.require_hashes:
logger.debug(
"Metadata-only fetching is not used as hash checking is required",
)
return None
# Try PEP 658 metadata first, then fall back to lazy wheel if unavailable.
return self._fetch_metadata_using_link_data_attr(
req
) or self._fetch_metadata_using_lazy_wheel(req.link)
def _fetch_metadata_using_link_data_attr(
self,
req: InstallRequirement,
) -> Optional[BaseDistribution]:
"""Fetch metadata from the data-dist-info-metadata attribute, if possible."""
# (1) Get the link to the metadata file, if provided by the backend.
metadata_link = req.link.metadata_link()
if metadata_link is None:
return None
assert req.req is not None
logger.info(
"Obtaining dependency information for %s from %s",
req.req,
metadata_link,
)
# (2) Download the contents of the METADATA file, separate from the dist itself.
metadata_file = get_http_url(
metadata_link,
self._download,
hashes=metadata_link.as_hashes(),
)
with open(metadata_file.path, "rb") as f:
metadata_contents = f.read()
# (3) Generate a dist just from those file contents.
metadata_dist = get_metadata_distribution(
metadata_contents,
req.link.filename,
req.req.name,
)
# (4) Ensure the Name: field from the METADATA file matches the name from the
# install requirement.
#
# NB: raw_name will fall back to the name from the install requirement if
# the Name: field is not present, but it's noted in the raw_name docstring
# that that should NEVER happen anyway.
if metadata_dist.raw_name != req.req.name:
raise MetadataInconsistent(
req, "Name", req.req.name, metadata_dist.raw_name
)
return metadata_dist
def _fetch_metadata_using_lazy_wheel(
self,
link: Link,
) -> Optional[BaseDistribution]:
"""Fetch metadata using lazy wheel, if possible."""
# --use-feature=fast-deps must be provided.
if not self.use_lazy_wheel:
return None
if self.require_hashes:
logger.debug("Lazy wheel is not used as hash checking is required")
return None
if link.is_file or not link.is_wheel:
logger.debug(
"Lazy wheel is not used as %r does not points to a remote wheel",
"Lazy wheel is not used as %r does not point to a remote wheel",
link,
)
return None
@ -414,13 +468,12 @@ class RequirementPreparer:
) -> BaseDistribution:
"""Prepare a requirement to be obtained from req.link."""
assert req.link
link = req.link
self._log_preparing_link(req)
with indent_log():
# Check if the relevant file is already available
# in the download directory
file_path = None
if self.download_dir is not None and link.is_wheel:
if self.download_dir is not None and req.link.is_wheel:
hashes = self._get_linked_req_hashes(req)
file_path = _check_download_dir(req.link, self.download_dir, hashes)
@ -429,10 +482,10 @@ class RequirementPreparer:
self._downloaded[req.link.url] = file_path
else:
# The file is not available, attempt to fetch only metadata
wheel_dist = self._fetch_metadata_using_lazy_wheel(link)
if wheel_dist is not None:
metadata_dist = self._fetch_metadata_only(req)
if metadata_dist is not None:
req.needs_more_preparation = True
return wheel_dist
return metadata_dist
# None of the optimizations worked, fully prepare the requirement
return self._prepare_linked_requirement(req, parallel_builds)
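
The user-visible effect: against a PEP 658-enabled index, the preparer fetches only the small .metadata file during resolution and defers the dist download, while hash-checking mode skips both fast paths since _fetch_metadata_only() returns None there. The log line produced by the logger.info call above looks like (URL illustrative):

Obtaining dependency information for simple==1.0 from https://index.example.com/files/simple-1.0.tar.gz.metadata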


@ -1,17 +1,24 @@
import os
import re
import shutil
import textwrap
import uuid
from dataclasses import dataclass
from enum import Enum
from hashlib import sha256
from pathlib import Path
from typing import List
from textwrap import dedent
from typing import Callable, Dict, List, Tuple
import pytest
from pip._internal.cli.status_codes import ERROR
from pip._internal.utils.urls import path_to_url
from tests.conftest import MockServer, ScriptFactory
from tests.lib import (
PipTestEnvironment,
TestData,
TestPipResult,
create_basic_sdist_for_package,
create_really_basic_wheel,
)
@ -1230,3 +1237,349 @@ def test_download_use_pep517_propagation(
downloads = os.listdir(download_dir)
assert len(downloads) == 2
class MetadataKind(Enum):
"""All the types of values we might be provided for the data-dist-info-metadata
attribute from PEP 658."""
# Valid: will read metadata from the dist instead.
No = "none"
# Valid: will read the .metadata file, but won't check its hash.
Unhashed = "unhashed"
# Valid: will read the .metadata file and check its hash matches.
Sha256 = "sha256"
# Invalid: will error out after checking the hash.
WrongHash = "wrong-hash"
# Invalid: will error out after failing to fetch the .metadata file.
NoFile = "no-file"
@dataclass(frozen=True)
class Package:
"""Mock package structure used to generate a PyPI repository.
Package name and version should correspond to sdists (.tar.gz files) in our test
data."""
name: str
version: str
filename: str
metadata: MetadataKind
# This will override any dependencies specified in the actual dist's METADATA.
requires_dist: Tuple[str, ...] = ()
def metadata_filename(self) -> str:
"""This is specified by PEP 658."""
return f"{self.filename}.metadata"
def generate_additional_tag(self) -> str:
"""This gets injected into the <a> tag in the generated PyPI index page for this
package."""
if self.metadata == MetadataKind.No:
return ""
if self.metadata in [MetadataKind.Unhashed, MetadataKind.NoFile]:
return 'data-dist-info-metadata="true"'
if self.metadata == MetadataKind.WrongHash:
return 'data-dist-info-metadata="sha256=WRONG-HASH"'
assert self.metadata == MetadataKind.Sha256
checksum = sha256(self.generate_metadata()).hexdigest()
return f'data-dist-info-metadata="sha256={checksum}"'
def requires_str(self) -> str:
if not self.requires_dist:
return ""
joined = " and ".join(self.requires_dist)
return f"Requires-Dist: {joined}"
def generate_metadata(self) -> bytes:
"""This is written to `self.metadata_filename()` and will override the actual
dist's METADATA, unless `self.metadata == MetadataKind.NoFile`."""
return dedent(
f"""\
Metadata-Version: 2.1
Name: {self.name}
Version: {self.version}
{self.requires_str()}
"""
).encode("utf-8")
@pytest.fixture(scope="function")
def write_index_html_content(tmpdir: Path) -> Callable[[str], Path]:
"""Generate a PyPI package index.html within a temporary local directory."""
html_dir = tmpdir / "index_html_content"
html_dir.mkdir()
def generate_index_html_subdir(index_html: str) -> Path:
"""Create a new subdirectory after a UUID and write an index.html."""
new_subdir = html_dir / uuid.uuid4().hex
new_subdir.mkdir()
with open(new_subdir / "index.html", "w") as f:
f.write(index_html)
return new_subdir
return generate_index_html_subdir
@pytest.fixture(scope="function")
def html_index_for_packages(
shared_data: TestData,
write_index_html_content: Callable[[str], Path],
) -> Callable[..., Path]:
"""Generate a PyPI HTML package index within a local directory pointing to
blank data."""
def generate_html_index_for_packages(packages: Dict[str, List[Package]]) -> Path:
"""
Produce a PyPI directory structure pointing to the specified packages.
"""
# (1) Generate the content for a PyPI index.html.
pkg_links = "\n".join(
f' <a href="{pkg}/index.html">{pkg}</a>' for pkg in packages.keys()
)
index_html = f"""\
<!DOCTYPE html>
<html>
<head>
<meta name="pypi:repository-version" content="1.0">
<title>Simple index</title>
</head>
<body>
{pkg_links}
</body>
</html>"""
# (2) Generate the index.html in a new subdirectory of the temp directory.
index_html_subdir = write_index_html_content(index_html)
# (3) Generate subdirectories for individual packages, each with their own
# index.html.
for pkg, links in packages.items():
pkg_subdir = index_html_subdir / pkg
pkg_subdir.mkdir()
download_links: List[str] = []
for package_link in links:
# (3.1) Generate the <a> tag which pip can crawl pointing to this
# specific package version.
download_links.append(
f' <a href="{package_link.filename}" {package_link.generate_additional_tag()}>{package_link.filename}</a><br/>' # noqa: E501
)
# (3.2) Copy over the corresponding file in `shared_data.packages`.
shutil.copy(
shared_data.packages / package_link.filename,
pkg_subdir / package_link.filename,
)
# (3.3) Write a metadata file, if applicable.
if package_link.metadata != MetadataKind.NoFile:
with open(pkg_subdir / package_link.metadata_filename(), "wb") as f:
f.write(package_link.generate_metadata())
# (3.4) After collating all the download links and copying over the files,
# write an index.html with the generated download links for each
# copied file for this specific package name.
download_links_str = "\n".join(download_links)
pkg_index_content = f"""\
<!DOCTYPE html>
<html>
<head>
<meta name="pypi:repository-version" content="1.0">
<title>Links for {pkg}</title>
</head>
<body>
<h1>Links for {pkg}</h1>
{download_links_str}
</body>
</html>"""
with open(pkg_subdir / "index.html", "w") as f:
f.write(pkg_index_content)
return index_html_subdir
return generate_html_index_for_packages
@pytest.fixture(scope="function")
def download_generated_html_index(
script: PipTestEnvironment,
html_index_for_packages: Callable[[Dict[str, List[Package]]], Path],
tmpdir: Path,
) -> Callable[..., Tuple[TestPipResult, Path]]:
"""Execute `pip download` against a generated PyPI index."""
download_dir = tmpdir / "download_dir"
def run_for_generated_index(
packages: Dict[str, List[Package]],
args: List[str],
allow_error: bool = False,
) -> Tuple[TestPipResult, Path]:
"""
Produce a PyPI directory structure pointing to the specified packages, then
execute `pip download -i ...` pointing to our generated index.
"""
index_dir = html_index_for_packages(packages)
pip_args = [
"download",
"-d",
str(download_dir),
"-i",
path_to_url(str(index_dir)),
*args,
]
result = script.pip(*pip_args, allow_error=allow_error)
return (result, download_dir)
return run_for_generated_index
# The package database we generate for testing PEP 658 support.
_simple_packages: Dict[str, List[Package]] = {
"simple": [
Package("simple", "1.0", "simple-1.0.tar.gz", MetadataKind.Sha256),
Package("simple", "2.0", "simple-2.0.tar.gz", MetadataKind.No),
# This will raise a hashing error.
Package("simple", "3.0", "simple-3.0.tar.gz", MetadataKind.WrongHash),
],
"simple2": [
# Override the dependencies here in order to force pip to download
# simple-1.0.tar.gz as well.
Package(
"simple2",
"1.0",
"simple2-1.0.tar.gz",
MetadataKind.Unhashed,
("simple==1.0",),
),
# This will raise an error when pip attempts to fetch the metadata file.
Package("simple2", "2.0", "simple2-2.0.tar.gz", MetadataKind.NoFile),
],
"colander": [
# Ensure we can read the dependencies from a metadata file within a wheel
# *without* PEP 658 metadata.
Package(
"colander", "0.9.9", "colander-0.9.9-py2.py3-none-any.whl", MetadataKind.No
),
],
"compilewheel": [
# Ensure we can override the dependencies of a wheel file by injecting PEP
# 658 metadata.
Package(
"compilewheel",
"1.0",
"compilewheel-1.0-py2.py3-none-any.whl",
MetadataKind.Unhashed,
("simple==1.0",),
),
],
"has-script": [
# Ensure we check PEP 658 metadata hashing errors for wheel files.
Package(
"has-script",
"1.0",
"has.script-1.0-py2.py3-none-any.whl",
MetadataKind.WrongHash,
),
],
"translationstring": [
Package(
"translationstring", "1.1", "translationstring-1.1.tar.gz", MetadataKind.No
),
],
"priority": [
# Ensure we check for a missing metadata file for wheels.
Package(
"priority", "1.0", "priority-1.0-py2.py3-none-any.whl", MetadataKind.NoFile
),
],
}
@pytest.mark.parametrize(
"requirement_to_download, expected_outputs",
[
("simple2==1.0", ["simple-1.0.tar.gz", "simple2-1.0.tar.gz"]),
("simple==2.0", ["simple-2.0.tar.gz"]),
(
"colander",
["colander-0.9.9-py2.py3-none-any.whl", "translationstring-1.1.tar.gz"],
),
(
"compilewheel",
["compilewheel-1.0-py2.py3-none-any.whl", "simple-1.0.tar.gz"],
),
],
)
def test_download_metadata(
download_generated_html_index: Callable[..., Tuple[TestPipResult, Path]],
requirement_to_download: str,
expected_outputs: List[str],
) -> None:
"""Verify that if a data-dist-info-metadata attribute is present, then it is used
instead of the actual dist's METADATA."""
_, download_dir = download_generated_html_index(
_simple_packages,
[requirement_to_download],
)
assert sorted(os.listdir(download_dir)) == expected_outputs
@pytest.mark.parametrize(
"requirement_to_download, real_hash",
[
(
"simple==3.0",
"95e0f200b6302989bcf2cead9465cf229168295ea330ca30d1ffeab5c0fed996",
),
(
"has-script",
"16ba92d7f6f992f6de5ecb7d58c914675cf21f57f8e674fb29dcb4f4c9507e5b",
),
],
)
def test_incorrect_metadata_hash(
download_generated_html_index: Callable[..., Tuple[TestPipResult, Path]],
requirement_to_download: str,
real_hash: str,
) -> None:
"""Verify that if a hash for data-dist-info-metadata is provided, it must match the
actual hash of the metadata file."""
result, _ = download_generated_html_index(
_simple_packages,
[requirement_to_download],
allow_error=True,
)
assert result.returncode != 0
expected_msg = f"""\
Expected sha256 WRONG-HASH
Got {real_hash}"""
assert expected_msg in result.stderr
@pytest.mark.parametrize(
"requirement_to_download, expected_url",
[
("simple2==2.0", "simple2-2.0.tar.gz.metadata"),
("priority", "priority-1.0-py2.py3-none-any.whl.metadata"),
],
)
def test_metadata_not_found(
download_generated_html_index: Callable[..., Tuple[TestPipResult, Path]],
requirement_to_download: str,
expected_url: str,
) -> None:
"""Verify that if a data-dist-info-metadata attribute is provided, that pip will
fetch the .metadata file at the location specified by PEP 658, and error
if unavailable."""
result, _ = download_generated_html_index(
_simple_packages,
[requirement_to_download],
allow_error=True,
)
assert result.returncode != 0
expected_re = re.escape(expected_url)
pattern = re.compile(
f"ERROR: 404 Client Error: FileNotFoundError for url:.*{expected_re}"
)
assert pattern.search(result.stderr), (pattern, result.stderr)


@ -1363,7 +1363,7 @@ def test_new_resolver_skip_inconsistent_metadata(script: PipTestEnvironment) ->
)
assert (
" inconsistent version: filename has '3', but metadata has '2'"
" inconsistent version: expected '3', but metadata has '2'"
) in result.stdout, str(result)
script.assert_installed(a="1")


@ -150,14 +150,6 @@ def html5_page(text: str) -> str:
)
def index_page(spec: Dict[str, str]) -> "WSGIApplication":
def link(name: str, value: str) -> str:
return '<a href="{}">{}</a>'.format(value, name)
links = "".join(link(*kv) for kv in spec.items())
return text_html_response(html5_page(links))
def package_page(spec: Dict[str, str]) -> "WSGIApplication":
def link(name: str, value: str) -> str:
return '<a href="{}">{}</a>'.format(value, name)


@ -11,7 +11,7 @@ from pip._internal.exceptions import UnsupportedWheel
from pip._internal.metadata.pkg_resources import (
Distribution,
Environment,
WheelMetadata,
InMemoryMetadata,
)
pkg_resources = pytest.importorskip("pip._vendor.pkg_resources")
@ -99,7 +99,7 @@ def test_wheel_metadata_works() -> None:
dist = Distribution(
pkg_resources.DistInfoDistribution(
location="<in-memory>",
metadata=WheelMetadata({"METADATA": metadata.as_bytes()}, "<in-memory>"),
metadata=InMemoryMetadata({"METADATA": metadata.as_bytes()}, "<in-memory>"),
project_name=name,
),
)
@ -116,7 +116,7 @@ def test_wheel_metadata_works() -> None:
def test_wheel_metadata_throws_on_bad_unicode() -> None:
metadata = WheelMetadata({"METADATA": b"\xff"}, "<in-memory>")
metadata = InMemoryMetadata({"METADATA": b"\xff"}, "<in-memory>")
with pytest.raises(UnsupportedWheel) as e:
metadata.get_metadata("METADATA")


@ -11,13 +11,12 @@ from unittest import mock
import pytest
from pip._vendor import requests
from pip._vendor.packaging.requirements import Requirement
from pip._internal.exceptions import NetworkConnectionError
from pip._internal.index.collector import (
IndexContent,
LinkCollector,
_clean_link,
_clean_url_path,
_get_index_content,
_get_simple_response,
_make_index_content,
@ -28,7 +27,12 @@ from pip._internal.index.collector import (
from pip._internal.index.sources import _FlatDirectorySource, _IndexDirectorySource
from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.index import PyPI
from pip._internal.models.link import Link
from pip._internal.models.link import (
Link,
LinkHash,
_clean_url_path,
_ensure_quoted_url,
)
from pip._internal.network.session import PipSession
from tests.lib import TestData, make_test_link_collector
@ -402,13 +406,13 @@ def test_clean_url_path_with_local_path(path: str, expected: str) -> None:
),
],
)
def test_clean_link(url: str, clean_url: str) -> None:
assert _clean_link(url) == clean_url
def test_ensure_quoted_url(url: str, clean_url: str) -> None:
assert _ensure_quoted_url(url) == clean_url
def _test_parse_links_data_attribute(
anchor_html: str, attr: str, expected: Optional[str]
) -> None:
) -> Link:
html = (
"<!DOCTYPE html>"
'<html><head><meta charset="utf-8"><head>'
@ -427,6 +431,7 @@ def _test_parse_links_data_attribute(
(link,) = links
actual = getattr(link, attr)
assert actual == expected
return link
@pytest.mark.parametrize(
@ -454,6 +459,12 @@ def test_parse_links__requires_python(
_test_parse_links_data_attribute(anchor_html, "requires_python", expected)
# TODO: this test generates its own examples to validate the json client implementation
# instead of sharing those examples with the html client testing. We expect this won't
# hide any bugs because operations like resolving PEP 658 metadata should use the same
# code for both types of indices, but it might be nice to explicitly have all our tests
# in test_download.py execute over both html and json indices with
a pytest.mark.parametrize decorator to ensure nothing slips through the cracks.
def test_parse_links_json() -> None:
json_bytes = json.dumps(
{
@ -474,6 +485,14 @@ def test_parse_links_json() -> None:
"requires-python": ">=3.7",
"dist-info-metadata": False,
},
# Same as above, but parsing dist-info-metadata.
{
"filename": "holygrail-1.0-py3-none-any.whl",
"url": "/files/holygrail-1.0-py3-none-any.whl",
"hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"},
"requires-python": ">=3.7",
"dist-info-metadata": "sha512=aabdd41",
},
],
}
).encode("utf8")
@ -502,8 +521,25 @@ def test_parse_links_json() -> None:
yanked_reason=None,
hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
),
Link(
"https://example.com/files/holygrail-1.0-py3-none-any.whl",
comes_from=page.url,
requires_python=">=3.7",
yanked_reason=None,
hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
dist_info_metadata="sha512=aabdd41",
),
]
# Ensure the metadata info can be parsed into the correct link.
metadata_link = links[2].metadata_link()
assert metadata_link is not None
assert (
metadata_link.url
== "https://example.com/files/holygrail-1.0-py3-none-any.whl.metadata"
)
assert metadata_link.link_hash == LinkHash("sha512", "aabdd41")
@pytest.mark.parametrize(
"anchor_html, expected",
@ -534,6 +570,48 @@ def test_parse_links__yanked_reason(anchor_html: str, expected: Optional[str]) -
_test_parse_links_data_attribute(anchor_html, "yanked_reason", expected)
# Requirement objects do not == each other unless they point to the same instance!
_pkg1_requirement = Requirement("pkg1==1.0")
@pytest.mark.parametrize(
"anchor_html, expected, link_hash",
[
# Test not present.
(
'<a href="/pkg1-1.0.tar.gz"></a>',
None,
None,
),
# Test with value "true".
(
'<a href="/pkg1-1.0.tar.gz" data-dist-info-metadata="true"></a>',
"true",
None,
),
# Test with a provided hash value.
(
'<a href="/pkg1-1.0.tar.gz" data-dist-info-metadata="sha256=aa113592bbe"></a>', # noqa: E501
"sha256=aa113592bbe",
None,
),
# Test with a provided hash value for both the requirement as well as metadata.
(
'<a href="/pkg1-1.0.tar.gz#sha512=abc132409cb" data-dist-info-metadata="sha256=aa113592bbe"></a>', # noqa: E501
"sha256=aa113592bbe",
LinkHash("sha512", "abc132409cb"),
),
],
)
def test_parse_links__dist_info_metadata(
anchor_html: str,
expected: Optional[str],
link_hash: Optional[LinkHash],
) -> None:
link = _test_parse_links_data_attribute(anchor_html, "dist_info_metadata", expected)
assert link.link_hash == link_hash
def test_parse_links_caches_same_page_by_url() -> None:
html = (
"<!DOCTYPE html>"
@ -963,3 +1041,23 @@ def test_link_collector_create_find_links_expansion(
expected_temp2_dir = os.path.normcase(temp2_dir)
assert search_scope.find_links == ["~/temp1", expected_temp2_dir]
assert search_scope.index_urls == ["default_url"]
@pytest.mark.parametrize(
"url, result",
[
(
"https://pypi.org/pip-18.0.tar.gz#sha256=aa113592bbe",
LinkHash("sha256", "aa113592bbe"),
),
(
"https://pypi.org/pip-18.0.tar.gz#md5=aa113592bbe",
LinkHash("md5", "aa113592bbe"),
),
("https://pypi.org/pip-18.0.tar.gz", None),
# We don't recognize the "sha500" algorithm, so we discard it.
("https://pypi.org/pip-18.0.tar.gz#sha500=aa113592bbe", None),
],
)
def test_link_hash_parsing(url: str, result: Optional[LinkHash]) -> None:
assert LinkHash.split_hash_name_and_value(url) == result