From fc52fb3fb6a0a0cd67463e2595a8cfd1522b06e0 Mon Sep 17 00:00:00 2001
From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com>
Date: Mon, 31 Jul 2023 04:12:59 -0400
Subject: [PATCH 01/10] add test for new install --dry-run functionality (no
 downloading)

---
 tests/conftest.py                         |  32 ++-
 tests/functional/test_install_metadata.py | 236 ++++++++++++++++++++++
 2 files changed, 264 insertions(+), 4 deletions(-)
 create mode 100644 tests/functional/test_install_metadata.py

diff --git a/tests/conftest.py b/tests/conftest.py
index cd9931c66..08a6cf9fa 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -793,6 +793,9 @@ class FakePackage:
     requires_dist: Tuple[str, ...] = ()
     # This will override the Name specified in the actual dist's METADATA.
     metadata_name: Optional[str] = None
+    # Whether to delete the file this points to, which causes any attempt to fetch this
+    # package to fail unless it is processed as a metadata-only dist.
+    delete_linked_file: bool = False
 
     def metadata_filename(self) -> str:
         """This is specified by PEP 658."""
@@ -882,6 +885,27 @@ def fake_packages() -> Dict[str, List[FakePackage]]:
                 ("simple==1.0",),
             ),
         ],
+        "complex-dist": [
+            FakePackage(
+                "complex-dist",
+                "0.1",
+                "complex_dist-0.1-py2.py3-none-any.whl",
+                MetadataKind.Unhashed,
+                # Validate that the wheel isn't fetched if metadata is available and
+                # --dry-run is on, when the metadata presents no hash itself.
+                delete_linked_file=True,
+            ),
+        ],
+        "corruptwheel": [
+            FakePackage(
+                "corruptwheel",
+                "1.0",
+                "corruptwheel-1.0-py2.py3-none-any.whl",
+                # Validate that the wheel isn't fetched if metadata is available and
+                # --dry-run is on, when the metadata *does* present a hash.
+                MetadataKind.Sha256,
+            ),
+        ],
         "has-script": [
             # Ensure we check PEP 658 metadata hashing errors for wheel files.
             FakePackage(
@@ -967,10 +991,10 @@ def html_index_for_packages(
                 f'    <a href="{package_link.filename}" {package_link.generate_additional_tag()}>{package_link.filename}</a><br/>'  # noqa: E501
             )
             # (3.2) Copy over the corresponding file in `shared_data.packages`.
-            shutil.copy(
-                shared_data.packages / package_link.filename,
-                pkg_subdir / package_link.filename,
-            )
+            cached_file = shared_data.packages / package_link.filename
+            new_file = pkg_subdir / package_link.filename
+            if not package_link.delete_linked_file:
+                shutil.copy(cached_file, new_file)
             # (3.3) Write a metadata file, if applicable.
             if package_link.metadata != MetadataKind.NoFile:
                 with open(pkg_subdir / package_link.metadata_filename(), "wb") as f:
diff --git a/tests/functional/test_install_metadata.py b/tests/functional/test_install_metadata.py
new file mode 100644
index 000000000..32c8f53bb
--- /dev/null
+++ b/tests/functional/test_install_metadata.py
@@ -0,0 +1,236 @@
+import json
+import re
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterator, List, Tuple
+
+import pytest
+from pip._vendor.packaging.requirements import Requirement
+
+from pip._internal.models.direct_url import DirectUrl
+from pip._internal.utils.urls import path_to_url
+from tests.lib import (
+    PipTestEnvironment,
+    TestPipResult,
+)
+
+
+@pytest.fixture(scope="function")
+def install_with_generated_html_index(
+    script: PipTestEnvironment,
+    html_index_for_packages: Path,
+    tmpdir: Path,
+) -> Callable[..., Tuple[TestPipResult, Dict[str, Any]]]:
+    """Execute `pip install` against a generated PyPI index."""
+    output_file = tmpdir / "output_file.json"
+
+    def run_for_generated_index(
+        args: List[str],
+        *,
+        dry_run: bool = True,
+        allow_error: bool = False,
+    ) -> Tuple[TestPipResult, Dict[str, Any]]:
+        """
+        Produce a PyPI directory structure pointing to the specified packages, then
+        execute `pip install --report ... -i ...` pointing to our generated index.
+        """
+        pip_args = [
+            "install",
+            *(("--dry-run",) if dry_run else ()),
+            "--ignore-installed",
+            "--report",
+            str(output_file),
+            "-i",
+            path_to_url(str(html_index_for_packages)),
+            *args,
+        ]
+        result = script.pip(*pip_args, allow_error=allow_error)
+        try:
+            with open(output_file, "rb") as f:
+                report = json.load(f)
+        except FileNotFoundError:
+            if allow_error:
+                report = {}
+            else:
+                raise
+        return (result, report)
+
+    return run_for_generated_index
+
+
+def iter_dists(report: Dict[str, Any]) -> Iterator[Tuple[Requirement, DirectUrl]]:
+    """Parse a (req,url) tuple from each installed dist in the --report json."""
+    for inst in report["install"]:
+        metadata = inst["metadata"]
+        name = metadata["name"]
+        version = metadata["version"]
+        req = Requirement(f"{name}=={version}")
+        direct_url = DirectUrl.from_dict(inst["download_info"])
+        yield (req, direct_url)
+
+
+@pytest.mark.parametrize(
+    "requirement_to_install, expected_outputs",
+    [
+        ("simple2==1.0", ["simple2==1.0", "simple==1.0"]),
+        ("simple==2.0", ["simple==2.0"]),
+        (
+            "colander",
+            ["colander==0.9.9", "translationstring==1.1"],
+        ),
+        (
+            "compilewheel",
+            ["compilewheel==1.0", "simple==1.0"],
+        ),
+    ],
+)
+def test_install_with_metadata(
+    install_with_generated_html_index: Callable[
+        ..., Tuple[TestPipResult, Dict[str, Any]]
+    ],
+    requirement_to_install: str,
+    expected_outputs: List[str],
+) -> None:
+    """Verify that if a data-dist-info-metadata attribute is present, then it is used
+    instead of the actual dist's METADATA."""
+    _, report = install_with_generated_html_index(
+        [requirement_to_install],
+    )
+    installed = sorted(str(r) for r, _ in iter_dists(report))
+    assert installed == expected_outputs
+
+
+@pytest.mark.parametrize(
+    "requirement_to_install, real_hash",
+    [
+        (
+ "simple==3.0", + "95e0f200b6302989bcf2cead9465cf229168295ea330ca30d1ffeab5c0fed996", + ), + ( + "has-script", + "16ba92d7f6f992f6de5ecb7d58c914675cf21f57f8e674fb29dcb4f4c9507e5b", + ), + ], +) +def test_incorrect_metadata_hash( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], + requirement_to_install: str, + real_hash: str, +) -> None: + """Verify that if a hash for data-dist-info-metadata is provided, it must match the + actual hash of the metadata file.""" + result, _ = install_with_generated_html_index( + [requirement_to_install], + allow_error=True, + ) + assert result.returncode != 0 + expected_msg = f"""\ + Expected sha256 WRONG-HASH + Got {real_hash}""" + assert expected_msg in result.stderr + + +@pytest.mark.parametrize( + "requirement_to_install, expected_url", + [ + ("simple2==2.0", "simple2-2.0.tar.gz.metadata"), + ("priority", "priority-1.0-py2.py3-none-any.whl.metadata"), + ], +) +def test_metadata_not_found( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], + requirement_to_install: str, + expected_url: str, +) -> None: + """Verify that if a data-dist-info-metadata attribute is provided, that pip will + fetch the .metadata file at the location specified by PEP 658, and error + if unavailable.""" + result, _ = install_with_generated_html_index( + [requirement_to_install], + allow_error=True, + ) + assert result.returncode != 0 + expected_re = re.escape(expected_url) + pattern = re.compile( + f"ERROR: 404 Client Error: FileNotFoundError for url:.*{expected_re}" + ) + assert pattern.search(result.stderr), (pattern, result.stderr) + + +def test_produces_error_for_mismatched_package_name_in_metadata( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], +) -> None: + """Verify that the package name from the metadata matches the requested package.""" + result, _ = install_with_generated_html_index( + ["simple2==3.0"], + allow_error=True, + ) + assert result.returncode != 0 + assert ( + "simple2-3.0.tar.gz has inconsistent Name: expected 'simple2', but metadata " + "has 'not-simple2'" + ) in result.stdout + + +@pytest.mark.parametrize( + "requirement", + ( + "requires-simple-extra==0.1", + "REQUIRES_SIMPLE-EXTRA==0.1", + "REQUIRES....simple-_-EXTRA==0.1", + ), +) +def test_canonicalizes_package_name_before_verifying_metadata( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], + requirement: str, +) -> None: + """Verify that the package name from the command line and the package's + METADATA are both canonicalized before comparison, while the name from the METADATA + is always used verbatim to represent the installed candidate in --report. + + Regression test for https://github.com/pypa/pip/issues/12038 + """ + _, report = install_with_generated_html_index( + [requirement], + ) + reqs = [str(r) for r, _ in iter_dists(report)] + assert reqs == ["Requires_Simple.Extra==0.1"] + + +@pytest.mark.parametrize( + "requirement,err_string", + ( + # It's important that we verify pip won't even attempt to fetch the file, so we + # construct an input that will cause it to error if it tries at all. 
+ ("complex-dist==0.1", "404 Client Error: FileNotFoundError"), + ("corruptwheel==1.0", ".whl is invalid."), + ), +) +def test_dry_run_avoids_downloading_metadata_only_dists( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], + requirement: str, + err_string: str, +) -> None: + """Verify that the underlying dist files are not downloaded at all when + `install --dry-run` is used to resolve dists with PEP 658 metadata.""" + _, report = install_with_generated_html_index( + [requirement], + ) + assert [requirement] == list(str(r) for r, _ in iter_dists(report)) + result, _ = install_with_generated_html_index( + [requirement], + dry_run=False, + allow_error=True, + ) + assert result.returncode != 0 + assert err_string in result.stderr From 6ec7851bfa6bfda6b6a527b3758e4f0ab9c3daf2 Mon Sep 17 00:00:00 2001 From: Jonathan Helmus Date: Thu, 13 Oct 2022 15:09:01 -0500 Subject: [PATCH 02/10] use .metadata distribution info when possible When performing `install --dry-run` and PEP 658 .metadata files are available to guide the resolve, do not download the associated wheels. Rather use the distribution information directly from the .metadata files when reporting the results on the CLI and in the --report file. - describe the new --dry-run behavior - finalize linked requirements immediately after resolve - introduce is_concrete - funnel InstalledDistribution through _get_prepared_distribution() too --- news/12186.bugfix.rst | 1 + src/pip/_internal/commands/download.py | 4 +- src/pip/_internal/commands/install.py | 7 +- src/pip/_internal/commands/wheel.py | 4 +- src/pip/_internal/distributions/__init__.py | 5 + src/pip/_internal/distributions/base.py | 19 ++- src/pip/_internal/distributions/installed.py | 6 +- src/pip/_internal/distributions/sdist.py | 20 ++- src/pip/_internal/distributions/wheel.py | 4 +- src/pip/_internal/metadata/base.py | 21 +++ .../_internal/metadata/importlib/_dists.py | 17 +- src/pip/_internal/metadata/importlib/_envs.py | 8 +- src/pip/_internal/metadata/pkg_resources.py | 15 +- .../_internal/models/installation_report.py | 2 +- src/pip/_internal/operations/check.py | 5 +- src/pip/_internal/operations/prepare.py | 154 +++++++++++++----- src/pip/_internal/req/req_install.py | 84 +++++++--- src/pip/_internal/req/req_set.py | 4 +- .../resolution/resolvelib/resolver.py | 5 - .../metadata/test_metadata_pkg_resources.py | 1 + tests/unit/test_req.py | 2 + 21 files changed, 287 insertions(+), 101 deletions(-) create mode 100644 news/12186.bugfix.rst diff --git a/news/12186.bugfix.rst b/news/12186.bugfix.rst new file mode 100644 index 000000000..b51d84a0c --- /dev/null +++ b/news/12186.bugfix.rst @@ -0,0 +1 @@ +Avoid downloading any dists in ``install --dry-run`` if PEP 658 ``.metadata`` files or lazy wheels are available. 
diff --git a/src/pip/_internal/commands/download.py b/src/pip/_internal/commands/download.py
index 54247a78a..06d062f8b 100644
--- a/src/pip/_internal/commands/download.py
+++ b/src/pip/_internal/commands/download.py
@@ -130,6 +130,9 @@ class DownloadCommand(RequirementCommand):
         self.trace_basic_info(finder)
 
         requirement_set = resolver.resolve(reqs, check_supported_wheels=True)
+        preparer.finalize_linked_requirements(
+            requirement_set.requirements.values(), hydrate_virtual_reqs=True
+        )
 
         downloaded: List[str] = []
         for req in requirement_set.requirements.values():
@@ -138,7 +141,6 @@ class DownloadCommand(RequirementCommand):
                 preparer.save_linked_requirement(req)
                 downloaded.append(req.name)
 
-        preparer.prepare_linked_requirements_more(requirement_set.requirements.values())
         requirement_set.warn_legacy_versions_and_specifiers()
 
         if downloaded:
diff --git a/src/pip/_internal/commands/install.py b/src/pip/_internal/commands/install.py
index f6a300804..296624a8a 100644
--- a/src/pip/_internal/commands/install.py
+++ b/src/pip/_internal/commands/install.py
@@ -84,7 +84,8 @@ class InstallCommand(RequirementCommand):
             help=(
                 "Don't actually install anything, just print what would be. "
                 "Can be used in combination with --ignore-installed "
-                "to 'resolve' the requirements."
+                "to 'resolve' the requirements. If PEP 658 or fast-deps metadata is "
+                "available, --dry-run also avoids downloading the dependency at all."
             ),
         )
         self.cmd_opts.add_option(
@@ -377,6 +378,10 @@ class InstallCommand(RequirementCommand):
             requirement_set = resolver.resolve(
                 reqs, check_supported_wheels=not options.target_dir
             )
+            preparer.finalize_linked_requirements(
+                requirement_set.requirements.values(),
+                hydrate_virtual_reqs=not options.dry_run,
+            )
 
             if options.json_report_file:
                 report = InstallationReport(requirement_set.requirements_to_install)
diff --git a/src/pip/_internal/commands/wheel.py b/src/pip/_internal/commands/wheel.py
index ed578aa25..af56bc4d1 100644
--- a/src/pip/_internal/commands/wheel.py
+++ b/src/pip/_internal/commands/wheel.py
@@ -145,6 +145,9 @@ class WheelCommand(RequirementCommand):
         self.trace_basic_info(finder)
 
         requirement_set = resolver.resolve(reqs, check_supported_wheels=True)
+        preparer.finalize_linked_requirements(
+            requirement_set.requirements.values(), hydrate_virtual_reqs=True
+        )
 
         reqs_to_build: List[InstallRequirement] = []
         for req in requirement_set.requirements.values():
@@ -153,7 +156,6 @@ class WheelCommand(RequirementCommand):
             elif should_build_for_wheel_command(req):
                 reqs_to_build.append(req)
 
-        preparer.prepare_linked_requirements_more(requirement_set.requirements.values())
         requirement_set.warn_legacy_versions_and_specifiers()
 
         # build wheels
diff --git a/src/pip/_internal/distributions/__init__.py b/src/pip/_internal/distributions/__init__.py
index 9a89a838b..f6089daf4 100644
--- a/src/pip/_internal/distributions/__init__.py
+++ b/src/pip/_internal/distributions/__init__.py
@@ -1,4 +1,5 @@
 from pip._internal.distributions.base import AbstractDistribution
+from pip._internal.distributions.installed import InstalledDistribution
 from pip._internal.distributions.sdist import SourceDistribution
 from pip._internal.distributions.wheel import WheelDistribution
 from pip._internal.req.req_install import InstallRequirement
@@ -8,6 +9,10 @@ def make_distribution_for_install_requirement(
     install_req: InstallRequirement,
 ) -> AbstractDistribution:
     """Returns a Distribution for the given InstallRequirement"""
+    # Only pre-installed requirements will have a .satisfied_by dist.
+    if install_req.satisfied_by:
+        return InstalledDistribution(install_req)
+
     # Editable requirements will always be source distributions. They use the
     # legacy logic until we create a modern standard for them.
     if install_req.editable:
diff --git a/src/pip/_internal/distributions/base.py b/src/pip/_internal/distributions/base.py
index 6fb0d7b77..60d1b65cf 100644
--- a/src/pip/_internal/distributions/base.py
+++ b/src/pip/_internal/distributions/base.py
@@ -35,11 +35,17 @@ class AbstractDistribution(metaclass=abc.ABCMeta):
 
         If None, then this dist has no work to do in the build tracker, and
         ``.prepare_distribution_metadata()`` will not be called."""
-        raise NotImplementedError()
+        ...
 
     @abc.abstractmethod
     def get_metadata_distribution(self) -> BaseDistribution:
-        raise NotImplementedError()
+        """Generate a concrete ``BaseDistribution`` instance for this artifact.
+
+        The implementation should also cache the result with
+        ``self.req.cache_concrete_dist()`` so the distribution is available to other
+        users of the ``InstallRequirement``. This method is not called within the build
+        tracker context, so it should not identify any new setup requirements."""
+        ...
 
     @abc.abstractmethod
     def prepare_distribution_metadata(
@@ -48,4 +54,11 @@ class AbstractDistribution(metaclass=abc.ABCMeta):
         build_isolation: bool,
         check_build_deps: bool,
     ) -> None:
-        raise NotImplementedError()
+        """Generate the information necessary to extract metadata from the artifact.
+
+        This method will be executed within the context of ``BuildTracker#track()``, so
+        it needs to fully identify any setup requirements so they can be added to the
+        same active set of tracked builds, while ``.get_metadata_distribution()`` takes
+        care of generating and caching the ``BaseDistribution`` to expose to the rest of
+        the resolve."""
+        ...
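As a sketch of how an implementation satisfies this two-phase contract (this mirrors the SourceDistribution changes below; the class itself is hypothetical):

    from typing import Optional

    from pip._internal.distributions.base import AbstractDistribution
    from pip._internal.index.package_finder import PackageFinder
    from pip._internal.metadata import BaseDistribution, get_directory_distribution

    class ExampleDistribution(AbstractDistribution):
        """Hypothetical subclass illustrating the contract above."""

        @property
        def build_tracker_id(self) -> Optional[str]:
            assert self.req.link
            return self.req.link.url_without_fragment

        def prepare_distribution_metadata(
            self,
            finder: PackageFinder,
            build_isolation: bool,
            check_build_deps: bool,
        ) -> None:
            # Runs inside BuildTracker#track(): any setup requirements discovered
            # here join the same active set of tracked builds.
            self.req.prepare_metadata_directory()

        def get_metadata_distribution(self) -> BaseDistribution:
            # Runs outside the tracker: generate the dist, then cache it on the
            # InstallRequirement so the rest of the resolve can see it.
            dist = get_directory_distribution(self.req.metadata_directory)
            self.req.cache_concrete_dist(dist)
            return dist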
diff --git a/src/pip/_internal/distributions/installed.py b/src/pip/_internal/distributions/installed.py
index ab8d53be7..f1a748289 100644
--- a/src/pip/_internal/distributions/installed.py
+++ b/src/pip/_internal/distributions/installed.py
@@ -17,8 +17,10 @@ class InstalledDistribution(AbstractDistribution):
         return None
 
     def get_metadata_distribution(self) -> BaseDistribution:
-        assert self.req.satisfied_by is not None, "not actually installed"
-        return self.req.satisfied_by
+        dist = self.req.satisfied_by
+        assert dist is not None, "not actually installed"
+        self.req.cache_concrete_dist(dist)
+        return dist
 
     def prepare_distribution_metadata(
         self,
diff --git a/src/pip/_internal/distributions/sdist.py b/src/pip/_internal/distributions/sdist.py
index 15ff42b7b..a7b16f6f2 100644
--- a/src/pip/_internal/distributions/sdist.py
+++ b/src/pip/_internal/distributions/sdist.py
@@ -1,11 +1,11 @@
 import logging
-from typing import Iterable, Optional, Set, Tuple
+from typing import Iterable, Set, Tuple
 
 from pip._internal.build_env import BuildEnvironment
 from pip._internal.distributions.base import AbstractDistribution
 from pip._internal.exceptions import InstallationError
 from pip._internal.index.package_finder import PackageFinder
-from pip._internal.metadata import BaseDistribution
+from pip._internal.metadata import BaseDistribution, get_directory_distribution
 from pip._internal.utils.subprocess import runner_with_spinner_message
 
 logger = logging.getLogger(__name__)
@@ -19,13 +19,19 @@ class SourceDistribution(AbstractDistribution):
     """
 
     @property
-    def build_tracker_id(self) -> Optional[str]:
+    def build_tracker_id(self) -> str:
         """Identify this requirement uniquely by its link."""
         assert self.req.link
         return self.req.link.url_without_fragment
 
     def get_metadata_distribution(self) -> BaseDistribution:
-        return self.req.get_dist()
+        assert (
+            self.req.metadata_directory
+        ), "Set as part of .prepare_distribution_metadata()"
+        dist = get_directory_distribution(self.req.metadata_directory)
+        self.req.cache_concrete_dist(dist)
+        self.req.validate_sdist_metadata()
+        return dist
 
     def prepare_distribution_metadata(
         self,
@@ -64,7 +70,11 @@ class SourceDistribution(AbstractDistribution):
                 self._raise_conflicts("the backend dependencies", conflicting)
             if missing:
                 self._raise_missing_reqs(missing)
-        self.req.prepare_metadata()
+
+        # NB: we must still call .cache_concrete_dist() and .validate_sdist_metadata()
+        # before the InstallRequirement itself has been updated with the metadata from
+        # this directory!
+        self.req.prepare_metadata_directory()
 
     def _prepare_build_backend(self, finder: PackageFinder) -> None:
         # Isolate in a BuildEnvironment and install the build-time
diff --git a/src/pip/_internal/distributions/wheel.py b/src/pip/_internal/distributions/wheel.py
index eb16e25cb..e27306015 100644
--- a/src/pip/_internal/distributions/wheel.py
+++ b/src/pip/_internal/distributions/wheel.py
@@ -29,7 +29,9 @@ class WheelDistribution(AbstractDistribution):
         assert self.req.local_file_path, "Set as part of preparation during download"
         assert self.req.name, "Wheels are never unnamed"
         wheel = FilesystemWheel(self.req.local_file_path)
-        return get_wheel_distribution(wheel, canonicalize_name(self.req.name))
+        dist = get_wheel_distribution(wheel, canonicalize_name(self.req.name))
+        self.req.cache_concrete_dist(dist)
+        return dist
 
     def prepare_distribution_metadata(
         self,
diff --git a/src/pip/_internal/metadata/base.py b/src/pip/_internal/metadata/base.py
index cafb79fb3..b92d6a4dd 100644
--- a/src/pip/_internal/metadata/base.py
+++ b/src/pip/_internal/metadata/base.py
@@ -105,6 +105,15 @@ class RequiresEntry(NamedTuple):
 
 
 class BaseDistribution(Protocol):
+    @property
+    def is_concrete(self) -> bool:
+        """Whether the distribution really exists somewhere on disk.
+
+        If this is false, it has been synthesized from metadata, e.g. via
+        ``.from_metadata_file_contents()``, or ``.from_wheel()`` against
+        a ``MemoryWheel``."""
+        raise NotImplementedError()
+
     @classmethod
     def from_directory(cls, directory: str) -> "BaseDistribution":
         """Load the distribution from a metadata directory.
@@ -667,6 +676,10 @@ class BaseEnvironment:
 class Wheel(Protocol):
     location: str
 
+    @property
+    def is_concrete(self) -> bool:
+        raise NotImplementedError()
+
     def as_zipfile(self) -> zipfile.ZipFile:
         raise NotImplementedError()
 
@@ -675,6 +688,10 @@ class FilesystemWheel(Wheel):
     def __init__(self, location: str) -> None:
         self.location = location
 
+    @property
+    def is_concrete(self) -> bool:
+        return True
+
     def as_zipfile(self) -> zipfile.ZipFile:
         return zipfile.ZipFile(self.location, allowZip64=True)
 
@@ -684,5 +701,9 @@ class MemoryWheel(Wheel):
         self.location = location
         self.stream = stream
 
+    @property
+    def is_concrete(self) -> bool:
+        return False
+
     def as_zipfile(self) -> zipfile.ZipFile:
         return zipfile.ZipFile(self.stream, allowZip64=True)
diff --git a/src/pip/_internal/metadata/importlib/_dists.py b/src/pip/_internal/metadata/importlib/_dists.py
index 65c043c87..deeab679d 100644
--- a/src/pip/_internal/metadata/importlib/_dists.py
+++ b/src/pip/_internal/metadata/importlib/_dists.py
@@ -99,16 +99,22 @@ class Distribution(BaseDistribution):
         dist: importlib.metadata.Distribution,
         info_location: Optional[BasePath],
         installed_location: Optional[BasePath],
+        concrete: bool,
     ) -> None:
         self._dist = dist
         self._info_location = info_location
         self._installed_location = installed_location
+        self._concrete = concrete
+
+    @property
+    def is_concrete(self) -> bool:
+        return self._concrete
 
     @classmethod
     def from_directory(cls, directory: str) -> BaseDistribution:
         info_location = pathlib.Path(directory)
         dist = importlib.metadata.Distribution.at(info_location)
-        return cls(dist, info_location, info_location.parent)
+        return cls(dist, info_location, info_location.parent, concrete=True)
 
     @classmethod
     def from_metadata_file_contents(
@@ -125,7 +131,7 @@ class Distribution(BaseDistribution):
             metadata_path.write_bytes(metadata_contents)
         # Construct dist pointing to the newly created directory.
         dist = importlib.metadata.Distribution.at(metadata_path.parent)
-        return cls(dist, metadata_path.parent, None)
+        return cls(dist, metadata_path.parent, None, concrete=False)
 
     @classmethod
     def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
@@ -136,7 +142,12 @@ class Distribution(BaseDistribution):
             raise InvalidWheel(wheel.location, name) from e
         except UnsupportedWheel as e:
             raise UnsupportedWheel(f"{name} has an invalid wheel, {e}")
-        return cls(dist, dist.info_location, pathlib.PurePosixPath(wheel.location))
+        return cls(
+            dist,
+            dist.info_location,
+            pathlib.PurePosixPath(wheel.location),
+            concrete=wheel.is_concrete,
+        )
 
     @property
     def location(self) -> Optional[str]:
diff --git a/src/pip/_internal/metadata/importlib/_envs.py b/src/pip/_internal/metadata/importlib/_envs.py
index 3850ddaf4..d4d0ff59d 100644
--- a/src/pip/_internal/metadata/importlib/_envs.py
+++ b/src/pip/_internal/metadata/importlib/_envs.py
@@ -81,7 +81,7 @@ class _DistributionFinder:
             installed_location: Optional[BasePath] = None
         else:
             installed_location = info_location.parent
-        yield Distribution(dist, info_location, installed_location)
+        yield Distribution(dist, info_location, installed_location, concrete=True)
 
     def find_linked(self, location: str) -> Iterator[BaseDistribution]:
         """Read location in egg-link files and return distributions in there.
@@ -105,7 +105,7 @@ class _DistributionFinder:
                 continue
             target_location = str(path.joinpath(target_rel))
             for dist, info_location in self._find_impl(target_location):
-                yield Distribution(dist, info_location, path)
+                yield Distribution(dist, info_location, path, concrete=True)
 
     def _find_eggs_in_dir(self, location: str) -> Iterator[BaseDistribution]:
         from pip._vendor.pkg_resources import find_distributions
@@ -117,7 +117,7 @@ class _DistributionFinder:
             if not entry.name.endswith(".egg"):
                 continue
             for dist in find_distributions(entry.path):
-                yield legacy.Distribution(dist)
+                yield legacy.Distribution(dist, concrete=True)
 
     def _find_eggs_in_zip(self, location: str) -> Iterator[BaseDistribution]:
         from pip._vendor.pkg_resources import find_eggs_in_zip
@@ -129,7 +129,7 @@ class _DistributionFinder:
         except zipimport.ZipImportError:
             return
         for dist in find_eggs_in_zip(importer, location):
-            yield legacy.Distribution(dist)
+            yield legacy.Distribution(dist, concrete=True)
 
     def find_eggs(self, location: str) -> Iterator[BaseDistribution]:
         """Find eggs in a location.
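The upshot of threading concrete=True through every on-disk discovery path above: anything found in an installed environment reports is_concrete. A sketch using pip's existing metadata API (for illustration; not part of this change):

    from pip._internal.metadata import get_default_environment

    dist = get_default_environment().get_distribution("pip")
    if dist is not None:
        assert dist.is_concrete  # installed dists always have a filesystem location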
diff --git a/src/pip/_internal/metadata/pkg_resources.py b/src/pip/_internal/metadata/pkg_resources.py
index f330ef12a..9451dbd30 100644
--- a/src/pip/_internal/metadata/pkg_resources.py
+++ b/src/pip/_internal/metadata/pkg_resources.py
@@ -69,8 +69,13 @@ class InMemoryMetadata:
 
 
 class Distribution(BaseDistribution):
-    def __init__(self, dist: pkg_resources.Distribution) -> None:
+    def __init__(self, dist: pkg_resources.Distribution, concrete: bool) -> None:
         self._dist = dist
+        self._concrete = concrete
+
+    @property
+    def is_concrete(self) -> bool:
+        return self._concrete
 
     @classmethod
     def from_directory(cls, directory: str) -> BaseDistribution:
@@ -90,7 +95,7 @@ class Distribution(BaseDistribution):
             dist_name = os.path.splitext(dist_dir_name)[0].split("-")[0]
 
         dist = dist_cls(base_dir, project_name=dist_name, metadata=metadata)
-        return cls(dist)
+        return cls(dist, concrete=True)
 
     @classmethod
     def from_metadata_file_contents(
@@ -107,7 +112,7 @@ class Distribution(BaseDistribution):
             metadata=InMemoryMetadata(metadata_dict, filename),
             project_name=project_name,
         )
-        return cls(dist)
+        return cls(dist, concrete=False)
 
     @classmethod
     def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
@@ -128,7 +133,7 @@ class Distribution(BaseDistribution):
             metadata=InMemoryMetadata(metadata_dict, wheel.location),
             project_name=name,
         )
-        return cls(dist)
+        return cls(dist, concrete=wheel.is_concrete)
 
     @property
     def location(self) -> Optional[str]:
@@ -233,7 +238,7 @@ class Environment(BaseEnvironment):
 
     def _iter_distributions(self) -> Iterator[BaseDistribution]:
         for dist in self._ws:
-            yield Distribution(dist)
+            yield Distribution(dist, concrete=True)
 
     def _search_distribution(self, name: str) -> Optional[BaseDistribution]:
         """Find a distribution matching the ``name`` in the environment.
diff --git a/src/pip/_internal/models/installation_report.py b/src/pip/_internal/models/installation_report.py
index 7f001f35e..99ed31f15 100644
--- a/src/pip/_internal/models/installation_report.py
+++ b/src/pip/_internal/models/installation_report.py
@@ -29,7 +29,7 @@ class InstallationReport:
             "requested": ireq.user_supplied,
             # PEP 566 json encoding for metadata
             # https://www.python.org/dev/peps/pep-0566/#json-compatible-metadata
-            "metadata": ireq.get_dist().metadata_dict,
+            "metadata": ireq.cached_dist.metadata_dict,
         }
         if ireq.user_supplied and ireq.extras:
             # For top level requirements, the list of requested extras, if any.
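The --report JSON that this feeds is consumed by the tests from patch 01 roughly like so (a trimmed sketch; "report.json" is a placeholder path):

    import json

    with open("report.json", "rb") as f:
        report = json.load(f)
    for inst in report["install"]:
        name = inst["metadata"]["name"]
        version = inst["metadata"]["version"]
        url = inst["download_info"]["url"]
        print(f"{name}=={version} resolved from {url}")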
diff --git a/src/pip/_internal/operations/check.py b/src/pip/_internal/operations/check.py
index 261045922..e742b7d40 100644
--- a/src/pip/_internal/operations/check.py
+++ b/src/pip/_internal/operations/check.py
@@ -9,7 +9,6 @@ from pip._vendor.packaging.specifiers import LegacySpecifier
 from pip._vendor.packaging.utils import NormalizedName, canonicalize_name
 from pip._vendor.packaging.version import LegacyVersion
 
-from pip._internal.distributions import make_distribution_for_install_requirement
 from pip._internal.metadata import get_default_environment
 from pip._internal.metadata.base import DistributionVersion
 from pip._internal.req.req_install import InstallRequirement
@@ -127,8 +126,8 @@ def _simulate_installation_of(
 
     # Modify it as installing requirement_set would (assuming no errors)
     for inst_req in to_install:
-        abstract_dist = make_distribution_for_install_requirement(inst_req)
-        dist = abstract_dist.get_metadata_distribution()
+        assert inst_req.is_concrete
+        dist = inst_req.cached_dist
         name = dist.canonical_name
         package_set[name] = PackageDetails(dist.version, list(dist.iter_dependencies()))
 
diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py
index 1b32d7eec..2cfc582dd 100644
--- a/src/pip/_internal/operations/prepare.py
+++ b/src/pip/_internal/operations/prepare.py
@@ -13,7 +13,6 @@ from typing import Dict, Iterable, List, Optional
 from pip._vendor.packaging.utils import canonicalize_name
 
 from pip._internal.distributions import make_distribution_for_install_requirement
-from pip._internal.distributions.installed import InstalledDistribution
 from pip._internal.exceptions import (
     DirectoryUrlHashUnsupported,
     HashMismatch,
@@ -24,7 +23,10 @@ from pip._internal.exceptions import (
     VcsHashUnsupported,
 )
 from pip._internal.index.package_finder import PackageFinder
-from pip._internal.metadata import BaseDistribution, get_metadata_distribution
+from pip._internal.metadata import (
+    BaseDistribution,
+    get_metadata_distribution,
+)
 from pip._internal.models.direct_url import ArchiveInfo
 from pip._internal.models.link import Link
 from pip._internal.models.wheel import Wheel
@@ -187,6 +189,8 @@ def _check_download_dir(
 ) -> Optional[str]:
     """Check download_dir for previously downloaded file with correct hash
     If a correct file is found return its path else None
+
+    If a file is found at the given path, but with an invalid hash, the file is deleted.
     """
     download_path = os.path.join(download_dir, link.filename)
 
@@ -517,41 +521,92 @@ class RequirementPreparer:
             # The file is not available, attempt to fetch only metadata
             metadata_dist = self._fetch_metadata_only(req)
             if metadata_dist is not None:
-                req.needs_more_preparation = True
+                # These reqs now have the dependency information from the downloaded
+                # metadata, without having downloaded the actual dist at all.
+                req.cache_virtual_metadata_only_dist(metadata_dist)
                 return metadata_dist
 
         # None of the optimizations worked, fully prepare the requirement
         return self._prepare_linked_requirement(req, parallel_builds)
 
-    def prepare_linked_requirements_more(
-        self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False
-    ) -> None:
-        """Prepare linked requirements more, if needed."""
-        reqs = [req for req in reqs if req.needs_more_preparation]
+    def _ensure_download_info(self, reqs: Iterable[InstallRequirement]) -> None:
+        """
+        `pip install --report` extracts the download info from each requirement for its
+        JSON output, so we need to make sure every requirement has this before finishing
+        the resolve. But by the time this method is called, .download_info will only
+        have been populated for requirements already found in the wheel cache, so we
+        need to synthesize it for uncached results. Luckily, a DirectUrl can be parsed
+        directly from a url without any other context. However, this also means the
+        download info will only contain a hash if the link itself declares the hash.
+        """
         for req in reqs:
+            self._populate_download_info(req)
+
+    def _force_fully_prepared(
+        self, reqs: Iterable[InstallRequirement], require_concrete: bool
+    ) -> None:
+        """
+        The legacy resolver prepares requirements through several distinct code paths
+        and can leave them half-done in some of them. Until that resolver is removed,
+        set the flags it expects here so every requirement ends up in a consistent
+        state.
+        """
+        for req in reqs:
+            req.prepared = True
+            if require_concrete:
+                assert req.is_concrete
+
+    def finalize_linked_requirements(
+        self,
+        reqs: Iterable[InstallRequirement],
+        hydrate_virtual_reqs: bool,
+        parallel_builds: bool = False,
+    ) -> None:
+        """Prepare linked requirements more, if needed.
+
+        Neighboring .metadata files as per PEP 658 or lazy wheels via fast-deps will be
+        preferred to extract metadata from any concrete requirement (one that has been
+        mapped to a Link) without downloading the underlying wheel or sdist. When ``pip
+        install --dry-run`` is called, we want to avoid ever downloading the underlying
+        dist, but we still need to provide all of the results that pip commands expect
+        from the typical resolve process.
+
+        Those expectations vary, but one distinction lies in whether the command needs
+        an actual physical dist somewhere on the filesystem, or just the metadata about
+        it from the resolver (as in ``pip install --report``). If the command requires
+        actual physical filesystem locations for the resolved dists, it must call this
+        method with ``hydrate_virtual_reqs=True`` to fully download anything
+        that remains.
+        """
+        if not hydrate_virtual_reqs:
+            self._ensure_download_info(reqs)
+            self._force_fully_prepared(reqs, require_concrete=False)
+            return
+
+        partially_downloaded_reqs: List[InstallRequirement] = []
+        for req in reqs:
+            if req.is_concrete:
+                continue
             # Determine if any of these requirements were already downloaded.
             if self.download_dir is not None and req.link.is_wheel:
                 hashes = self._get_linked_req_hashes(req)
-                file_path = _check_download_dir(req.link, self.download_dir, hashes)
+                # If the file is there, but doesn't match the hash, delete it and print
+                # a warning. We will be downloading it again via
+                # partially_downloaded_reqs.
+                file_path = _check_download_dir(
+                    req.link, self.download_dir, hashes, warn_on_hash_mismatch=True
+                )
                 if file_path is not None:
+                    # If the hash does match, then we still need to generate a concrete
+                    # dist, but we don't have to download the wheel again.
                    self._downloaded[req.link.url] = file_path
-                    req.needs_more_preparation = False
+            partially_downloaded_reqs.append(req)
 
-        # Prepare requirements we found were already downloaded for some
-        # reason. The other downloads will be completed separately.
-        partially_downloaded_reqs: List[InstallRequirement] = []
-        for req in reqs:
-            if req.needs_more_preparation:
-                partially_downloaded_reqs.append(req)
-            else:
-                self._prepare_linked_requirement(req, parallel_builds)
-
-        # TODO: separate this part out from RequirementPreparer when the v1
-        # resolver can be removed!
         self._complete_partial_requirements(
             partially_downloaded_reqs,
             parallel_builds=parallel_builds,
         )
+        # NB: Must call this method before returning!
+        self._force_fully_prepared(reqs, require_concrete=True)
 
     def _prepare_linked_requirement(
         self, req: InstallRequirement, parallel_builds: bool
@@ -611,31 +666,13 @@ class RequirementPreparer:
                 hashes.check_against_path(file_path)
             local_file = File(file_path, content_type=None)
 
-        # If download_info is set, we got it from the wheel cache.
-        if req.download_info is None:
-            # Editables don't go through this function (see
-            # prepare_editable_requirement).
-            assert not req.editable
-            req.download_info = direct_url_from_link(link, req.source_dir)
-            # Make sure we have a hash in download_info. If we got it as part of the
-            # URL, it will have been verified and we can rely on it. Otherwise we
-            # compute it from the downloaded file.
-            # FIXME: https://github.com/pypa/pip/issues/11943
-            if (
-                isinstance(req.download_info.info, ArchiveInfo)
-                and not req.download_info.info.hashes
-                and local_file
-            ):
-                hash = hash_file(local_file.path)[0].hexdigest()
-                # We populate info.hash for backward compatibility.
-                # This will automatically populate info.hashes.
-                req.download_info.info.hash = f"sha256={hash}"
-
         # For use in later processing,
         # preserve the file path on the requirement.
         if local_file:
             req.local_file_path = local_file.path
 
+        self._populate_download_info(req)
+
         dist = _get_prepared_distribution(
             req,
             self.build_tracker,
@@ -645,9 +682,31 @@ class RequirementPreparer:
         )
         return dist
 
+    def _populate_download_info(self, req: InstallRequirement) -> None:
+        # If download_info is set, we got it from the wheel cache.
+        if req.download_info is None:
+            # Editables don't go through this function (see
+            # prepare_editable_requirement).
+            assert not req.editable
+            req.download_info = direct_url_from_link(req.link, req.source_dir)
+            # Make sure we have a hash in download_info. If we got it as part of the
+            # URL, it will have been verified and we can rely on it. Otherwise we
+            # compute it from the downloaded file.
+            # FIXME: https://github.com/pypa/pip/issues/11943
+            if (
+                isinstance(req.download_info.info, ArchiveInfo)
+                and not req.download_info.info.hashes
+                and req.local_file_path
+            ):
+                hash = hash_file(req.local_file_path)[0].hexdigest()
+                # We populate info.hash for backward compatibility.
+                # This will automatically populate info.hashes.
+                req.download_info.info.hash = f"sha256={hash}"
+
     def save_linked_requirement(self, req: InstallRequirement) -> None:
         assert self.download_dir is not None
         assert req.link is not None
+        assert req.is_concrete
         link = req.link
         if link.is_vcs or (link.is_existing_dir() and req.editable):
             # Make a .zip of the source_dir we already created.
@@ -702,6 +761,8 @@ class RequirementPreparer:
 
         req.check_if_exists(self.use_user_site)
 
+        # This should already have been populated by the preparation of the source dist.
+        assert req.is_concrete
         return dist
 
     def prepare_installed_requirement(
@@ -726,4 +787,13 @@ class RequirementPreparer:
                     "completely repeatable environment, install into an "
                     "empty virtualenv."
                 )
-        return InstalledDistribution(req).get_metadata_distribution()
+        dist = _get_prepared_distribution(
+            req,
+            self.build_tracker,
+            self.finder,
+            self.build_isolation,
+            self.check_build_deps,
+        )
+
+        assert req.is_concrete
+        return dist
diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py
index 8110114ca..0274e8832 100644
--- a/src/pip/_internal/req/req_install.py
+++ b/src/pip/_internal/req/req_install.py
@@ -23,10 +23,7 @@ from pip._internal.locations import get_scheme
 from pip._internal.metadata import (
     BaseDistribution,
     get_default_environment,
-    get_directory_distribution,
-    get_wheel_distribution,
 )
-from pip._internal.metadata.base import FilesystemWheel
 from pip._internal.models.direct_url import DirectUrl
 from pip._internal.models.link import Link
 from pip._internal.operations.build.metadata import generate_metadata
@@ -149,6 +146,7 @@ class InstallRequirement:
         self.hash_options = hash_options if hash_options else {}
         self.config_settings = config_settings
         # Set to True after successful preparation of this requirement
+        # TODO: this is only used in the legacy resolver: remove this!
         self.prepared = False
         # User supplied requirement are explicitly requested for installation
         # by the user via CLI arguments or requirements files, as opposed to,
@@ -180,8 +178,13 @@ class InstallRequirement:
         # but after loading this flag should be treated as read only.
         self.use_pep517 = use_pep517
 
-        # This requirement needs more preparation before it can be built
-        self.needs_more_preparation = False
+        # When a dist is computed for this requirement, cache it here so it's visible
+        # everywhere within pip and isn't computed more than once. This may be
+        # a "virtual" dist without a physical location on the filesystem, or
+        # a "concrete" dist which has been fully downloaded.
+        self._cached_dist: Optional[BaseDistribution] = None
+        # Strictly used in testing: allow calling .cache_concrete_dist() twice.
+        self.allow_concrete_dist_overwrite = False
 
         # This requirement needs to be unpacked before it can be installed.
         self._archive_source: Optional[Path] = None
@@ -233,7 +236,7 @@ class InstallRequirement:
             return None
         return self.req.name
 
-    @functools.lru_cache()  # use cached_property in python 3.8+
+    @functools.lru_cache(maxsize=None)  # TODO: use cached_property in python 3.8+
     def supports_pyproject_editable(self) -> bool:
         if not self.use_pep517:
             return False
@@ -546,11 +549,11 @@ class InstallRequirement:
             f"Consider using a build backend that supports PEP 660."
         )
 
-    def prepare_metadata(self) -> None:
+    def prepare_metadata_directory(self) -> None:
         """Ensure that project metadata is available.
 
-        Under PEP 517 and PEP 660, call the backend hook to prepare the metadata.
-        Under legacy processing, call setup.py egg-info.
+        Under PEP 517 and PEP 660, call the backend hook to prepare the metadata
+        directory. Under legacy processing, call setup.py egg-info.
         """
         assert self.source_dir, f"No source dir for {self}"
         details = self.name or f"from {self.link}"
@@ -582,6 +585,8 @@ class InstallRequirement:
                 details=details,
             )
 
+    def validate_sdist_metadata(self) -> None:
+        """Ensure that we have a dist, and ensure it corresponds to expectations."""
         # Act on the newly generated metadata, based on the name and version.
         if not self.name:
             self._set_requirement()
         else:
             self.warn_on_mismatching_name()
 
         self.assert_source_matches_version()
 
@@ -592,24 +597,59 @@ class InstallRequirement:
     @property
     def metadata(self) -> Any:
+        # TODO: use cached_property in python 3.8+
         if not hasattr(self, "_metadata"):
-            self._metadata = self.get_dist().metadata
+            self._metadata = self.cached_dist.metadata
 
         return self._metadata
 
-    def get_dist(self) -> BaseDistribution:
-        if self.metadata_directory:
-            return get_directory_distribution(self.metadata_directory)
-        elif self.local_file_path and self.is_wheel:
-            assert self.req is not None
-            return get_wheel_distribution(
-                FilesystemWheel(self.local_file_path),
-                canonicalize_name(self.req.name),
-            )
-        raise AssertionError(
-            f"InstallRequirement {self} has no metadata directory and no wheel: "
-            f"can't make a distribution."
-        )
+    @property
+    def cached_dist(self) -> BaseDistribution:
+        """Retrieve the dist resolved from this requirement.
+
+        :raises AssertionError: if the resolver has not yet been executed.
+        """
+        if self._cached_dist is None:
+            raise AssertionError(
+                f"InstallRequirement {self} has no dist; "
+                "ensure the resolver has been executed"
+            )
+        return self._cached_dist
+
+    def cache_virtual_metadata_only_dist(self, dist: BaseDistribution) -> None:
+        """Associate a "virtual" metadata-only dist to this requirement.
+
+        This dist cannot be installed, but it can be used to complete the resolve
+        process.
+
+        :raises AssertionError: if a dist has already been associated.
+        :raises AssertionError: if the provided dist is "concrete", i.e. exists
+          somewhere on the filesystem.
+        """
+        assert self._cached_dist is None, self
+        assert not dist.is_concrete, dist
+        self._cached_dist = dist
+
+    def cache_concrete_dist(self, dist: BaseDistribution) -> None:
+        """Associate a "concrete" dist to this requirement.
+
+        A concrete dist exists somewhere on the filesystem and can be installed.
+
+        :raises AssertionError: if a concrete dist has already been associated.
+        :raises AssertionError: if the provided dist is not concrete.
+        """
+        if self._cached_dist is not None:
+            # If we set a dist twice for the same requirement, we must be hydrating
+            # a concrete dist for what was previously virtual. This will occur in the
+            # case of `install --dry-run` when PEP 658 metadata is available.
+            if not self.allow_concrete_dist_overwrite:
+                assert not self._cached_dist.is_concrete
+        assert dist.is_concrete
+        self._cached_dist = dist
+
+    @property
+    def is_concrete(self) -> bool:
+        return self._cached_dist is not None and self._cached_dist.is_concrete
 
     def assert_source_matches_version(self) -> None:
         assert self.source_dir, f"No source dir for {self}"
diff --git a/src/pip/_internal/req/req_set.py b/src/pip/_internal/req/req_set.py
index cff676017..031875c1e 100644
--- a/src/pip/_internal/req/req_set.py
+++ b/src/pip/_internal/req/req_set.py
@@ -86,7 +86,7 @@ class RequirementSet:
 
     def warn_legacy_versions_and_specifiers(self) -> None:
         for req in self.requirements_to_install:
-            version = req.get_dist().version
+            version = req.cached_dist.version
             if isinstance(version, LegacyVersion):
                 deprecated(
                     reason=(
@@ -101,7 +101,7 @@ class RequirementSet:
                     issue=12063,
                     gone_in="23.3",
                 )
-            for dep in req.get_dist().iter_dependencies():
+            for dep in req.cached_dist.iter_dependencies():
                 if any(isinstance(spec, LegacySpecifier) for spec in dep.specifier):
                     deprecated(
                         reason=(
diff --git a/src/pip/_internal/resolution/resolvelib/resolver.py b/src/pip/_internal/resolution/resolvelib/resolver.py
index d5b238608..32a3d3ff7 100644
--- a/src/pip/_internal/resolution/resolvelib/resolver.py
+++ b/src/pip/_internal/resolution/resolvelib/resolver.py
@@ -157,11 +157,6 @@ class Resolver(BaseResolver):
 
             req_set.add_named_requirement(ireq)
 
-        reqs = req_set.all_requirements
-        self.factory.preparer.prepare_linked_requirements_more(reqs)
-        for req in reqs:
-            req.prepared = True
-            req.needs_more_preparation = False
         return req_set
 
     def get_installation_order(
diff --git a/tests/unit/metadata/test_metadata_pkg_resources.py b/tests/unit/metadata/test_metadata_pkg_resources.py
index ab1a56107..ca0839979 100644
--- a/tests/unit/metadata/test_metadata_pkg_resources.py
+++ b/tests/unit/metadata/test_metadata_pkg_resources.py
@@ -102,6 +102,7 @@ def test_wheel_metadata_works() -> None:
             metadata=InMemoryMetadata({"METADATA": metadata.as_bytes()}, ""),
             project_name=name,
         ),
+        concrete=False,
     )
 
     assert name == dist.canonical_name == dist.raw_name
diff --git a/tests/unit/test_req.py b/tests/unit/test_req.py
index 2d1fa2694..8595637a0 100644
--- a/tests/unit/test_req.py
+++ b/tests/unit/test_req.py
@@ -152,6 +152,7 @@ class TestRequirementSet:
             os.fspath(data.packages.joinpath("LocalEnvironMarker")),
         )
         req.user_supplied = True
+        req.allow_concrete_dist_overwrite = True
         reqset.add_unnamed_requirement(req)
         finder = make_test_finder(find_links=[data.find_links])
         with self._basic_resolver(finder) as resolver:
@@ -501,6 +502,7 @@ class TestRequirementSet:
         with self._basic_resolver(finder) as resolver:
             ireq_url = data.packages.joinpath("FSPkg").as_uri()
             ireq = get_processed_req_from_line(f"-e {ireq_url}#egg=FSPkg")
+            ireq.allow_concrete_dist_overwrite = True
             reqset = resolver.resolve([ireq], True)
             assert len(reqset.all_requirements) == 1
             req = reqset.all_requirements[0]

From 4b9c97ff633994459a7bed74aa0d44a573130089 Mon Sep 17 00:00:00 2001
From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com>
Date: Wed, 9 Aug 2023 18:39:53 -0400
Subject: [PATCH 03/10] make LinkMetadataCache

- catch an exception when parsing metadata which only occurs in CI
- handle --no-cache-dir
- call os.makedirs() before writing to cache too
- catch InvalidSchema when attempting git urls with BatchDownloader
- fix other test failures
---
 src/pip/_internal/cache.py              |  55 ++++++----
 src/pip/_internal/cli/req_command.py    |   7 +-
 src/pip/_internal/operations/prepare.py | 127 +++++++++++++++++++++++-
 src/pip/_internal/req/req_install.py    |  20 +++-
 src/pip/_internal/req/req_set.py        |   2 +-
 5 files changed, 182 insertions(+), 29 deletions(-)

diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py
index 8d3a664c7..518202f45 100644
--- a/src/pip/_internal/cache.py
+++ b/src/pip/_internal/cache.py
@@ -1,6 +1,7 @@
 """Cache Management
 """
 
+import abc
 import hashlib
 import json
 import logging
@@ -29,7 +30,7 @@ def _hash_dict(d: Dict[str, str]) -> str:
     return hashlib.sha224(s.encode("ascii")).hexdigest()
 
 
-class Cache:
+class Cache(abc.ABC):
     """An abstract class - provides cache directories for data from links
 
     :param cache_dir: The root of the cache.
@@ -73,6 +74,39 @@ class Cache:
 
         return parts
 
+    @abc.abstractmethod
+    def get_path_for_link(self, link: Link) -> str:
+        """Return a directory to store cached items in for link."""
+        ...
+
+    def cache_path(self, link: Link) -> Path:
+        return Path(self.get_path_for_link(link))
+
+
+class LinkMetadataCache(Cache):
+    """Persistently store the metadata of dists found at each link."""
+
+    def get_path_for_link(self, link: Link) -> str:
+        parts = self._get_cache_path_parts(link)
+        assert self.cache_dir
+        return os.path.join(self.cache_dir, "link-metadata", *parts)
+
+
+class WheelCacheBase(Cache):
+    """Specializations to the cache concept for wheels."""
+
+    @abc.abstractmethod
+    def get(
+        self,
+        link: Link,
+        package_name: Optional[str],
+        supported_tags: List[Tag],
+    ) -> Link:
+        """Returns a link to a cached item if it exists, otherwise returns the
+        passed link.
+        """
+        ...
+
     def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
         can_not_cache = not self.cache_dir or not canonical_package_name or not link
         if can_not_cache:
@@ -85,23 +119,8 @@ class Cache:
                 candidates.append((candidate, path))
         return candidates
 
-    def get_path_for_link(self, link: Link) -> str:
-        """Return a directory to store cached items in for link."""
-        raise NotImplementedError()
-
-    def get(
-        self,
-        link: Link,
-        package_name: Optional[str],
-        supported_tags: List[Tag],
-    ) -> Link:
-        """Returns a link to a cached item if it exists, otherwise returns the
-        passed link.
-        """
-        raise NotImplementedError()
-
-
-class SimpleWheelCache(Cache):
+
+class SimpleWheelCache(WheelCacheBase):
     """A cache of wheels for future installs."""
 
     def __init__(self, cache_dir: str) -> None:
@@ -207,7 +226,7 @@ class CacheEntry:
         )
 
 
-class WheelCache(Cache):
+class WheelCache(WheelCacheBase):
     """Wraps EphemWheelCache and SimpleWheelCache into a single Cache
 
     This Cache allows for gracefully degradation, using the ephem wheel cache
diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py
index 86070f10c..bfc173396 100644
--- a/src/pip/_internal/cli/req_command.py
+++ b/src/pip/_internal/cli/req_command.py
@@ -12,7 +12,7 @@ from functools import partial
 from optparse import Values
 from typing import TYPE_CHECKING, Any, List, Optional, Tuple
 
-from pip._internal.cache import WheelCache
+from pip._internal.cache import LinkMetadataCache, WheelCache
 from pip._internal.cli import cmdoptions
 from pip._internal.cli.base_command import Command
 from pip._internal.cli.command_context import CommandContextMixIn
@@ -308,6 +308,10 @@ class RequirementCommand(IndexGroupCommand):
                     "fast-deps has no effect when used with the legacy resolver."
                )
 
+        if options.cache_dir:
+            metadata_cache = LinkMetadataCache(options.cache_dir)
+        else:
+            metadata_cache = None
         return RequirementPreparer(
             build_dir=temp_build_dir_path,
             src_dir=options.src_dir,
@@ -323,6 +327,7 @@ class RequirementCommand(IndexGroupCommand):
             lazy_wheel=lazy_wheel,
             verbosity=verbosity,
             legacy_resolver=legacy_resolver,
+            metadata_cache=metadata_cache,
         )
 
     @classmethod
diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py
index 2cfc582dd..3a671f4d4 100644
--- a/src/pip/_internal/operations/prepare.py
+++ b/src/pip/_internal/operations/prepare.py
@@ -4,14 +4,19 @@
 # The following comment should be removed at some point in the future.
 # mypy: strict-optional=False
 
+import email.errors
+import json
 import mimetypes
 import os
 import shutil
+from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional
 
 from pip._vendor.packaging.utils import canonicalize_name
+from pip._vendor.requests.exceptions import InvalidSchema
 
+from pip._internal.cache import LinkMetadataCache
 from pip._internal.distributions import make_distribution_for_install_requirement
 from pip._internal.exceptions import (
     DirectoryUrlHashUnsupported,
@@ -213,6 +218,43 @@ def _check_download_dir(
     return download_path
 
 
+@dataclass(frozen=True)
+class CacheableDist:
+    metadata: str
+    filename: Path
+    canonical_name: str
+
+    @classmethod
+    def from_dist(cls, link: Link, dist: BaseDistribution) -> "CacheableDist":
+        return cls(
+            metadata=str(dist.metadata),
+            filename=Path(link.filename),
+            canonical_name=dist.canonical_name,
+        )
+
+    def to_dist(self) -> BaseDistribution:
+        return get_metadata_distribution(
+            metadata_contents=self.metadata.encode("utf-8"),
+            filename=str(self.filename),
+            canonical_name=self.canonical_name,
+        )
+
+    def to_json(self) -> Dict[str, Any]:
+        return dict(
+            metadata=self.metadata,
+            filename=str(self.filename),
+            canonical_name=self.canonical_name,
+        )
+
+    @classmethod
+    def from_json(cls, args: Dict[str, Any]) -> "CacheableDist":
+        return cls(
+            metadata=args["metadata"],
+            filename=Path(args["filename"]),
+            canonical_name=args["canonical_name"],
+        )
+
+
 class RequirementPreparer:
     """Prepares a Requirement"""
 
@@ -232,6 +274,7 @@ class RequirementPreparer:
         lazy_wheel: bool,
         verbosity: int,
         legacy_resolver: bool,
+        metadata_cache: Optional[LinkMetadataCache] = None,
     ) -> None:
         super().__init__()
 
@@ -274,6 +317,8 @@ class RequirementPreparer:
         # Previous "header" printed for a link-based InstallRequirement
         self._previous_requirement_header = ("", "")
 
+        self._metadata_cache = metadata_cache
+
     def _log_preparing_link(self, req: InstallRequirement) -> None:
         """Provide context for the requirement being prepared."""
         if req.link.is_file and not req.is_wheel_from_cache:
@@ -370,10 +415,68 @@ class RequirementPreparer:
                 "Metadata-only fetching is not used as hash checking is required",
             )
             return None
-        # Try PEP 658 metadata first, then fall back to lazy wheel if unavailable.
-        return self._fetch_metadata_using_link_data_attr(
+
+        cached_metadata = self._fetch_cached_metadata(req.link)
+        if cached_metadata is not None:
+            return cached_metadata
+
+        computed_metadata = self._fetch_metadata_using_link_data_attr(
             req
         ) or self._fetch_metadata_using_lazy_wheel(req.link)
+        # Populate the metadata cache.
+        if computed_metadata is not None:
+            self._cache_metadata(req.link, computed_metadata)
+        return computed_metadata
+
+    def _fetch_cached_metadata(
+        self,
+        link: Link,
+    ) -> Optional[BaseDistribution]:
+        if self._metadata_cache is None:
+            return None
+        try:
+            cached_path = self._metadata_cache.cache_path(link)
+            os.makedirs(str(cached_path.parent), exist_ok=True)
+            with cached_path.open("rb") as f:
+                logger.debug(
+                    "found cached metadata for link %s at %s", link.url, f.name
+                )
+                args = json.load(f)
+            cached_dist = CacheableDist.from_json(args)
+            return cached_dist.to_dist()
+        except (OSError, json.JSONDecodeError, KeyError) as e:
+            logger.debug(
+                "no cached metadata for link %s at %s %s(%s)",
+                link.url,
+                cached_path,
+                e.__class__.__name__,
+                str(e),
+            )
+            return None
+
+    def _cache_metadata(
+        self,
+        link: Link,
+        metadata_dist: BaseDistribution,
+    ) -> None:
+        if self._metadata_cache is None:
+            return
+        try:
+            cached_path = self._metadata_cache.cache_path(link)
+            os.makedirs(str(cached_path.parent), exist_ok=True)
+            with cached_path.open("w") as f:
+                cacheable_dist = CacheableDist.from_dist(link, metadata_dist)
+                args = cacheable_dist.to_json()
+                logger.debug("caching metadata for link %s at %s", link.url, f.name)
+                json.dump(args, f)
+        except (OSError, email.errors.HeaderParseError) as e:
+            logger.debug(
+                "could not cache metadata for dist %s from %s: %s(%s)",
+                metadata_dist,
+                link,
+                e.__class__.__name__,
+                str(e),
+            )
 
     def _fetch_metadata_using_link_data_attr(
         self,
@@ -461,7 +564,18 @@ class RequirementPreparer:
         links_to_fully_download: Dict[Link, InstallRequirement] = {}
         for req in partially_downloaded_reqs:
             assert req.link
-            links_to_fully_download[req.link] = req
+            # (1) File URLs don't need to be downloaded, so skip them.
+            if req.link.scheme == "file":
+                continue
+            # (2) If this is e.g. a git url, we don't know how to handle that in the
+            # BatchDownloader, so leave it for self._prepare_linked_requirement() at the
+            # end of this method, which knows how to handle any URL.
+            try:
+                # This will raise InvalidSchema if our Session can't download it.
+                self._session.get_adapter(req.link.url)
+                links_to_fully_download[req.link] = req
+            except InvalidSchema:
+                pass
 
         batch_download = self._batch_download(
             links_to_fully_download.keys(),
@@ -527,7 +641,10 @@ class RequirementPreparer:
             return metadata_dist
 
         # None of the optimizations worked, fully prepare the requirement
-        return self._prepare_linked_requirement(req, parallel_builds)
+        result = self._prepare_linked_requirement(req, parallel_builds)
+        # Cache the metadata for later.
+        self._cache_metadata(req.link, result)
+        return result
 
     def _ensure_download_info(self, reqs: Iterable[InstallRequirement]) -> None:
         """
diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py
index 0274e8832..8291a0c4f 100644
--- a/src/pip/_internal/req/req_install.py
+++ b/src/pip/_internal/req/req_install.py
@@ -84,7 +84,7 @@ class InstallRequirement:
         permit_editable_wheels: bool = False,
     ) -> None:
         assert req is None or isinstance(req, Requirement), req
-        self.req = req
+        self._req = req
         self.comes_from = comes_from
         self.constraint = constraint
         self.editable = editable
@@ -189,6 +189,18 @@ class InstallRequirement:
         # This requirement needs to be unpacked before it can be installed.
self._archive_source: Optional[Path] = None + @property + def req(self) -> Optional[Requirement]: + """Calculate a requirement from the cached dist if necessary.""" + if self._req is not None: + return self._req + if self._cached_dist is not None: + name = self._cached_dist.canonical_name + version = str(self._cached_dist.version) + self._req = Requirement(f"{name}=={version}") + return self._req + return None + def __str__(self) -> str: if self.req: s = str(self.req) @@ -377,7 +389,7 @@ class InstallRequirement: def _set_requirement(self) -> None: """Set requirement after generating metadata.""" - assert self.req is None + assert self._req is None assert self.metadata is not None assert self.source_dir is not None @@ -387,7 +399,7 @@ class InstallRequirement: else: op = "===" - self.req = Requirement( + self._req = Requirement( "".join( [ self.metadata["Name"], @@ -413,7 +425,7 @@ class InstallRequirement: metadata_name, self.name, ) - self.req = Requirement(metadata_name) + self._req = Requirement(metadata_name) def check_if_exists(self, use_user_site: bool) -> None: """Find an installed distribution that satisfies or conflicts diff --git a/src/pip/_internal/req/req_set.py b/src/pip/_internal/req/req_set.py index 031875c1e..68141b6f9 100644 --- a/src/pip/_internal/req/req_set.py +++ b/src/pip/_internal/req/req_set.py @@ -46,7 +46,7 @@ class RequirementSet: self.unnamed_requirements.append(install_req) def add_named_requirement(self, install_req: InstallRequirement) -> None: - assert install_req.name + assert install_req.name, install_req project_name = canonicalize_name(install_req.name) self.requirements[project_name] = install_req From 5463825bb99d1a5d1ec65ef7b5dd1bc819dd852b Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 3 Sep 2023 01:09:45 -0400 Subject: [PATCH 04/10] expose this new caching with --use-feature=metadata-cache --- news/12256.feature.rst | 1 + src/pip/_internal/cli/cmdoptions.py | 1 + src/pip/_internal/cli/req_command.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 news/12256.feature.rst diff --git a/news/12256.feature.rst b/news/12256.feature.rst new file mode 100644 index 000000000..52045cbdc --- /dev/null +++ b/news/12256.feature.rst @@ -0,0 +1 @@ +Introduce ``--use-feature=metadata-cache`` to cache metadata lookups in ``~/.cache/pip/link-metadata``. diff --git a/src/pip/_internal/cli/cmdoptions.py b/src/pip/_internal/cli/cmdoptions.py index 02ba60827..284b87bc5 100644 --- a/src/pip/_internal/cli/cmdoptions.py +++ b/src/pip/_internal/cli/cmdoptions.py @@ -1009,6 +1009,7 @@ use_new_feature: Callable[..., Option] = partial( choices=[ "fast-deps", "truststore", + "metadata-cache", ] + ALWAYS_ENABLED_FEATURES, help="Enable new functionality, that may be backward incompatible.", diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index bfc173396..93bd70397 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -308,7 +308,7 @@ class RequirementCommand(IndexGroupCommand): "fast-deps has no effect when used with the legacy resolver." 
) - if options.cache_dir: + if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled): metadata_cache = LinkMetadataCache(options.cache_dir) else: metadata_cache = None From 0f20f629f3f50cb60aa5fbb3f35502f7ef4ffc31 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 3 Sep 2023 06:06:08 -0400 Subject: [PATCH 05/10] fix test failures --- src/pip/_internal/operations/prepare.py | 11 +++++------ tests/functional/test_check.py | 8 ++++---- tests/functional/test_install.py | 8 ++++---- tests/functional/test_install_check.py | 2 +- tests/functional/test_install_reqs.py | 2 +- tests/functional/test_wheel.py | 2 +- 6 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 3a671f4d4..dbb51f8b4 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -564,18 +564,17 @@ class RequirementPreparer: links_to_fully_download: Dict[Link, InstallRequirement] = {} for req in partially_downloaded_reqs: assert req.link - # (1) File URLs don't need to be downloaded, so skip them. - if req.link.scheme == "file": - continue - # (2) If this is e.g. a git url, we don't know how to handle that in the + # If this is e.g. a git url, we don't know how to handle that in the # BatchDownloader, so leave it for self._prepare_linked_requirement() at the # end of this method, which knows how to handle any URL. + can_simply_download = True try: # This will raise InvalidSchema if our Session can't download it. self._session.get_adapter(req.link.url) - links_to_fully_download[req.link] = req except InvalidSchema: - pass + can_simply_download = False + if can_simply_download: + links_to_fully_download[req.link] = req batch_download = self._batch_download( links_to_fully_download.keys(), diff --git a/tests/functional/test_check.py b/tests/functional/test_check.py index e2b1c60ef..a5715bb8c 100644 --- a/tests/functional/test_check.py +++ b/tests/functional/test_check.py @@ -119,7 +119,7 @@ def test_check_complicated_name_missing(script: PipTestEnvironment) -> None: # Without dependency result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert "Successfully installed package-A-1.0" in result.stdout, str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip("check", expect_error=True) expected_lines = ("package-a 1.0 requires dependency-b, which is not installed.",) @@ -142,7 +142,7 @@ def test_check_complicated_name_broken(script: PipTestEnvironment) -> None: # With broken dependency result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert "Successfully installed package-A-1.0" in result.stdout, str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip( "install", @@ -175,7 +175,7 @@ def test_check_complicated_name_clean(script: PipTestEnvironment) -> None: ) result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert "Successfully installed package-A-1.0" in result.stdout, str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip( "install", @@ -203,7 +203,7 @@ def test_check_considers_conditional_reqs(script: PipTestEnvironment) -> None: ) result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert "Successfully installed package-A-1.0" in result.stdout, str(result) + 
assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip("check", expect_error=True) expected_lines = ("package-a 1.0 requires dependency-b, which is not installed.",) diff --git a/tests/functional/test_install.py b/tests/functional/test_install.py index 161881419..eb02e7725 100644 --- a/tests/functional/test_install.py +++ b/tests/functional/test_install.py @@ -2071,7 +2071,7 @@ def test_install_conflict_results_in_warning( # Install pkgA without its dependency result1 = script.pip("install", "--no-index", pkgA_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result1.stdout, str(result1) + assert "Successfully installed pkga-1.0" in result1.stdout, str(result1) # Then install an incorrect version of the dependency result2 = script.pip( @@ -2081,7 +2081,7 @@ def test_install_conflict_results_in_warning( allow_stderr_error=True, ) assert "pkga 1.0 requires pkgb==1.0" in result2.stderr, str(result2) - assert "Successfully installed pkgB-2.0" in result2.stdout, str(result2) + assert "Successfully installed pkgb-2.0" in result2.stdout, str(result2) def test_install_conflict_warning_can_be_suppressed( @@ -2101,11 +2101,11 @@ def test_install_conflict_warning_can_be_suppressed( # Install pkgA without its dependency result1 = script.pip("install", "--no-index", pkgA_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result1.stdout, str(result1) + assert "Successfully installed pkga-1.0" in result1.stdout, str(result1) # Then install an incorrect version of the dependency; suppressing warning result2 = script.pip("install", "--no-index", pkgB_path, "--no-warn-conflicts") - assert "Successfully installed pkgB-2.0" in result2.stdout, str(result2) + assert "Successfully installed pkgb-2.0" in result2.stdout, str(result2) def test_target_install_ignores_distutils_config_install_prefix( diff --git a/tests/functional/test_install_check.py b/tests/functional/test_install_check.py index 8a8a7c93a..d16165e03 100644 --- a/tests/functional/test_install_check.py +++ b/tests/functional/test_install_check.py @@ -28,7 +28,7 @@ def test_check_install_canonicalization(script: PipTestEnvironment) -> None: # Let's install pkgA without its dependency result = script.pip("install", "--no-index", pkga_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result.stdout, str(result) + assert "Successfully installed pkga-1.0" in result.stdout, str(result) # Install the first missing dependency. Only an error for the # second dependency should remain. 
diff --git a/tests/functional/test_install_reqs.py b/tests/functional/test_install_reqs.py index 96cff0dc5..00fe53edd 100644 --- a/tests/functional/test_install_reqs.py +++ b/tests/functional/test_install_reqs.py @@ -626,7 +626,7 @@ def test_install_distribution_full_union( result = script.pip_install_local( to_install, f"{to_install}[bar]", f"{to_install}[baz]" ) - assert "Building wheel for LocalExtras" in result.stdout + assert "Building wheel for localextras" in result.stdout result.did_create(script.site_packages / "simple") result.did_create(script.site_packages / "singlemodule.py") diff --git a/tests/functional/test_wheel.py b/tests/functional/test_wheel.py index c0e279492..2c5c6e221 100644 --- a/tests/functional/test_wheel.py +++ b/tests/functional/test_wheel.py @@ -286,7 +286,7 @@ def test_wheel_package_with_latin1_setup( pkg_to_wheel = data.packages.joinpath("SetupPyLatin1") result = script.pip("wheel", pkg_to_wheel) - assert "Successfully built SetupPyUTF8" in result.stdout + assert "Successfully built setuppyutf8" in result.stdout def test_pip_wheel_with_pep518_build_reqs( From 7591507dfea3e44cc6bbe30a4c3d1e754069e7f1 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 3 Sep 2023 09:12:36 -0400 Subject: [PATCH 06/10] reuse should_cache(req) logic --- src/pip/_internal/cache.py | 57 +++++++++++++++++++++++ src/pip/_internal/network/download.py | 2 +- src/pip/_internal/operations/prepare.py | 44 +++++++----------- src/pip/_internal/wheel_builder.py | 62 +------------------------ tests/unit/test_cache.py | 21 ++++++++- tests/unit/test_wheel_builder.py | 26 ++--------- 6 files changed, 101 insertions(+), 111 deletions(-) diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py index 518202f45..79a5e1383 100644 --- a/src/pip/_internal/cache.py +++ b/src/pip/_internal/cache.py @@ -6,6 +6,7 @@ import hashlib import json import logging import os +import re from pathlib import Path from typing import Any, Dict, List, Optional @@ -16,14 +17,61 @@ from pip._internal.exceptions import InvalidWheelFilename from pip._internal.models.direct_url import DirectUrl from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel +from pip._internal.req.req_install import InstallRequirement from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds from pip._internal.utils.urls import path_to_url +from pip._internal.vcs import vcs logger = logging.getLogger(__name__) +_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE) + ORIGIN_JSON_NAME = "origin.json" +def _contains_egg_info(s: str) -> bool: + """Determine whether the string looks like an egg_info. + + :param s: The string to parse. E.g. foo-2.1 + """ + return bool(_egg_info_re.search(s)) + + +def should_cache( + req: InstallRequirement, +) -> bool: + """ + Return whether a built InstallRequirement can be stored in the persistent + wheel cache, assuming the wheel cache is available, and _should_build() + has determined a wheel needs to be built. + """ + if not req.link: + return False + + if req.editable or (not req.source_dir and not req.link.is_wheel): + # never cache editable requirements + return False + + if req.link and req.link.is_vcs: + # VCS checkout. Do not cache + # unless it points to an immutable commit hash. 
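+        # "Immutable" means a pinned revision such as an exact git commit SHA;
+        # branch and tag refs can move, so a wheel built from them could go
+        # stale in the cache.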
+ assert not req.editable + assert req.source_dir + vcs_backend = vcs.get_backend_for_scheme(req.link.scheme) + assert vcs_backend + if vcs_backend.is_immutable_rev_checkout(req.link.url, req.source_dir): + return True + return False + + assert req.link + base, ext = req.link.splitext() + if _contains_egg_info(base): + return True + + # Otherwise, do not cache. + return False + + def _hash_dict(d: Dict[str, str]) -> str: """Return a stable sha224 of a dictionary.""" s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True) @@ -244,6 +292,15 @@ class WheelCache(WheelCacheBase): def get_ephem_path_for_link(self, link: Link) -> str: return self._ephem_cache.get_path_for_link(link) + def resolve_cache_dir(self, req: InstallRequirement) -> str: + """Return the persistent or temporary cache directory where the built or + downloaded wheel should be stored.""" + cache_available = bool(self.cache_dir) + assert req.link, req + if cache_available and should_cache(req): + return self.get_path_for_link(req.link) + return self.get_ephem_path_for_link(req.link) + def get( self, link: Link, diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py index 79b82a570..3cfcd1415 100644 --- a/src/pip/_internal/network/download.py +++ b/src/pip/_internal/network/download.py @@ -113,7 +113,7 @@ def _get_http_response_filename(resp: Response, link: Link) -> str: def _http_get_download(session: PipSession, link: Link) -> Response: - target_url = link.url.split("#", 1)[0] + target_url = link.url_without_fragment resp = session.get(target_url, headers=HEADERS, stream=True) raise_for_status(resp) return resp diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index dbb51f8b4..74866644c 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -14,9 +14,8 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Optional from pip._vendor.packaging.utils import canonicalize_name -from pip._vendor.requests.exceptions import InvalidSchema -from pip._internal.cache import LinkMetadataCache +from pip._internal.cache import LinkMetadataCache, should_cache from pip._internal.distributions import make_distribution_for_install_requirement from pip._internal.exceptions import ( DirectoryUrlHashUnsupported, @@ -416,7 +415,7 @@ class RequirementPreparer: ) return None - cached_metadata = self._fetch_cached_metadata(req.link) + cached_metadata = self._fetch_cached_metadata(req) if cached_metadata is not None: return cached_metadata @@ -425,21 +424,22 @@ class RequirementPreparer: ) or self._fetch_metadata_using_lazy_wheel(req.link) # Populate the metadata cache. 
if computed_metadata is not None: - self._cache_metadata(req.link, computed_metadata) + self._cache_metadata(req, computed_metadata) return computed_metadata def _fetch_cached_metadata( - self, - link: Link, + self, req: InstallRequirement ) -> Optional[BaseDistribution]: if self._metadata_cache is None: return None + if not should_cache(req): + return None try: - cached_path = self._metadata_cache.cache_path(link) + cached_path = self._metadata_cache.cache_path(req.link) os.makedirs(str(cached_path.parent), exist_ok=True) with cached_path.open("rb") as f: logger.debug( - "found cached metadata for link %s at %s", link.url, f.name + "found cached metadata for link %s at %s", req.link, f.name ) args = json.load(f) cached_dist = CacheableDist.from_json(args) @@ -447,7 +447,7 @@ class RequirementPreparer: except (OSError, json.JSONDecodeError, KeyError) as e: logger.debug( "no cached metadata for link %s at %s %s(%s)", - link.url, + req.link, cached_path, e.__class__.__name__, str(e), @@ -456,24 +456,26 @@ class RequirementPreparer: def _cache_metadata( self, - link: Link, + req: InstallRequirement, metadata_dist: BaseDistribution, ) -> None: if self._metadata_cache is None: return + if not should_cache(req): + return try: - cached_path = self._metadata_cache.cache_path(link) + cached_path = self._metadata_cache.cache_path(req.link) os.makedirs(str(cached_path.parent), exist_ok=True) with cached_path.open("w") as f: - cacheable_dist = CacheableDist.from_dist(link, metadata_dist) + cacheable_dist = CacheableDist.from_dist(req.link, metadata_dist) args = cacheable_dist.to_json() - logger.debug("caching metadata for link %s at %s", link.url, f.name) + logger.debug("caching metadata for link %s at %s", req.link, f.name) json.dump(args, f) except (OSError, email.errors.HeaderParseError) as e: logger.debug( "could not cache metadata for dist %s from %s: %s(%s)", metadata_dist, - link, + req.link, e.__class__.__name__, str(e), ) @@ -564,17 +566,7 @@ class RequirementPreparer: links_to_fully_download: Dict[Link, InstallRequirement] = {} for req in partially_downloaded_reqs: assert req.link - # If this is e.g. a git url, we don't know how to handle that in the - # BatchDownloader, so leave it for self._prepare_linked_requirement() at the - # end of this method, which knows how to handle any URL. - can_simply_download = True - try: - # This will raise InvalidSchema if our Session can't download it. - self._session.get_adapter(req.link.url) - except InvalidSchema: - can_simply_download = False - if can_simply_download: - links_to_fully_download[req.link] = req + links_to_fully_download[req.link] = req batch_download = self._batch_download( links_to_fully_download.keys(), @@ -642,7 +634,7 @@ class RequirementPreparer: # None of the optimizations worked, fully prepare the requirement result = self._prepare_linked_requirement(req, parallel_builds) # Cache the metadata for later. 
- self._cache_metadata(req.link, result) + self._cache_metadata(req, result) return result def _ensure_download_info(self, reqs: Iterable[InstallRequirement]) -> None: diff --git a/src/pip/_internal/wheel_builder.py b/src/pip/_internal/wheel_builder.py index 60d75dd18..9e5342463 100644 --- a/src/pip/_internal/wheel_builder.py +++ b/src/pip/_internal/wheel_builder.py @@ -3,7 +3,6 @@ import logging import os.path -import re import shutil from typing import Iterable, List, Optional, Tuple @@ -25,23 +24,12 @@ from pip._internal.utils.setuptools_build import make_setuptools_clean_args from pip._internal.utils.subprocess import call_subprocess from pip._internal.utils.temp_dir import TempDirectory from pip._internal.utils.urls import path_to_url -from pip._internal.vcs import vcs logger = logging.getLogger(__name__) -_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE) - BuildResult = Tuple[List[InstallRequirement], List[InstallRequirement]] -def _contains_egg_info(s: str) -> bool: - """Determine whether the string looks like an egg_info. - - :param s: The string to parse. E.g. foo-2.1 - """ - return bool(_egg_info_re.search(s)) - - def _should_build( req: InstallRequirement, need_wheel: bool, @@ -87,54 +75,6 @@ def should_build_for_install_command( return _should_build(req, need_wheel=False) -def _should_cache( - req: InstallRequirement, -) -> Optional[bool]: - """ - Return whether a built InstallRequirement can be stored in the persistent - wheel cache, assuming the wheel cache is available, and _should_build() - has determined a wheel needs to be built. - """ - if req.editable or not req.source_dir: - # never cache editable requirements - return False - - if req.link and req.link.is_vcs: - # VCS checkout. Do not cache - # unless it points to an immutable commit hash. - assert not req.editable - assert req.source_dir - vcs_backend = vcs.get_backend_for_scheme(req.link.scheme) - assert vcs_backend - if vcs_backend.is_immutable_rev_checkout(req.link.url, req.source_dir): - return True - return False - - assert req.link - base, ext = req.link.splitext() - if _contains_egg_info(base): - return True - - # Otherwise, do not cache. - return False - - -def _get_cache_dir( - req: InstallRequirement, - wheel_cache: WheelCache, -) -> str: - """Return the persistent or temporary cache directory where the built - wheel need to be stored. 
- """ - cache_available = bool(wheel_cache.cache_dir) - assert req.link - if cache_available and _should_cache(req): - cache_dir = wheel_cache.get_path_for_link(req.link) - else: - cache_dir = wheel_cache.get_ephem_path_for_link(req.link) - return cache_dir - - def _verify_one(req: InstallRequirement, wheel_path: str) -> None: canonical_name = canonicalize_name(req.name or "") w = Wheel(os.path.basename(wheel_path)) @@ -316,7 +256,7 @@ def build( build_successes, build_failures = [], [] for req in requirements: assert req.name - cache_dir = _get_cache_dir(req, wheel_cache) + cache_dir = wheel_cache.resolve_cache_dir(req) wheel_file = _build_one( req, cache_dir, diff --git a/tests/unit/test_cache.py b/tests/unit/test_cache.py index d0fee69c3..ff989a07b 100644 --- a/tests/unit/test_cache.py +++ b/tests/unit/test_cache.py @@ -1,13 +1,32 @@ import os from pathlib import Path +import pytest from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version -from pip._internal.cache import WheelCache, _hash_dict +from pip._internal.cache import WheelCache, _contains_egg_info, _hash_dict from pip._internal.models.link import Link from pip._internal.utils.misc import ensure_dir +@pytest.mark.parametrize( + "s, expected", + [ + # Trivial. + ("pip-18.0", True), + # Ambiguous. + ("foo-2-2", True), + ("im-valid", True), + # Invalid. + ("invalid", False), + ("im_invalid", False), + ], +) +def test_contains_egg_info(s: str, expected: bool) -> None: + result = _contains_egg_info(s) + assert result == expected + + def test_falsey_path_none() -> None: wc = WheelCache("") assert wc.cache_dir is None diff --git a/tests/unit/test_wheel_builder.py b/tests/unit/test_wheel_builder.py index 9044f9453..cd4b915cf 100644 --- a/tests/unit/test_wheel_builder.py +++ b/tests/unit/test_wheel_builder.py @@ -5,7 +5,7 @@ from typing import Optional, cast import pytest -from pip._internal import wheel_builder +from pip._internal import cache, wheel_builder from pip._internal.models.link import Link from pip._internal.operations.build.wheel_legacy import format_command_result from pip._internal.req.req_install import InstallRequirement @@ -13,24 +13,6 @@ from pip._internal.vcs.git import Git from tests.lib import _create_test_package -@pytest.mark.parametrize( - "s, expected", - [ - # Trivial. - ("pip-18.0", True), - # Ambiguous. - ("foo-2-2", True), - ("im-valid", True), - # Invalid. 
- ("invalid", False), - ("im_invalid", False), - ], -) -def test_contains_egg_info(s: str, expected: bool) -> None: - result = wheel_builder._contains_egg_info(s) - assert result == expected - - class ReqMock: def __init__( self, @@ -128,7 +110,7 @@ def test_should_build_for_wheel_command(req: ReqMock, expected: bool) -> None: ], ) def test_should_cache(req: ReqMock, expected: bool) -> None: - assert wheel_builder._should_cache(cast(InstallRequirement, req)) is expected + assert cache.should_cache(cast(InstallRequirement, req)) is expected def test_should_cache_git_sha(tmpdir: Path) -> None: @@ -138,12 +120,12 @@ def test_should_cache_git_sha(tmpdir: Path) -> None: # a link referencing a sha should be cached url = "git+https://g.c/o/r@" + commit + "#egg=mypkg" req = ReqMock(link=Link(url), source_dir=repo_path) - assert wheel_builder._should_cache(cast(InstallRequirement, req)) + assert cache.should_cache(cast(InstallRequirement, req)) # a link not referencing a sha should not be cached url = "git+https://g.c/o/r@master#egg=mypkg" req = ReqMock(link=Link(url), source_dir=repo_path) - assert not wheel_builder._should_cache(cast(InstallRequirement, req)) + assert not cache.should_cache(cast(InstallRequirement, req)) def test_format_command_result__INFO(caplog: pytest.LogCaptureFixture) -> None: From 5a42ae469cf26323e70e350b4e552dc4d6d24c04 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 3 Sep 2023 19:15:37 -0400 Subject: [PATCH 07/10] gzip compress link metadata for a slight reduction in disk space --- src/pip/_internal/operations/prepare.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 74866644c..d9ceb98e0 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -5,6 +5,7 @@ # mypy: strict-optional=False import email.errors +import gzip import json import mimetypes import os @@ -437,7 +438,7 @@ class RequirementPreparer: try: cached_path = self._metadata_cache.cache_path(req.link) os.makedirs(str(cached_path.parent), exist_ok=True) - with cached_path.open("rb") as f: + with gzip.open(cached_path, mode="rt", encoding="utf-8") as f: logger.debug( "found cached metadata for link %s at %s", req.link, f.name ) @@ -466,7 +467,7 @@ class RequirementPreparer: try: cached_path = self._metadata_cache.cache_path(req.link) os.makedirs(str(cached_path.parent), exist_ok=True) - with cached_path.open("w") as f: + with gzip.open(cached_path, mode="wt", encoding="utf-8") as f: cacheable_dist = CacheableDist.from_dist(req.link, metadata_dist) args = cacheable_dist.to_json() logger.debug("caching metadata for link %s at %s", req.link, f.name) From 60cc5024e2fa6ec80f5f921cce53230f36bb8bd6 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 9 Aug 2023 20:09:20 -0400 Subject: [PATCH 08/10] squashed commit - add FetchResolveCache - pipe in headers arg - provide full context in Link.comes_from - pull in etag and date and cache the outputs - handle --no-cache-dir --- src/pip/_internal/cache.py | 56 ++++--- src/pip/_internal/cli/req_command.py | 7 +- src/pip/_internal/index/collector.py | 52 ++++-- src/pip/_internal/index/package_finder.py | 186 +++++++++++++++++++++- src/pip/_internal/models/link.py | 6 +- 5 files changed, 259 insertions(+), 48 deletions(-) diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py index 
79a5e1383..8a0b9797b 100644 --- a/src/pip/_internal/cache.py +++ b/src/pip/_internal/cache.py @@ -89,7 +89,9 @@ class Cache(abc.ABC): assert not cache_dir or os.path.isabs(cache_dir) self.cache_dir = cache_dir or None - def _get_cache_path_parts(self, link: Link) -> List[str]: + def _get_cache_path_parts( + self, link: Link, *, interpreter_dependent: bool + ) -> List[str]: """Get parts of part that must be os.path.joined with cache_dir""" # We want to generate an url to use as our cache key, we don't want to @@ -101,13 +103,14 @@ class Cache(abc.ABC): if link.subdirectory_fragment: key_parts["subdirectory"] = link.subdirectory_fragment - # Include interpreter name, major and minor version in cache key - # to cope with ill-behaved sdists that build a different wheel - # depending on the python version their setup.py is being run on, - # and don't encode the difference in compatibility tags. - # https://github.com/pypa/pip/issues/7296 - key_parts["interpreter_name"] = interpreter_name() - key_parts["interpreter_version"] = interpreter_version() + if interpreter_dependent: + # Include interpreter name, major and minor version in cache key + # to cope with ill-behaved sdists that build a different wheel + # depending on the python version their setup.py is being run on, + # and don't encode the difference in compatibility tags. + # https://github.com/pypa/pip/issues/7296 + key_parts["interpreter_name"] = interpreter_name() + key_parts["interpreter_version"] = interpreter_version() # Encode our key url with sha224, we'll use this because it has similar # security properties to sha256, but with a shorter total output (and @@ -135,26 +138,23 @@ class LinkMetadataCache(Cache): """Persistently store the metadata of dists found at each link.""" def get_path_for_link(self, link: Link) -> str: - parts = self._get_cache_path_parts(link) + parts = self._get_cache_path_parts(link, interpreter_dependent=True) assert self.cache_dir return os.path.join(self.cache_dir, "link-metadata", *parts) +class FetchResolveCache(Cache): + def get_path_for_link(self, link: Link) -> str: + # We are reading index links to extract other links from, not executing any + # python code, so these caches are interpreter-independent. + parts = self._get_cache_path_parts(link, interpreter_dependent=False) + assert self.cache_dir + return os.path.join(self.cache_dir, "fetch-resolve", *parts) + + class WheelCacheBase(Cache): """Specializations to the cache concept for wheels.""" - @abc.abstractmethod - def get( - self, - link: Link, - package_name: Optional[str], - supported_tags: List[Tag], - ) -> Link: - """Returns a link to a cached item if it exists, otherwise returns the - passed link. - """ - ... - def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]: can_not_cache = not self.cache_dir or not canonical_package_name or not link if can_not_cache: @@ -167,6 +167,18 @@ class WheelCacheBase(Cache): candidates.append((candidate, path)) return candidates + @abc.abstractmethod + def get( + self, + link: Link, + package_name: Optional[str], + supported_tags: List[Tag], + ) -> Link: + """Returns a link to a cached item if it exists, otherwise returns the + passed link. + """ + ... + class SimpleWheelCache(WheelCacheBase): """A cache of wheels for future installs.""" @@ -189,7 +201,7 @@ class SimpleWheelCache(WheelCacheBase): :param link: The link of the sdist for which this will cache wheels. 
""" - parts = self._get_cache_path_parts(link) + parts = self._get_cache_path_parts(link, interpreter_dependent=True) assert self.cache_dir # Store wheels within the root cache_dir return os.path.join(self.cache_dir, "wheels", *parts) diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index 93bd70397..dc63e4aae 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -12,7 +12,7 @@ from functools import partial from optparse import Values from typing import TYPE_CHECKING, Any, List, Optional, Tuple -from pip._internal.cache import LinkMetadataCache, WheelCache +from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache from pip._internal.cli import cmdoptions from pip._internal.cli.base_command import Command from pip._internal.cli.command_context import CommandContextMixIn @@ -506,8 +506,13 @@ class RequirementCommand(IndexGroupCommand): ignore_requires_python=ignore_requires_python, ) + if options.cache_dir: + fetch_resolve_cache = FetchResolveCache(options.cache_dir) + else: + fetch_resolve_cache = None return PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, target_python=target_python, + fetch_resolve_cache=fetch_resolve_cache, ) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index b3e293ea3..1c19f4eed 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -96,7 +96,9 @@ class _NotHTTP(Exception): pass -def _ensure_api_response(url: str, session: PipSession) -> None: +def _ensure_api_response( + url: str, session: PipSession, headers: Optional[Dict[str, str]] = None +) -> None: """ Send a HEAD request to the URL, and ensure the response contains a simple API Response. @@ -108,13 +110,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None: if scheme not in {"http", "https"}: raise _NotHTTP() - resp = session.head(url, allow_redirects=True) + resp = session.head(url, allow_redirects=True, headers=headers) raise_for_status(resp) _ensure_api_header(resp) -def _get_simple_response(url: str, session: PipSession) -> Response: +def _get_simple_response( + url: str, session: PipSession, headers: Optional[Dict[str, str]] = None +) -> Response: """Access an Simple API response with GET, and return the response. This consists of three parts: @@ -128,10 +132,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response: and raise `_NotAPIContent` otherwise. """ if is_archive_file(Link(url).filename): - _ensure_api_response(url, session=session) + _ensure_api_response(url, session=session, headers=headers) logger.debug("Getting page %s", redact_auth_from_url(url)) + logger.debug("headers: %s", str(headers)) + if headers is None: + headers = {} resp = session.get( url, headers={ @@ -156,6 +163,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response: # once per 10 minutes. # For more information, please see pypa/pip#5670. 
"Cache-Control": "max-age=0", + **headers, }, ) raise_for_status(resp) @@ -235,7 +243,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]: if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): data = json.loads(page.content) for file in data.get("files", []): - link = Link.from_json(file, page.url) + link = Link.from_json(file, page.url, page_content=page) if link is None: continue yield link @@ -248,7 +256,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]: url = page.url base_url = parser.base_url or url for anchor in parser.anchors: - link = Link.from_element(anchor, page_url=url, base_url=base_url) + link = Link.from_element( + anchor, page_url=url, base_url=base_url, page_content=page + ) if link is None: continue yield link @@ -264,6 +274,8 @@ class IndexContent: encoding: Optional[str], url: str, cache_link_parsing: bool = True, + etag: Optional[str] = None, + date: Optional[str] = None, ) -> None: """ :param encoding: the encoding to decode the given content. @@ -271,12 +283,16 @@ class IndexContent: :param cache_link_parsing: whether links parsed from this page's url should be cached. PyPI index urls should have this set to False, for example. + :param etag: The ``ETag`` header from an HTTP request against ``url``. + :param date: The ``Date`` header from an HTTP request against ``url``. """ self.content = content self.content_type = content_type self.encoding = encoding self.url = url self.cache_link_parsing = cache_link_parsing + self.etag = etag + self.date = date def __str__(self) -> str: return redact_auth_from_url(self.url) @@ -321,7 +337,8 @@ def _handle_get_simple_fail( def _make_index_content( - response: Response, cache_link_parsing: bool = True + response: Response, + cache_link_parsing: bool = True, ) -> IndexContent: encoding = _get_encoding_from_headers(response.headers) return IndexContent( @@ -330,11 +347,15 @@ def _make_index_content( encoding=encoding, url=response.url, cache_link_parsing=cache_link_parsing, + etag=response.headers.get("ETag", None), + date=response.headers.get("Date", None), ) -def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]: - url = link.url.split("#", 1)[0] +def _get_index_content( + link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None +) -> Optional["IndexContent"]: + url = link.url_without_fragment # Check for VCS schemes that do not support lookup as web pages. 
vcs_scheme = _match_vcs_scheme(url) @@ -361,7 +382,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon logger.debug(" file: URL is directory, getting %s", url) try: - resp = _get_simple_response(url, session=session) + resp = _get_simple_response(url, session=session, headers=headers) except _NotHTTP: logger.warning( "Skipping page %s because it looks like an archive, and cannot " @@ -377,9 +398,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon exc.request_desc, exc.content_type, ) - except NetworkConnectionError as exc: - _handle_get_simple_fail(link, exc) - except RetryError as exc: + except (NetworkConnectionError, RetryError) as exc: _handle_get_simple_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " @@ -454,11 +473,14 @@ class LinkCollector: def find_links(self) -> List[str]: return self.search_scope.find_links - def fetch_response(self, location: Link) -> Optional[IndexContent]: + def fetch_response( + self, location: Link, headers: Optional[Dict[str, str]] = None + ) -> Optional[IndexContent]: """ Fetch an HTML page containing package links. """ - return _get_index_content(location, session=self.session) + logger.debug("headers: %s", str(headers)) + return _get_index_content(location, session=self.session, headers=headers) def collect_sources( self, diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index 2121ca327..d84210307 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -4,8 +4,21 @@ import enum import functools import itertools import logging +import os import re -from typing import TYPE_CHECKING, FrozenSet, Iterable, List, Optional, Set, Tuple, Union +from hashlib import sha256 +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Dict, + FrozenSet, + Iterable, + List, + Optional, + Set, + Tuple, + Union, +) from pip._vendor.packaging import specifiers from pip._vendor.packaging.tags import Tag @@ -13,13 +26,14 @@ from pip._vendor.packaging.utils import canonicalize_name from pip._vendor.packaging.version import _BaseVersion from pip._vendor.packaging.version import parse as parse_version +from pip._internal.cache import FetchResolveCache from pip._internal.exceptions import ( BestVersionAlreadyInstalled, DistributionNotFound, InvalidWheelFilename, UnsupportedWheel, ) -from pip._internal.index.collector import LinkCollector, parse_links +from pip._internal.index.collector import IndexContent, LinkCollector, parse_links from pip._internal.models.candidate import InstallationCandidate from pip._internal.models.format_control import FormatControl from pip._internal.models.link import Link @@ -602,6 +616,7 @@ class PackageFinder: format_control: Optional[FormatControl] = None, candidate_prefs: Optional[CandidatePreferences] = None, ignore_requires_python: Optional[bool] = None, + fetch_resolve_cache: Optional[FetchResolveCache] = None, ) -> None: """ This constructor is primarily meant to be used by the create() class @@ -629,6 +644,8 @@ class PackageFinder: # These are boring links that have already been logged somehow. self._logged_links: Set[Tuple[Link, LinkType, str]] = set() + self._fetch_resolve_cache = fetch_resolve_cache + # Don't include an allow_yanked default value to make sure each call # site considers whether yanked releases are allowed. 
This also causes # that decision to be made explicit in the calling code, which helps @@ -639,6 +656,7 @@ class PackageFinder: link_collector: LinkCollector, selection_prefs: SelectionPreferences, target_python: Optional[TargetPython] = None, + fetch_resolve_cache: Optional[FetchResolveCache] = None, ) -> "PackageFinder": """Create a PackageFinder. @@ -663,6 +681,7 @@ class PackageFinder: allow_yanked=selection_prefs.allow_yanked, format_control=selection_prefs.format_control, ignore_requires_python=selection_prefs.ignore_requires_python, + fetch_resolve_cache=fetch_resolve_cache, ) @property @@ -781,18 +800,169 @@ class PackageFinder: return candidates - def process_project_url( + @staticmethod + def _try_load_http_cache_headers( + etag_path: Path, + date_path: Path, + checksum_path: Path, + project_url: Link, + headers: Dict[str, str], + ) -> Tuple[Optional[str], Optional[str], Optional[str]]: + etag: Optional[str] = None + try: + etag = etag_path.read_text() + logger.debug( + "found cached etag for url %s at %s: %s", + project_url, + etag_path, + etag, + ) + headers["If-None-Match"] = etag + except OSError as e: + logger.debug("no etag found for url %s (%s)", project_url, str(e)) + + date: Optional[str] = None + try: + date = date_path.read_text() + logger.debug( + "found cached date for url %s at %s: %s", + project_url, + date_path, + date, + ) + headers["If-Modified-Since"] = date + except OSError as e: + logger.debug("no date found for url %s (%s)", project_url, str(e)) + + checksum: Optional[str] = None + try: + checksum = checksum_path.read_text() + logger.debug( + "found checksum for url %s at %s: %s", + project_url, + checksum_path, + checksum, + ) + except OSError as e: + logger.debug("no checksum found for url %s (%s)", project_url, str(e)) + + return (etag, date, checksum) + + @staticmethod + def _write_http_cache_info( + etag_path: Path, + date_path: Path, + checksum_path: Path, + project_url: Link, + index_response: IndexContent, + prev_etag: Optional[str], + prev_checksum: Optional[str], + ) -> Tuple[Optional[str], Optional[str], str, bool]: + hasher = sha256() + hasher.update(index_response.content) + new_checksum = hasher.hexdigest() + checksum_path.write_text(new_checksum) + page_unmodified = new_checksum == prev_checksum + + new_etag = index_response.etag + if new_etag is None: + logger.debug("no etag returned from fetch for url %s", project_url.url) + try: + etag_path.unlink() + except OSError: + pass + elif new_etag != prev_etag: + logger.debug( + "etag for url %s updated from %s -> %s", + project_url.url, + prev_etag, + new_etag, + ) + etag_path.write_text(new_etag) + else: + logger.debug( + "etag was unmodified for url %s (%s)", project_url.url, prev_etag + ) + assert page_unmodified + + new_date = index_response.date + if new_date is None: + logger.debug( + "no date could be parsed from response for url %s", project_url + ) + try: + date_path.unlink() + except OSError: + pass + else: + logger.debug('date "%s" written for url %s', new_date, project_url) + date_path.write_text(new_date) + + return (new_etag, new_date, new_checksum, page_unmodified) + + def _process_project_url_uncached( self, project_url: Link, link_evaluator: LinkEvaluator ) -> List[InstallationCandidate]: - logger.debug( - "Fetching project page and analyzing links: %s", - project_url, - ) index_response = self._link_collector.fetch_response(project_url) if index_response is None: return [] - page_links = list(parse_links(index_response)) + page_links = parse_links(index_response) + + with 
indent_log(): + package_links = self.evaluate_links(link_evaluator, links=page_links) + return package_links + + @functools.lru_cache(maxsize=None) + def process_project_url( + self, project_url: Link, link_evaluator: LinkEvaluator + ) -> List[InstallationCandidate]: + if self._fetch_resolve_cache is None: + return self._process_project_url_uncached(project_url, link_evaluator) + + cached_path = self._fetch_resolve_cache.cache_path(project_url) + os.makedirs(str(cached_path), exist_ok=True) + + etag_path = cached_path / "etag" + date_path = cached_path / "modified-since-date" + checksum_path = cached_path / "checksum" + + headers: Dict[str, str] = {} + # NB: mutates headers! + prev_etag, _prev_date, prev_checksum = self._try_load_http_cache_headers( + etag_path, date_path, checksum_path, project_url, headers + ) + + logger.debug( + "Fetching project page and analyzing links: %s", + project_url, + ) + + # A 304 Not Modified is converted into the re-use of a cached response from the + # Cache-Control library, so we won't explicitly check for a 304. + index_response = self._link_collector.fetch_response( + project_url, + headers=headers, + ) + if index_response is None: + return [] + + ( + _new_etag, + _new_date, + _new_checksum, + page_unmodified, + ) = self._write_http_cache_info( + etag_path, + date_path, + checksum_path, + project_url, + index_response, + prev_etag=prev_etag, + prev_checksum=prev_checksum, + ) + + page_links = parse_links(index_response) with indent_log(): package_links = self.evaluate_links( diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 4453519ad..9f5a924c0 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -264,6 +264,7 @@ class Link(KeyBasedCompareMixin): cls, file_data: Dict[str, Any], page_url: str, + page_content: Optional["IndexContent"] = None, ) -> Optional["Link"]: """ Convert an pypi json document from a simple repository page into a Link. @@ -303,7 +304,7 @@ class Link(KeyBasedCompareMixin): return cls( url, - comes_from=page_url, + comes_from=page_content or page_url, requires_python=pyrequire, yanked_reason=yanked_reason, hashes=hashes, @@ -316,6 +317,7 @@ class Link(KeyBasedCompareMixin): anchor_attribs: Dict[str, Optional[str]], page_url: str, base_url: str, + page_content: Optional["IndexContent"] = None, ) -> Optional["Link"]: """ Convert an anchor element's attributes in a simple repository page to a Link. @@ -356,7 +358,7 @@ class Link(KeyBasedCompareMixin): return cls( url, - comes_from=page_url, + comes_from=page_content or page_url, requires_python=pyrequire, yanked_reason=yanked_reason, metadata_file_data=metadata_file_data, From c396596b7ba0e70b6d26fe8dbd03b5d8c614176d Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 3 Sep 2023 05:31:47 -0400 Subject: [PATCH 09/10] add NEWS --- news/12257.feature.rst | 1 + src/pip/_internal/cli/req_command.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 news/12257.feature.rst diff --git a/news/12257.feature.rst b/news/12257.feature.rst new file mode 100644 index 000000000..21ea8cf59 --- /dev/null +++ b/news/12257.feature.rst @@ -0,0 +1 @@ +Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled. 
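The revalidation flow behind this news entry: on each index fetch, the
previously saved validators are replayed as standard HTTP conditional-request
headers, letting the server answer 304 Not Modified instead of resending the
page. A minimal sketch, assuming only the per-URL ``etag`` and
``modified-since-date`` files that ``FetchResolveCache`` lays out above; this
is an illustration, not pip's implementation:

    from pathlib import Path
    from typing import Dict

    def revalidation_headers(cached_path: Path) -> Dict[str, str]:
        """Rebuild conditional-request headers from a cached index entry."""
        headers: Dict[str, str] = {}
        etag_path = cached_path / "etag"
        date_path = cached_path / "modified-since-date"
        if etag_path.is_file():
            # A matching tag lets the server skip the response body entirely.
            headers["If-None-Match"] = etag_path.read_text()
        if date_path.is_file():
            headers["If-Modified-Since"] = date_path.read_text()
        return headers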
diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index dc63e4aae..e63af8b61 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -506,7 +506,7 @@ class RequirementCommand(IndexGroupCommand): ignore_requires_python=ignore_requires_python, ) - if options.cache_dir: + if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled): fetch_resolve_cache = FetchResolveCache(options.cache_dir) else: fetch_resolve_cache = None From 5cc8a36eddcbb8c736032d4064b56a02f80393cd Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 3 Sep 2023 07:15:34 -0400 Subject: [PATCH 10/10] fix test failures --- src/pip/_internal/index/package_finder.py | 5 +++++ tests/unit/test_collector.py | 12 ++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index d84210307..36edd13d6 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -903,6 +903,11 @@ class PackageFinder: def _process_project_url_uncached( self, project_url: Link, link_evaluator: LinkEvaluator ) -> List[InstallationCandidate]: + logger.debug( + "Fetching project page and analyzing links: %s", + project_url, + ) + index_response = self._link_collector.fetch_response(project_url) if index_response is None: return [] diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 5410a4afc..6aca7bb15 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -90,7 +90,7 @@ def test_get_simple_response_archive_to_http_scheme( session.assert_has_calls( [ - mock.call.head(url, allow_redirects=True), + mock.call.head(url, allow_redirects=True, headers=None), ] ) mock_raise_for_status.assert_called_once_with(session.head.return_value) @@ -152,7 +152,7 @@ def test_get_simple_response_archive_to_http_scheme_is_html( assert resp is not None assert session.mock_calls == [ - mock.call.head(url, allow_redirects=True), + mock.call.head(url, allow_redirects=True, headers=None), mock.call.get( url, headers={ @@ -240,7 +240,7 @@ def test_get_simple_response_dont_log_clear_text_password( assert resp is not None mock_raise_for_status.assert_called_once_with(resp) - assert len(caplog.records) == 2 + assert len(caplog.records) == 3 record = caplog.records[0] assert record.levelname == "DEBUG" assert record.message.splitlines() == [ @@ -248,6 +248,9 @@ def test_get_simple_response_dont_log_clear_text_password( ] record = caplog.records[1] assert record.levelname == "DEBUG" + assert record.message.splitlines() == ["headers: None"] + record = caplog.records[2] + assert record.levelname == "DEBUG" assert record.message.splitlines() == [ "Fetched page https://user:****@example.com/simple/ as text/html", ] @@ -838,7 +841,7 @@ def test_get_index_content_directory_append_index(tmpdir: Path) -> None: mock_func.return_value = fake_response actual = _get_index_content(Link(dir_url), session=session) assert mock_func.mock_calls == [ - mock.call(expected_url, session=session), + mock.call(expected_url, headers=None, session=session), ], f"actual calls: {mock_func.mock_calls}" assert actual is not None @@ -957,6 +960,7 @@ class TestLinkCollector: # _get_simple_response(). mock_get_simple_response.assert_called_once_with( url, + headers=None, session=link_collector.session, )
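Taken together, the series leaves two opt-in caches behind
``--use-feature=metadata-cache``: gzipped JSON metadata entries under
``~/.cache/pip/link-metadata`` (patches 03 and 07) and HTTP validators under
``~/.cache/pip/fetch-resolve`` (patches 08 and 09). A self-contained sketch of
one metadata entry's round trip; the field names match ``CacheableDist``'s
``to_json()``/``from_json()``, while the file path and METADATA text are
invented for illustration:

    import gzip
    import json
    from pathlib import Path

    # Hypothetical entry: the raw METADATA text plus the two fields needed to
    # rebuild a metadata-only distribution later.
    entry = {
        "metadata": "Metadata-Version: 2.1\nName: simple\nVersion: 1.0\n",
        "filename": "simple-1.0-py2.py3-none-any.whl",
        "canonical_name": "simple",
    }

    path = Path("example-link-metadata-entry")  # hypothetical cache path
    with gzip.open(path, mode="wt", encoding="utf-8") as f:
        json.dump(entry, f)
    with gzip.open(path, mode="rt", encoding="utf-8") as f:
        assert json.load(f) == entry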