import itertools import json import logging import os import re import uuid from pathlib import Path from textwrap import dedent from typing import Dict, List, Optional, Tuple from unittest import mock import pytest from pip._vendor import requests from pip._vendor.packaging.requirements import Requirement from pip._internal.exceptions import NetworkConnectionError from pip._internal.index.collector import ( IndexContent, LinkCollector, _get_index_content, _get_simple_response, _make_index_content, _NotAPIContent, _NotHTTP, parse_links, ) from pip._internal.index.sources import _FlatDirectorySource, _IndexDirectorySource from pip._internal.models.candidate import InstallationCandidate from pip._internal.models.index import PyPI from pip._internal.models.link import ( Link, LinkHash, _clean_url_path, _ensure_quoted_url, ) from pip._internal.network.session import PipSession from tests.lib import TestData, make_test_link_collector ACCEPT = ", ".join( [ "application/vnd.pypi.simple.v1+json", "application/vnd.pypi.simple.v1+html; q=0.1", "text/html; q=0.01", ] ) @pytest.mark.parametrize( "url", [ "ftp://python.org/python-3.7.1.zip", "file:///opt/data/pip-18.0.tar.gz", ], ) def test_get_simple_response_archive_to_naive_scheme(url: str) -> None: """ `_get_simple_response()` should error on an archive-like URL if the scheme does not allow "poking" without getting data. """ with pytest.raises(_NotHTTP): _get_simple_response(url, session=mock.Mock(PipSession)) @pytest.mark.parametrize( "url, content_type", [ ("http://python.org/python-3.7.1.zip", "application/zip"), ("https://pypi.org/pip-18.0.tar.gz", "application/gzip"), ], ) @mock.patch("pip._internal.index.collector.raise_for_status") def test_get_simple_response_archive_to_http_scheme( mock_raise_for_status: mock.Mock, url: str, content_type: str ) -> None: """ `_get_simple_response()` should send a HEAD request on an archive-like URL if the scheme supports it, and raise `_NotAPIContent` if the response isn't HTML. """ session = mock.Mock(PipSession) session.head.return_value = mock.Mock( **{ "request.method": "HEAD", "headers": {"Content-Type": content_type}, } ) with pytest.raises(_NotAPIContent) as ctx: _get_simple_response(url, session=session) session.assert_has_calls( [ mock.call.head(url, allow_redirects=True), ] ) mock_raise_for_status.assert_called_once_with(session.head.return_value) assert ctx.value.args == (content_type, "HEAD") @pytest.mark.parametrize( "url", [ ("ftp://python.org/python-3.7.1.zip"), ("file:///opt/data/pip-18.0.tar.gz"), ], ) def test_get_index_content_invalid_content_type_archive( caplog: pytest.LogCaptureFixture, url: str ) -> None: """`_get_index_content()` should warn if an archive URL is not HTML and therefore cannot be used for a HEAD request. """ caplog.set_level(logging.WARNING) link = Link(url) session = mock.Mock(PipSession) assert _get_index_content(link, session=session) is None assert ( "pip._internal.index.collector", logging.WARNING, "Skipping page {} because it looks like an archive, and cannot " "be checked by a HTTP HEAD request.".format(url), ) in caplog.record_tuples @pytest.mark.parametrize( "url", [ "http://python.org/python-3.7.1.zip", "https://pypi.org/pip-18.0.tar.gz", ], ) @mock.patch("pip._internal.index.collector.raise_for_status") def test_get_simple_response_archive_to_http_scheme_is_html( mock_raise_for_status: mock.Mock, url: str ) -> None: """ `_get_simple_response()` should work with archive-like URLs if the HEAD request is responded with text/html. """ session = mock.Mock(PipSession) session.head.return_value = mock.Mock( **{ "request.method": "HEAD", "headers": {"Content-Type": "text/html"}, } ) session.get.return_value = mock.Mock(headers={"Content-Type": "text/html"}) resp = _get_simple_response(url, session=session) assert resp is not None assert session.mock_calls == [ mock.call.head(url, allow_redirects=True), mock.call.get( url, headers={ "Accept": ACCEPT, "Cache-Control": "max-age=0", }, ), ] assert mock_raise_for_status.mock_calls == [ mock.call(session.head.return_value), mock.call(resp), ] @pytest.mark.parametrize( "url", [ "https://pypi.org/simple/pip", "https://pypi.org/simple/pip/", "https://python.org/sitemap.xml", ], ) @mock.patch("pip._internal.index.collector.raise_for_status") def test_get_simple_response_no_head( mock_raise_for_status: mock.Mock, url: str ) -> None: """ `_get_simple_response()` shouldn't send a HEAD request if the URL does not look like an archive, only the GET request that retrieves data. """ session = mock.Mock(PipSession) # Mock the headers dict to ensure it is accessed. session.get.return_value = mock.Mock( headers=mock.Mock( **{ "get.return_value": "text/html", } ) ) resp = _get_simple_response(url, session=session) assert resp is not None assert session.head.call_count == 0 assert session.get.mock_calls == [ mock.call( url, headers={ "Accept": ACCEPT, "Cache-Control": "max-age=0", }, ), mock.call().headers.get("Content-Type", "Unknown"), mock.call().headers.get("Content-Type", "Unknown"), ] mock_raise_for_status.assert_called_once_with(resp) @mock.patch("pip._internal.index.collector.raise_for_status") def test_get_simple_response_dont_log_clear_text_password( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture ) -> None: """ `_get_simple_response()` should redact the password from the index URL in its DEBUG log message. """ session = mock.Mock(PipSession) # Mock the headers dict to ensure it is accessed. session.get.return_value = mock.Mock( headers=mock.Mock( **{ "get.return_value": "text/html", } ) ) caplog.set_level(logging.DEBUG) resp = _get_simple_response( "https://user:my_password@example.com/simple/", session=session ) assert resp is not None mock_raise_for_status.assert_called_once_with(resp) assert len(caplog.records) == 2 record = caplog.records[0] assert record.levelname == "DEBUG" assert record.message.splitlines() == [ "Getting page https://user:****@example.com/simple/", ] record = caplog.records[1] assert record.levelname == "DEBUG" assert record.message.splitlines() == [ "Fetched page https://user:****@example.com/simple/ as text/html", ] @pytest.mark.parametrize( ("path", "expected"), [ # Test a character that needs quoting. ("a b", "a%20b"), # Test an unquoted "@". ("a @ b", "a%20@%20b"), # Test multiple unquoted "@". ("a @ @ b", "a%20@%20@%20b"), # Test a quoted "@". ("a %40 b", "a%20%40%20b"), # Test a quoted "@" before an unquoted "@". ("a %40b@ c", "a%20%40b@%20c"), # Test a quoted "@" after an unquoted "@". ("a @b%40 c", "a%20@b%40%20c"), # Test alternating quoted and unquoted "@". ("a %40@b %40@c %40", "a%20%40@b%20%40@c%20%40"), # Test an unquoted "/". ("a / b", "a%20/%20b"), # Test multiple unquoted "/". ("a / / b", "a%20/%20/%20b"), # Test a quoted "/". ("a %2F b", "a%20%2F%20b"), # Test a quoted "/" before an unquoted "/". ("a %2Fb/ c", "a%20%2Fb/%20c"), # Test a quoted "/" after an unquoted "/". ("a /b%2F c", "a%20/b%2F%20c"), # Test alternating quoted and unquoted "/". ("a %2F/b %2F/c %2F", "a%20%2F/b%20%2F/c%20%2F"), # Test normalizing non-reserved quoted characters "[" and "]" ("a %5b %5d b", "a%20%5B%20%5D%20b"), # Test normalizing a reserved quoted "/" ("a %2f b", "a%20%2F%20b"), ], ) @pytest.mark.parametrize("is_local_path", [True, False]) def test_clean_url_path(path: str, expected: str, is_local_path: bool) -> None: assert _clean_url_path(path, is_local_path=is_local_path) == expected @pytest.mark.parametrize( ("path", "expected"), [ # Test a VCS path with a Windows drive letter and revision. pytest.param( "/T:/with space/repo.git@1.0", "///T:/with%20space/repo.git@1.0", marks=pytest.mark.skipif("sys.platform != 'win32'"), ), # Test a VCS path with a Windows drive letter and revision, # running on non-windows platform. pytest.param( "/T:/with space/repo.git@1.0", "/T%3A/with%20space/repo.git@1.0", marks=pytest.mark.skipif("sys.platform == 'win32'"), ), ], ) def test_clean_url_path_with_local_path(path: str, expected: str) -> None: actual = _clean_url_path(path, is_local_path=True) assert actual == expected @pytest.mark.parametrize( ("url", "clean_url"), [ # URL with hostname and port. Port separator should not be quoted. ( "https://localhost.localdomain:8181/path/with space/", "https://localhost.localdomain:8181/path/with%20space/", ), # URL that is already properly quoted. The quoting `%` # characters should not be quoted again. ( "https://localhost.localdomain:8181/path/with%20quoted%20space/", "https://localhost.localdomain:8181/path/with%20quoted%20space/", ), # URL with IPv4 address and port. ( "https://127.0.0.1:8181/path/with space/", "https://127.0.0.1:8181/path/with%20space/", ), # URL with IPv6 address and port. The `[]` brackets around the # IPv6 address should not be quoted. ( "https://[fd00:0:0:236::100]:8181/path/with space/", "https://[fd00:0:0:236::100]:8181/path/with%20space/", ), # URL with query. The leading `?` should not be quoted. ( "https://localhost.localdomain:8181/path/with/query?request=test", "https://localhost.localdomain:8181/path/with/query?request=test", ), # URL with colon in the path portion. ( "https://localhost.localdomain:8181/path:/with:/colon", "https://localhost.localdomain:8181/path%3A/with%3A/colon", ), # URL with something that looks like a drive letter, but is # not. The `:` should be quoted. ( "https://localhost.localdomain/T:/path/", "https://localhost.localdomain/T%3A/path/", ), # URL with a quoted "/" in the path portion. ( "https://example.com/access%2Ftoken/path/", "https://example.com/access%2Ftoken/path/", ), # VCS URL containing revision string. ( "git+ssh://example.com/path to/repo.git@1.0#egg=my-package-1.0", "git+ssh://example.com/path%20to/repo.git@1.0#egg=my-package-1.0", ), # VCS URL with a quoted "#" in the revision string. ( "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", ), # VCS URL with a quoted "@" in the revision string. ( "git+https://example.com/repo.git@at%40 space#egg=my-package-1.0", "git+https://example.com/repo.git@at%40%20space#egg=my-package-1.0", ), # URL with Windows drive letter. The `:` after the drive # letter should not be quoted. The trailing `/` should be # removed. pytest.param( "file:///T:/path/with spaces/", "file:///T:/path/with%20spaces", marks=pytest.mark.skipif("sys.platform != 'win32'"), ), # URL with Windows drive letter, running on non-windows # platform. The `:` after the drive should be quoted. pytest.param( "file:///T:/path/with spaces/", "file:///T%3A/path/with%20spaces/", marks=pytest.mark.skipif("sys.platform == 'win32'"), ), # Test a VCS URL with a Windows drive letter and revision. pytest.param( "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", "git+file:///T:/with%20space/repo.git@1.0#egg=my-package-1.0", marks=pytest.mark.skipif("sys.platform != 'win32'"), ), # Test a VCS URL with a Windows drive letter and revision, # running on non-windows platform. pytest.param( "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", "git+file:/T%3A/with%20space/repo.git@1.0#egg=my-package-1.0", marks=pytest.mark.skipif("sys.platform == 'win32'"), ), ], ) def test_ensure_quoted_url(url: str, clean_url: str) -> None: assert _ensure_quoted_url(url) == clean_url def _test_parse_links_data_attribute( anchor_html: str, attr: str, expected: Optional[str] ) -> Link: html = ( "" '
' "{}" ).format(anchor_html) html_bytes = html.encode("utf-8") page = IndexContent( html_bytes, "text/html", encoding=None, # parse_links() is cached by url, so we inject a random uuid to ensure # the page content isn't cached. url=f"https://example.com/simple-{uuid.uuid4()}/", ) links = list(parse_links(page)) (link,) = links actual = getattr(link, attr) assert actual == expected return link @pytest.mark.parametrize( "anchor_html, expected", [ # Test not present. ('', None), # Test present with no value. ('', None), # Test a value with an escaped character. ( '', ">=3.6", ), # Test requires python is unescaped once. ( '', ">=3.6", ), ], ) def test_parse_links__requires_python( anchor_html: str, expected: Optional[str] ) -> None: _test_parse_links_data_attribute(anchor_html, "requires_python", expected) # TODO: this test generates its own examples to validate the json client implementation # instead of sharing those examples with the html client testing. We expect this won't # hide any bugs because operations like resolving PEP 658 metadata should use the same # code for both types of indices, but it might be nice to explicitly have all our tests # in test_download.py execute over both html and json indices with # a pytest.mark.parameterize decorator to ensure nothing slips through the cracks. def test_parse_links_json() -> None: json_bytes = json.dumps( { "meta": {"api-version": "1.0"}, "name": "holygrail", "files": [ { "filename": "holygrail-1.0.tar.gz", "url": "https://example.com/files/holygrail-1.0.tar.gz", "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"}, "requires-python": ">=3.7", "yanked": "Had a vulnerability", }, { "filename": "holygrail-1.0-py3-none-any.whl", "url": "/files/holygrail-1.0-py3-none-any.whl", "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"}, "requires-python": ">=3.7", "dist-info-metadata": False, }, # Same as above, but parsing dist-info-metadata. { "filename": "holygrail-1.0-py3-none-any.whl", "url": "/files/holygrail-1.0-py3-none-any.whl", "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"}, "requires-python": ">=3.7", "dist-info-metadata": "sha512=aabdd41", }, ], } ).encode("utf8") page = IndexContent( json_bytes, "application/vnd.pypi.simple.v1+json", encoding=None, # parse_links() is cached by url, so we inject a random uuid to ensure # the page content isn't cached. url=f"https://example.com/simple-{uuid.uuid4()}/", ) links = list(parse_links(page)) assert links == [ Link( "https://example.com/files/holygrail-1.0.tar.gz", comes_from=page.url, requires_python=">=3.7", yanked_reason="Had a vulnerability", hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, ), Link( "https://example.com/files/holygrail-1.0-py3-none-any.whl", comes_from=page.url, requires_python=">=3.7", yanked_reason=None, hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, ), Link( "https://example.com/files/holygrail-1.0-py3-none-any.whl", comes_from=page.url, requires_python=">=3.7", yanked_reason=None, hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, dist_info_metadata="sha512=aabdd41", ), ] # Ensure the metadata info can be parsed into the correct link. metadata_link = links[2].metadata_link() assert metadata_link is not None assert ( metadata_link.url == "https://example.com/files/holygrail-1.0-py3-none-any.whl.metadata" ) assert metadata_link._hashes == {"sha512": "aabdd41"} @pytest.mark.parametrize( "anchor_html, expected", [ # Test not present. ('', None), # Test present with no value. ('', None), # Test the empty string. ('', ""), # Test a non-empty string. ('', "error"), # Test a value with an escaped character. ('', "version < 1"), # Test a yanked reason with a non-ascii character. ( '', "curlyquote \u2018", ), # Test yanked reason is unescaped once. ( '', "version < 1", ), ], ) def test_parse_links__yanked_reason(anchor_html: str, expected: Optional[str]) -> None: _test_parse_links_data_attribute(anchor_html, "yanked_reason", expected) # Requirement objects do not == each other unless they point to the same instance! _pkg1_requirement = Requirement("pkg1==1.0") @pytest.mark.parametrize( "anchor_html, expected, hashes", [ # Test not present. ( '', None, {}, ), # Test with value "true". ( '', "true", {}, ), # Test with a provided hash value. ( '', # noqa: E501 "sha256=aa113592bbe", {}, ), # Test with a provided hash value for both the requirement as well as metadata. ( '', # noqa: E501 "sha256=aa113592bbe", {"sha512": "abc132409cb"}, ), ], ) def test_parse_links__dist_info_metadata( anchor_html: str, expected: Optional[str], hashes: Dict[str, str], ) -> None: link = _test_parse_links_data_attribute(anchor_html, "dist_info_metadata", expected) assert link._hashes == hashes def test_parse_links_caches_same_page_by_url() -> None: html = ( "" '' '' ) html_bytes = html.encode("utf-8") url = "https://example.com/simple/" page_1 = IndexContent( html_bytes, "text/html", encoding=None, url=url, ) # Make a second page with zero content, to ensure that it's not accessed, # because the page was cached by url. page_2 = IndexContent( b"", "text/html", encoding=None, url=url, ) # Make a third page which represents an index url, which should not be # cached, even for the same url. We modify the page content slightly to # verify that the result is not cached. page_3 = IndexContent( re.sub(b"pkg1", b"pkg2", html_bytes), "text/html", encoding=None, url=url, cache_link_parsing=False, ) parsed_links_1 = list(parse_links(page_1)) assert len(parsed_links_1) == 1 assert "pkg1" in parsed_links_1[0].url parsed_links_2 = list(parse_links(page_2)) assert parsed_links_2 == parsed_links_1 parsed_links_3 = list(parse_links(page_3)) assert len(parsed_links_3) == 1 assert parsed_links_3 != parsed_links_1 assert "pkg2" in parsed_links_3[0].url @mock.patch("pip._internal.index.collector.raise_for_status") def test_request_http_error( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture ) -> None: caplog.set_level(logging.DEBUG) link = Link("http://localhost") session = mock.Mock(PipSession) session.get.return_value = mock.Mock() mock_raise_for_status.side_effect = NetworkConnectionError("Http error") assert _get_index_content(link, session=session) is None assert "Could not fetch URL http://localhost: Http error - skipping" in caplog.text def test_request_retries(caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.DEBUG) link = Link("http://localhost") session = mock.Mock(PipSession) session.get.side_effect = requests.exceptions.RetryError("Retry error") assert _get_index_content(link, session=session) is None assert "Could not fetch URL http://localhost: Retry error - skipping" in caplog.text def test_make_index_content() -> None: headers = {"Content-Type": "text/html; charset=UTF-8"} response = mock.Mock( content=b"