Mirror of https://github.com/pypa/pip, synced 2023-12-13 21:30:23 +01:00
Merge pull request #5838 from uranusjr/htmlpage-extract-breakdown-get-page
Refactor _get_html_page() to use exceptions for flow control
commit 8dbbe165f8
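The idea of the refactor, in brief: the page-fetching helpers now signal failure by raising dedicated exception types (_NotHTTP, _NotHTML) instead of returning sentinel values, and _get_html_page() catches them in one place to log and skip the page. A minimal sketch of that pattern, not pip's actual code (the names fetch_html and get_page are made up for illustration):

class NotHTML(Exception):
    """Raised when a fetched response turns out not to be an HTML page."""

def fetch_html(url, session):
    # session is any requests-style session object.
    resp = session.get(url, headers={"Accept": "text/html"})
    resp.raise_for_status()
    content_type = resp.headers.get("Content-Type", "")
    if not content_type.lower().startswith("text/html"):
        # Signal the problem instead of returning None and making
        # every call site re-check the header.
        raise NotHTML(content_type)
    return resp

def get_page(url, session):
    try:
        return fetch_html(url, session)
    except NotHTML as exc:
        # One place decides what "not HTML" means: log it and skip the page.
        print("Skipping %s (Content-Type: %s)" % (url, exc.args[0]))
        return None

The pip versions of these pieces appear in the diff below.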
news/5838.bugfix (new file, 1 line)
@@ -0,0 +1 @@
+Fix content type detection if a directory named like an archive is used as a package source.
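For context on the bugfix note above: pip used to decide whether a page was an archive purely from the end of the URL, so a local directory whose name ends in an archive extension was probed for its Content-Type and then skipped. With this change, a directory file: URL is rewritten to point at its index.html before the archive-extension check runs, so such a directory is read as an index page again. A small sketch of that rewrite step only (the path is made up):

from pip._vendor.six.moves.urllib import parse as urllib_parse

# A local package directory whose *name* looks like an archive file.
url = "file:///srv/packages/pip-18.0.tar.gz"  # hypothetical path; it is a directory

# Add a trailing slash so urljoin keeps the final path segment,
# then point at the directory's index page.
if not url.endswith('/'):
    url += '/'
url = urllib_parse.urljoin(url, 'index.html')
print(url)  # file:///srv/packages/pip-18.0.tar.gz/index.html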
src/pip/_internal/index.py

@@ -59,18 +59,113 @@ SECURE_ORIGINS = [
 logger = logging.getLogger(__name__)
 
 
-def _get_content_type(url, session):
-    """Get the Content-Type of the given url, using a HEAD request"""
+def _match_vcs_scheme(url):
+    """Look for VCS schemes in the URL.
+
+    Returns the matched VCS scheme, or None if there's no match.
+    """
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            return scheme
+    return None
+
+
+def _is_url_like_archive(url):
+    """Return whether the URL looks like an archive.
+    """
+    filename = Link(url).filename
+    for bad_ext in ARCHIVE_EXTENSIONS:
+        if filename.endswith(bad_ext):
+            return True
+    return False
+
+
+class _NotHTML(Exception):
+    def __init__(self, content_type, request_desc):
+        super(_NotHTML, self).__init__(content_type, request_desc)
+        self.content_type = content_type
+        self.request_desc = request_desc
+
+
+def _ensure_html_header(response):
+    """Check the Content-Type header to ensure the response contains HTML.
+
+    Raises `_NotHTML` if the content type is not text/html.
+    """
+    content_type = response.headers.get("Content-Type", "")
+    if not content_type.lower().startswith("text/html"):
+        raise _NotHTML(content_type, response.request.method)
+
+
+class _NotHTTP(Exception):
+    pass
+
+
+def _ensure_html_response(url, session):
+    """Send a HEAD request to the URL, and ensure the response contains HTML.
+
+    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
+    `_NotHTML` if the content type is not text/html.
+    """
     scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
     if scheme not in {'http', 'https'}:
-        # FIXME: some warning or something?
-        # assertion error?
-        return ''
+        raise _NotHTTP()
 
     resp = session.head(url, allow_redirects=True)
     resp.raise_for_status()
 
-    return resp.headers.get("Content-Type", "")
+    _ensure_html_header(resp)
+
+
+def _get_html_response(url, session):
+    """Access an HTML page with GET, and return the response.
+
+    This consists of three parts:
+
+    1. If the URL looks suspiciously like an archive, send a HEAD first to
+       check the Content-Type is HTML, to avoid downloading a large file.
+       Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotHTML` if it is not HTML.
+    2. Actually perform the request. Raise HTTP exceptions on network failures.
+    3. Check the Content-Type header to make sure we got HTML, and raise
+       `_NotHTML` otherwise.
+    """
+    if _is_url_like_archive(url):
+        _ensure_html_response(url, session=session)
+
+    logger.debug('Getting page %s', url)
+
+    resp = session.get(
+        url,
+        headers={
+            "Accept": "text/html",
+            # We don't want to blindly returned cached data for
+            # /simple/, because authors generally expecting that
+            # twine upload && pip install will function, but if
+            # they've done a pip install in the last ~10 minutes
+            # it won't. Thus by setting this to zero we will not
+            # blindly use any cached data, however the benefit of
+            # using max-age=0 instead of no-cache, is that we will
+            # still support conditional requests, so we will still
+            # minimize traffic sent in cases where the page hasn't
+            # changed at all, we will just always incur the round
+            # trip for the conditional GET now instead of only
+            # once per 10 minutes.
+            # For more information, please see pypa/pip#5670.
+            "Cache-Control": "max-age=0",
+        },
+    )
+    resp.raise_for_status()
+
+    # The check for archives above only works if the url ends with
+    # something that looks like an archive. However that is not a
+    # requirement of an url. Unless we issue a HEAD request on every
+    # url we cannot know ahead of time for sure if something is HTML
+    # or not. However we can check after we've downloaded it.
+    _ensure_html_header(resp)
+
+    return resp
 
 
 def _handle_get_page_fail(link, reason, url, meth=None):
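As a quick illustration of the two predicate helpers introduced above, this is roughly what they return for a few arbitrary URLs, assuming a pip checkout that includes this change:

from pip._internal.index import _is_url_like_archive, _match_vcs_scheme

print(_match_vcs_scheme("git+https://github.com/pypa/pip.git"))  # "git"
print(_match_vcs_scheme("https://pypi.org/simple/pip/"))          # None

print(_is_url_like_archive("https://pypi.org/pip-18.0.tar.gz"))   # True
print(_is_url_like_archive("https://pypi.org/simple/pip/"))       # False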
@@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
             "_get_html_page() missing 1 required keyword argument: 'session'"
         )
 
-    url = link.url
-    url = url.split('#', 1)[0]
+    url = link.url.split('#', 1)[0]
 
     # Check for VCS schemes that do not support lookup as web pages.
-    from pip._internal.vcs import VcsSupport
-    for scheme in VcsSupport.schemes:
-        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-            logger.debug('Cannot look at %s URL %s', scheme, link)
-            return None
+    vcs_scheme = _match_vcs_scheme(url)
+    if vcs_scheme:
+        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
+        return None
+
+    # Tack index.html onto file:// URLs that point to directories
+    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
+    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
+        # add trailing slash if not present so urljoin doesn't trim
+        # final segment
+        if not url.endswith('/'):
+            url += '/'
+        url = urllib_parse.urljoin(url, 'index.html')
+        logger.debug(' file: URL is directory, getting %s', url)
 
     try:
-        filename = link.filename
-        for bad_ext in ARCHIVE_EXTENSIONS:
-            if filename.endswith(bad_ext):
-                content_type = _get_content_type(url, session=session)
-                if content_type.lower().startswith('text/html'):
-                    break
-                else:
-                    logger.debug(
-                        'Skipping page %s because of Content-Type: %s',
-                        link,
-                        content_type,
-                    )
-                    return
-
-        logger.debug('Getting page %s', url)
-
-        # Tack index.html onto file:// URLs that point to directories
-        (scheme, netloc, path, params, query, fragment) = \
-            urllib_parse.urlparse(url)
-        if (scheme == 'file' and
-                os.path.isdir(urllib_request.url2pathname(path))):
-            # add trailing slash if not present so urljoin doesn't trim
-            # final segment
-            if not url.endswith('/'):
-                url += '/'
-            url = urllib_parse.urljoin(url, 'index.html')
-            logger.debug(' file: URL is directory, getting %s', url)
-
-        resp = session.get(
-            url,
-            headers={
-                "Accept": "text/html",
-                # We don't want to blindly returned cached data for
-                # /simple/, because authors generally expecting that
-                # twine upload && pip install will function, but if
-                # they've done a pip install in the last ~10 minutes
-                # it won't. Thus by setting this to zero we will not
-                # blindly use any cached data, however the benefit of
-                # using max-age=0 instead of no-cache, is that we will
-                # still support conditional requests, so we will still
-                # minimize traffic sent in cases where the page hasn't
-                # changed at all, we will just always incur the round
-                # trip for the conditional GET now instead of only
-                # once per 10 minutes.
-                # For more information, please see pypa/pip#5670.
-                "Cache-Control": "max-age=0",
-            },
-        )
-        resp.raise_for_status()
-
-        # The check for archives above only works if the url ends with
-        # something that looks like an archive. However that is not a
-        # requirement of an url. Unless we issue a HEAD request on every
-        # url we cannot know ahead of time for sure if something is HTML
-        # or not. However we can check after we've downloaded it.
-        content_type = resp.headers.get('Content-Type', 'unknown')
-        if not content_type.lower().startswith("text/html"):
-            logger.debug(
-                'Skipping page %s because of Content-Type: %s',
-                link,
-                content_type,
-            )
-            return
-
-        inst = HTMLPage(resp.content, resp.url, resp.headers)
+        resp = _get_html_response(url, session=session)
+    except _NotHTTP as exc:
+        logger.debug(
+            'Skipping page %s because it looks like an archive, and cannot '
+            'be checked by HEAD.', link,
+        )
+    except _NotHTML as exc:
+        logger.debug(
+            'Skipping page %s because the %s request got Content-Type: %s',
+            link, exc.request_desc, exc.content_type,
+        )
     except requests.HTTPError as exc:
         _handle_get_page_fail(link, exc, url)
     except RetryError as exc:
@@ -174,7 +223,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out", url)
     else:
-        return inst
+        return HTMLPage(resp.content, resp.url, resp.headers)
 
 
 class PackageFinder(object):
@@ -679,7 +728,7 @@ class PackageFinder(object):
                 continue
             seen.add(location)
 
-            page = self._get_page(location)
+            page = _get_html_page(location, session=self.session)
             if page is None:
                 continue
 
@@ -796,9 +845,6 @@ class PackageFinder(object):
 
         return InstallationCandidate(search.supplied, version, link)
 
-    def _get_page(self, link):
-        return _get_html_page(link, session=self.session)
-
 
 def egg_info_matches(
         egg_info, search_name, link,
tests/unit/test_index_html_page.py (new file, 162 lines)
@@ -0,0 +1,162 @@
+import logging
+
+import mock
+import pytest
+from pip._vendor.six.moves.urllib import request as urllib_request
+
+from pip._internal.download import PipSession
+from pip._internal.index import (
+    Link, _get_html_page, _get_html_response, _NotHTML, _NotHTTP,
+)
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "ftp://python.org/python-3.7.1.zip",
+        "file:///opt/data/pip-18.0.tar.gz",
+    ],
+)
+def test_get_html_response_archive_to_naive_scheme(url):
+    """
+    `_get_html_response()` should error on an archive-like URL if the scheme
+    does not allow "poking" without getting data.
+    """
+    with pytest.raises(_NotHTTP):
+        _get_html_response(url, session=mock.Mock(PipSession))
+
+
+@pytest.mark.parametrize(
+    "url, content_type",
+    [
+        ("http://python.org/python-3.7.1.zip", "application/zip"),
+        ("https://pypi.org/pip-18.0.tar.gz", "application/gzip"),
+    ],
+)
+def test_get_html_response_archive_to_http_scheme(url, content_type):
+    """
+    `_get_html_response()` should send a HEAD request on an archive-like URL
+    if the scheme supports it, and raise `_NotHTML` if the response isn't HTML.
+    """
+    session = mock.Mock(PipSession)
+    session.head.return_value = mock.Mock(**{
+        "request.method": "HEAD",
+        "headers": {"Content-Type": content_type},
+    })
+
+    with pytest.raises(_NotHTML) as ctx:
+        _get_html_response(url, session=session)
+
+    session.assert_has_calls([
+        mock.call.head(url, allow_redirects=True),
+    ])
+    assert ctx.value.args == (content_type, "HEAD")
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "http://python.org/python-3.7.1.zip",
+        "https://pypi.org/pip-18.0.tar.gz",
+    ],
+)
+def test_get_html_response_archive_to_http_scheme_is_html(url):
+    """
+    `_get_html_response()` should work with archive-like URLs if the HEAD
+    request is responded with text/html.
+    """
+    session = mock.Mock(PipSession)
+    session.head.return_value = mock.Mock(**{
+        "request.method": "HEAD",
+        "headers": {"Content-Type": "text/html"},
+    })
+    session.get.return_value = mock.Mock(headers={"Content-Type": "text/html"})
+
+    resp = _get_html_response(url, session=session)
+
+    assert resp is not None
+    assert session.mock_calls == [
+        mock.call.head(url, allow_redirects=True),
+        mock.call.head().raise_for_status(),
+        mock.call.get(url, headers={
+            "Accept": "text/html", "Cache-Control": "max-age=0",
+        }),
+        mock.call.get().raise_for_status(),
+    ]
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "https://pypi.org/simple/pip",
+        "https://pypi.org/simple/pip/",
+        "https://python.org/sitemap.xml",
+    ],
+)
+def test_get_html_response_no_head(url):
+    """
+    `_get_html_response()` shouldn't send a HEAD request if the URL does not
+    look like an archive, only the GET request that retrieves data.
+    """
+    session = mock.Mock(PipSession)
+
+    # Mock the headers dict to ensure it is accessed.
+    session.get.return_value = mock.Mock(headers=mock.Mock(**{
+        "get.return_value": "text/html",
+    }))
+
+    resp = _get_html_response(url, session=session)
+
+    assert resp is not None
+    assert session.head.call_count == 0
+    assert session.get.mock_calls == [
+        mock.call(url, headers={
+            "Accept": "text/html", "Cache-Control": "max-age=0",
+        }),
+        mock.call().raise_for_status(),
+        mock.call().headers.get("Content-Type", ""),
+    ]
+
+
+@pytest.mark.parametrize(
+    "url, vcs_scheme",
+    [
+        ("svn+http://pypi.org/something", "svn"),
+        ("git+https://github.com/pypa/pip.git", "git"),
+    ],
+)
+def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme):
+    """`_get_html_page()` should error if an invalid scheme is given.
+
+    Only file:, http:, https:, and ftp: are allowed.
+    """
+    with caplog.at_level(logging.DEBUG):
+        page = _get_html_page(Link(url), session=mock.Mock(PipSession))
+
+    assert page is None
+    assert caplog.record_tuples == [
+        (
+            "pip._internal.index",
+            logging.DEBUG,
+            "Cannot look at {} URL {}".format(vcs_scheme, url),
+        ),
+    ]
+
+
+def test_get_html_page_directory_append_index(tmpdir):
+    """`_get_html_page()` should append "index.html" to a directory URL.
+    """
+    dirpath = tmpdir.mkdir("something")
+    dir_url = "file:///{}".format(
+        urllib_request.pathname2url(dirpath).lstrip("/"),
+    )
+
+    session = mock.Mock(PipSession)
+    with mock.patch("pip._internal.index._get_html_response") as mock_func:
+        _get_html_page(Link(dir_url), session=session)
+        assert mock_func.mock_calls == [
+            mock.call(
+                "{}/index.html".format(dir_url.rstrip("/")),
+                session=session,
+            ),
+        ]