1
1
Fork 0
mirror of https://github.com/pypa/pip synced 2023-12-13 21:30:23 +01:00

Merge pull request #5838 from uranusjr/htmlpage-extract-breakdown-get-page

Refactor _get_html_page() to use exceptions for flow control
This commit is contained in:
Chris Jerdonek 2018-10-11 12:27:17 -07:00 committed by GitHub
commit 8dbbe165f8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 291 additions and 82 deletions

1
news/5838.bugfix Normal file
View file

@ -0,0 +1 @@
Fix content type detection if a directory named like an archive is used as a package source.

View file

@ -59,18 +59,113 @@ SECURE_ORIGINS = [
logger = logging.getLogger(__name__)
def _get_content_type(url, session):
"""Get the Content-Type of the given url, using a HEAD request"""
def _match_vcs_scheme(url):
"""Look for VCS schemes in the URL.
Returns the matched VCS scheme, or None if there's no match.
"""
from pip._internal.vcs import VcsSupport
for scheme in VcsSupport.schemes:
if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
return scheme
return None
def _is_url_like_archive(url):
"""Return whether the URL looks like an archive.
"""
filename = Link(url).filename
for bad_ext in ARCHIVE_EXTENSIONS:
if filename.endswith(bad_ext):
return True
return False
class _NotHTML(Exception):
def __init__(self, content_type, request_desc):
super(_NotHTML, self).__init__(content_type, request_desc)
self.content_type = content_type
self.request_desc = request_desc
def _ensure_html_header(response):
"""Check the Content-Type header to ensure the response contains HTML.
Raises `_NotHTML` if the content type is not text/html.
"""
content_type = response.headers.get("Content-Type", "")
if not content_type.lower().startswith("text/html"):
raise _NotHTML(content_type, response.request.method)
class _NotHTTP(Exception):
pass
def _ensure_html_response(url, session):
"""Send a HEAD request to the URL, and ensure the response contains HTML.
Raises `_NotHTTP` if the URL is not available for a HEAD request, or
`_NotHTML` if the content type is not text/html.
"""
scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
if scheme not in {'http', 'https'}:
# FIXME: some warning or something?
# assertion error?
return ''
raise _NotHTTP()
resp = session.head(url, allow_redirects=True)
resp.raise_for_status()
return resp.headers.get("Content-Type", "")
_ensure_html_header(resp)
def _get_html_response(url, session):
"""Access an HTML page with GET, and return the response.
This consists of three parts:
1. If the URL looks suspiciously like an archive, send a HEAD first to
check the Content-Type is HTML, to avoid downloading a large file.
Raise `_NotHTTP` if the content type cannot be determined, or
`_NotHTML` if it is not HTML.
2. Actually perform the request. Raise HTTP exceptions on network failures.
3. Check the Content-Type header to make sure we got HTML, and raise
`_NotHTML` otherwise.
"""
if _is_url_like_archive(url):
_ensure_html_response(url, session=session)
logger.debug('Getting page %s', url)
resp = session.get(
url,
headers={
"Accept": "text/html",
# We don't want to blindly returned cached data for
# /simple/, because authors generally expecting that
# twine upload && pip install will function, but if
# they've done a pip install in the last ~10 minutes
# it won't. Thus by setting this to zero we will not
# blindly use any cached data, however the benefit of
# using max-age=0 instead of no-cache, is that we will
# still support conditional requests, so we will still
# minimize traffic sent in cases where the page hasn't
# changed at all, we will just always incur the round
# trip for the conditional GET now instead of only
# once per 10 minutes.
# For more information, please see pypa/pip#5670.
"Cache-Control": "max-age=0",
},
)
resp.raise_for_status()
# The check for archives above only works if the url ends with
# something that looks like an archive. However that is not a
# requirement of an url. Unless we issue a HEAD request on every
# url we cannot know ahead of time for sure if something is HTML
# or not. However we can check after we've downloaded it.
_ensure_html_header(resp)
return resp
def _handle_get_page_fail(link, reason, url, meth=None):
@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
"_get_html_page() missing 1 required keyword argument: 'session'"
)
url = link.url
url = url.split('#', 1)[0]
url = link.url.split('#', 1)[0]
# Check for VCS schemes that do not support lookup as web pages.
from pip._internal.vcs import VcsSupport
for scheme in VcsSupport.schemes:
if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
logger.debug('Cannot look at %s URL %s', scheme, link)
return None
vcs_scheme = _match_vcs_scheme(url)
if vcs_scheme:
logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
return None
# Tack index.html onto file:// URLs that point to directories
scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
# add trailing slash if not present so urljoin doesn't trim
# final segment
if not url.endswith('/'):
url += '/'
url = urllib_parse.urljoin(url, 'index.html')
logger.debug(' file: URL is directory, getting %s', url)
try:
filename = link.filename
for bad_ext in ARCHIVE_EXTENSIONS:
if filename.endswith(bad_ext):
content_type = _get_content_type(url, session=session)
if content_type.lower().startswith('text/html'):
break
else:
logger.debug(
'Skipping page %s because of Content-Type: %s',
link,
content_type,
)
return
logger.debug('Getting page %s', url)
# Tack index.html onto file:// URLs that point to directories
(scheme, netloc, path, params, query, fragment) = \
urllib_parse.urlparse(url)
if (scheme == 'file' and
os.path.isdir(urllib_request.url2pathname(path))):
# add trailing slash if not present so urljoin doesn't trim
# final segment
if not url.endswith('/'):
url += '/'
url = urllib_parse.urljoin(url, 'index.html')
logger.debug(' file: URL is directory, getting %s', url)
resp = session.get(
url,
headers={
"Accept": "text/html",
# We don't want to blindly returned cached data for
# /simple/, because authors generally expecting that
# twine upload && pip install will function, but if
# they've done a pip install in the last ~10 minutes
# it won't. Thus by setting this to zero we will not
# blindly use any cached data, however the benefit of
# using max-age=0 instead of no-cache, is that we will
# still support conditional requests, so we will still
# minimize traffic sent in cases where the page hasn't
# changed at all, we will just always incur the round
# trip for the conditional GET now instead of only
# once per 10 minutes.
# For more information, please see pypa/pip#5670.
"Cache-Control": "max-age=0",
},
resp = _get_html_response(url, session=session)
except _NotHTTP as exc:
logger.debug(
'Skipping page %s because it looks like an archive, and cannot '
'be checked by HEAD.', link,
)
except _NotHTML as exc:
logger.debug(
'Skipping page %s because the %s request got Content-Type: %s',
link, exc.request_desc, exc.content_type,
)
resp.raise_for_status()
# The check for archives above only works if the url ends with
# something that looks like an archive. However that is not a
# requirement of an url. Unless we issue a HEAD request on every
# url we cannot know ahead of time for sure if something is HTML
# or not. However we can check after we've downloaded it.
content_type = resp.headers.get('Content-Type', 'unknown')
if not content_type.lower().startswith("text/html"):
logger.debug(
'Skipping page %s because of Content-Type: %s',
link,
content_type,
)
return
inst = HTMLPage(resp.content, resp.url, resp.headers)
except requests.HTTPError as exc:
_handle_get_page_fail(link, exc, url)
except RetryError as exc:
@ -174,7 +223,7 @@ def _get_html_page(link, session=None):
except requests.Timeout:
_handle_get_page_fail(link, "timed out", url)
else:
return inst
return HTMLPage(resp.content, resp.url, resp.headers)
class PackageFinder(object):
@ -679,7 +728,7 @@ class PackageFinder(object):
continue
seen.add(location)
page = self._get_page(location)
page = _get_html_page(location, session=self.session)
if page is None:
continue
@ -796,9 +845,6 @@ class PackageFinder(object):
return InstallationCandidate(search.supplied, version, link)
def _get_page(self, link):
return _get_html_page(link, session=self.session)
def egg_info_matches(
egg_info, search_name, link,

View file

@ -0,0 +1,162 @@
import logging
import mock
import pytest
from pip._vendor.six.moves.urllib import request as urllib_request
from pip._internal.download import PipSession
from pip._internal.index import (
Link, _get_html_page, _get_html_response, _NotHTML, _NotHTTP,
)
@pytest.mark.parametrize(
"url",
[
"ftp://python.org/python-3.7.1.zip",
"file:///opt/data/pip-18.0.tar.gz",
],
)
def test_get_html_response_archive_to_naive_scheme(url):
"""
`_get_html_response()` should error on an archive-like URL if the scheme
does not allow "poking" without getting data.
"""
with pytest.raises(_NotHTTP):
_get_html_response(url, session=mock.Mock(PipSession))
@pytest.mark.parametrize(
"url, content_type",
[
("http://python.org/python-3.7.1.zip", "application/zip"),
("https://pypi.org/pip-18.0.tar.gz", "application/gzip"),
],
)
def test_get_html_response_archive_to_http_scheme(url, content_type):
"""
`_get_html_response()` should send a HEAD request on an archive-like URL
if the scheme supports it, and raise `_NotHTML` if the response isn't HTML.
"""
session = mock.Mock(PipSession)
session.head.return_value = mock.Mock(**{
"request.method": "HEAD",
"headers": {"Content-Type": content_type},
})
with pytest.raises(_NotHTML) as ctx:
_get_html_response(url, session=session)
session.assert_has_calls([
mock.call.head(url, allow_redirects=True),
])
assert ctx.value.args == (content_type, "HEAD")
@pytest.mark.parametrize(
"url",
[
"http://python.org/python-3.7.1.zip",
"https://pypi.org/pip-18.0.tar.gz",
],
)
def test_get_html_response_archive_to_http_scheme_is_html(url):
"""
`_get_html_response()` should work with archive-like URLs if the HEAD
request is responded with text/html.
"""
session = mock.Mock(PipSession)
session.head.return_value = mock.Mock(**{
"request.method": "HEAD",
"headers": {"Content-Type": "text/html"},
})
session.get.return_value = mock.Mock(headers={"Content-Type": "text/html"})
resp = _get_html_response(url, session=session)
assert resp is not None
assert session.mock_calls == [
mock.call.head(url, allow_redirects=True),
mock.call.head().raise_for_status(),
mock.call.get(url, headers={
"Accept": "text/html", "Cache-Control": "max-age=0",
}),
mock.call.get().raise_for_status(),
]
@pytest.mark.parametrize(
"url",
[
"https://pypi.org/simple/pip",
"https://pypi.org/simple/pip/",
"https://python.org/sitemap.xml",
],
)
def test_get_html_response_no_head(url):
"""
`_get_html_response()` shouldn't send a HEAD request if the URL does not
look like an archive, only the GET request that retrieves data.
"""
session = mock.Mock(PipSession)
# Mock the headers dict to ensure it is accessed.
session.get.return_value = mock.Mock(headers=mock.Mock(**{
"get.return_value": "text/html",
}))
resp = _get_html_response(url, session=session)
assert resp is not None
assert session.head.call_count == 0
assert session.get.mock_calls == [
mock.call(url, headers={
"Accept": "text/html", "Cache-Control": "max-age=0",
}),
mock.call().raise_for_status(),
mock.call().headers.get("Content-Type", ""),
]
@pytest.mark.parametrize(
"url, vcs_scheme",
[
("svn+http://pypi.org/something", "svn"),
("git+https://github.com/pypa/pip.git", "git"),
],
)
def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme):
"""`_get_html_page()` should error if an invalid scheme is given.
Only file:, http:, https:, and ftp: are allowed.
"""
with caplog.at_level(logging.DEBUG):
page = _get_html_page(Link(url), session=mock.Mock(PipSession))
assert page is None
assert caplog.record_tuples == [
(
"pip._internal.index",
logging.DEBUG,
"Cannot look at {} URL {}".format(vcs_scheme, url),
),
]
def test_get_html_page_directory_append_index(tmpdir):
"""`_get_html_page()` should append "index.html" to a directory URL.
"""
dirpath = tmpdir.mkdir("something")
dir_url = "file:///{}".format(
urllib_request.pathname2url(dirpath).lstrip("/"),
)
session = mock.Mock(PipSession)
with mock.patch("pip._internal.index._get_html_response") as mock_func:
_get_html_page(Link(dir_url), session=session)
assert mock_func.mock_calls == [
mock.call(
"{}/index.html".format(dir_url.rstrip("/")),
session=session,
),
]