mirror of https://github.com/pypa/pip
Extract content type detection logic
This commit is contained in:
parent
44713cafe5
commit
bad04d8549
|
@ -71,8 +71,28 @@ def _match_vcs_scheme(url):
|
|||
return None
|
||||
|
||||
|
||||
def _get_content_type(url, session):
|
||||
"""Get the Content-Type of the given url, using a HEAD request"""
|
||||
def _is_filename_like_archive(filename):
|
||||
"""Check whether the filename looks like an archive.
|
||||
"""
|
||||
for bad_ext in ARCHIVE_EXTENSIONS:
|
||||
if filename.endswith(bad_ext):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class _NotHTML(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def _ensure_html_header(headers):
|
||||
content_type = headers.get("Content-Type", "")
|
||||
if not content_type.lower().startswith("text/html"):
|
||||
raise _NotHTML(content_type)
|
||||
|
||||
|
||||
def _ensure_html_response(url, session):
|
||||
"""Send a HEAD request to the URL, and ensure the Content-Type is HTML.
|
||||
"""
|
||||
scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
|
||||
if scheme not in {'http', 'https'}:
|
||||
# FIXME: some warning or something?
|
||||
|
@ -82,33 +102,20 @@ def _get_content_type(url, session):
|
|||
resp = session.head(url, allow_redirects=True)
|
||||
resp.raise_for_status()
|
||||
|
||||
return resp.headers.get("Content-Type", "")
|
||||
_ensure_html_header(resp.headers)
|
||||
|
||||
|
||||
class _NotHTML(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def _get_html_response(url, filename, session):
|
||||
def _get_html_response(url, session):
|
||||
"""Access an HTML page with GET, and return the response.
|
||||
|
||||
This consists of five parts:
|
||||
This consists of three parts:
|
||||
|
||||
1. Check whether the URL looks like an archive. If it does, make a HEAD
|
||||
request to check the Content-Type header is indeed HTML, and raise
|
||||
`_NotHTML` if it's not. Raise HTTP exceptions on network failures.
|
||||
2. If URL scheme is file: and points to a directory, make it point to
|
||||
1. If URL scheme is file: and points to a directory, make it point to
|
||||
index.html instead.
|
||||
3. Actually perform the request. Raise HTTP exceptions on network failures.
|
||||
4. Check whether Content-Type header to make sure the thing we got is HTML,
|
||||
2. Actually perform the request. Raise HTTP exceptions on network failures.
|
||||
3. Check whether Content-Type header to make sure the thing we got is HTML,
|
||||
and raise `_NotHTML` if it's not.
|
||||
"""
|
||||
for bad_ext in ARCHIVE_EXTENSIONS:
|
||||
if filename.endswith(bad_ext):
|
||||
content_type = _get_content_type(url, session=session)
|
||||
if not content_type.lower().startswith('text/html'):
|
||||
raise _NotHTML(content_type)
|
||||
|
||||
logger.debug('Getting page %s', url)
|
||||
|
||||
# Tack index.html onto file:// URLs that point to directories
|
||||
|
@ -148,9 +155,7 @@ def _get_html_response(url, filename, session):
|
|||
# requirement of an url. Unless we issue a HEAD request on every
|
||||
# url we cannot know ahead of time for sure if something is HTML
|
||||
# or not. However we can check after we've downloaded it.
|
||||
content_type = resp.headers.get('Content-Type', 'unknown')
|
||||
if not content_type.lower().startswith("text/html"):
|
||||
raise _NotHTML(content_type)
|
||||
_ensure_html_header(resp.headers)
|
||||
|
||||
return resp
|
||||
|
||||
|
@ -176,7 +181,11 @@ def _get_html_page(link, session=None):
|
|||
return None
|
||||
|
||||
try:
|
||||
resp = _get_html_response(url, link.filename, session=session)
|
||||
# If the URL looks suspiciously like an archive, send a HEAD first to
|
||||
# check the Content-Type is HTML, to avoid downloading a large file.
|
||||
if _is_filename_like_archive(link.filename):
|
||||
_ensure_html_response(url, session=session)
|
||||
resp = _get_html_response(url, session=session)
|
||||
except _NotHTML as exc:
|
||||
logger.debug(
|
||||
'Skipping page %s because of Content-Type: %s',
|
||||
|
|
Loading…
Reference in New Issue