Extract content type detection logic

This commit is contained in:
Tzu-ping Chung 2018-10-01 18:53:51 +08:00
parent 44713cafe5
commit bad04d8549
1 changed files with 34 additions and 25 deletions

View File

@ -71,8 +71,28 @@ def _match_vcs_scheme(url):
return None
def _get_content_type(url, session):
"""Get the Content-Type of the given url, using a HEAD request"""
def _is_filename_like_archive(filename):
"""Check whether the filename looks like an archive.
"""
for bad_ext in ARCHIVE_EXTENSIONS:
if filename.endswith(bad_ext):
return True
return False
class _NotHTML(Exception):
pass
def _ensure_html_header(headers):
content_type = headers.get("Content-Type", "")
if not content_type.lower().startswith("text/html"):
raise _NotHTML(content_type)
def _ensure_html_response(url, session):
"""Send a HEAD request to the URL, and ensure the Content-Type is HTML.
"""
scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
if scheme not in {'http', 'https'}:
# FIXME: some warning or something?
@ -82,33 +102,20 @@ def _get_content_type(url, session):
resp = session.head(url, allow_redirects=True)
resp.raise_for_status()
return resp.headers.get("Content-Type", "")
_ensure_html_header(resp.headers)
class _NotHTML(Exception):
pass
def _get_html_response(url, filename, session):
def _get_html_response(url, session):
"""Access an HTML page with GET, and return the response.
This consists of five parts:
This consists of three parts:
1. Check whether the URL looks like an archive. If it does, make a HEAD
request to check the Content-Type header is indeed HTML, and raise
`_NotHTML` if it's not. Raise HTTP exceptions on network failures.
2. If URL scheme is file: and points to a directory, make it point to
1. If URL scheme is file: and points to a directory, make it point to
index.html instead.
3. Actually perform the request. Raise HTTP exceptions on network failures.
4. Check whether Content-Type header to make sure the thing we got is HTML,
2. Actually perform the request. Raise HTTP exceptions on network failures.
3. Check whether Content-Type header to make sure the thing we got is HTML,
and raise `_NotHTML` if it's not.
"""
for bad_ext in ARCHIVE_EXTENSIONS:
if filename.endswith(bad_ext):
content_type = _get_content_type(url, session=session)
if not content_type.lower().startswith('text/html'):
raise _NotHTML(content_type)
logger.debug('Getting page %s', url)
# Tack index.html onto file:// URLs that point to directories
@ -148,9 +155,7 @@ def _get_html_response(url, filename, session):
# requirement of an url. Unless we issue a HEAD request on every
# url we cannot know ahead of time for sure if something is HTML
# or not. However we can check after we've downloaded it.
content_type = resp.headers.get('Content-Type', 'unknown')
if not content_type.lower().startswith("text/html"):
raise _NotHTML(content_type)
_ensure_html_header(resp.headers)
return resp
@ -176,7 +181,11 @@ def _get_html_page(link, session=None):
return None
try:
resp = _get_html_response(url, link.filename, session=session)
# If the URL looks suspiciously like an archive, send a HEAD first to
# check the Content-Type is HTML, to avoid downloading a large file.
if _is_filename_like_archive(link.filename):
_ensure_html_response(url, session=session)
resp = _get_html_response(url, session=session)
except _NotHTML as exc:
logger.debug(
'Skipping page %s because of Content-Type: %s',