Do a check prior to returning an HTMLPage that it is indeed HTML

Previously, if the URL looked like it pointed at an archive, pip would
check it with a HEAD request and skip the file. However, this didn't
work if the URL didn't look like an archive but instead just
redirected to one.

This commit therefore inspects the headers after the URL has been
fetched and checks whether the Content-Type is text/html.
Donald Stufft 2013-06-07 08:38:30 -04:00
parent 60f8da5c41
commit 75cef55df7
1 changed file with 15 additions and 0 deletions

@@ -544,6 +544,21 @@ class HTMLPage(object):
                     contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
                 if encoding == 'deflate':
                     contents = zlib.decompress(contents)
+
+            # The check for archives above only works if the url ends with
+            #   something that looks like an archive. However that is not a
+            #   requirement. For instance http://sourceforge.net/projects/docutils/files/docutils/0.8.1/docutils-0.8.1.tar.gz/download
+            #   redirects to http://superb-dca3.dl.sourceforge.net/project/docutils/docutils/0.8.1/docutils-0.8.1.tar.gz
+            # Unless we issue a HEAD request on every url we cannot know
+            #   ahead of time for sure if something is HTML or not. However we
+            #   can check after we've downloaded it.
+            if not headers["Content-Type"].lower().startswith("text/html"):
+                logger.debug('Skipping page %s because of Content-Type: %s' %
+                             (link, headers["Content-Type"]))
+                if cache is not None:
+                    cache.set_is_archive(url)
+                return None
+
             inst = cls(u(contents), real_url, headers)
         except (HTTPError, URLError, socket.timeout, socket.error, OSError, WindowsError):
             e = sys.exc_info()[1]
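
For illustration only, a minimal standalone sketch of the same idea outside of pip:
fetch the URL first, then decide from the response headers whether the body is HTML.
The function name get_html_or_none, the use of urllib.request, and the utf-8 decoding
are assumptions made for this sketch, not pip's actual API.

    # Minimal sketch (not pip code): fetch first, then check the Content-Type
    # header of the final response, so redirects to archives are caught too.
    from urllib.request import urlopen

    def get_html_or_none(url):
        resp = urlopen(url)  # follows redirects; headers describe the final target
        content_type = resp.headers.get("Content-Type", "")
        if not content_type.lower().startswith("text/html"):
            # e.g. a ".../download" URL that redirected to a .tar.gz archive
            return None
        return resp.read().decode("utf-8", errors="replace")

Because the check runs after the download, it works even when nothing in the
original URL hints that an archive is on the other end of a redirect.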