Fix #1100 - Use html5lib to decode the HTML response

This commit is contained in:
Donald Stufft 2014-06-14 14:02:26 -04:00
parent 36fc45eee0
commit 9f10a05982
3 changed files with 29 additions and 6 deletions

View File

@@ -110,6 +110,9 @@
* Automatically retry failed HTTP requests by default. (:pull:`1444`, :pull:`2147`)
* Fixed :issue:`1100` - Handle HTML Encoding better using a method that is more
similar to how browsers handle it. (:pull:`1874`)
**1.5.6 (2014-05-16)**

View File

@@ -2,6 +2,7 @@
from __future__ import absolute_import
import logging
import cgi
import sys
import os
import re
@@ -735,16 +736,28 @@ class HTMLPage(object):
"""Represents one page, along with its URL"""
# FIXME: these regexes are horrible hacks:
_homepage_re = re.compile(r'<th>\s*home\s*page', re.I)
_download_re = re.compile(r'<th>\s*download\s+url', re.I)
_homepage_re = re.compile(b'<th>\\s*home\\s*page', re.I)
_download_re = re.compile(b'<th>\\s*download\\s+url', re.I)
_href_re = re.compile(
'href=(?:"([^"]*)"|\'([^\']*)\'|([^>\\s\\n]*))',
b'href=(?:"([^"]*)"|\'([^\']*)\'|([^>\\s\\n]*))',
re.I | re.S
)
def __init__(self, content, url, headers=None, trusted=None):
# Determine if we have any encoding information in our headers
encoding = None
if headers and "Content-Type" in headers:
content_type, params = cgi.parse_header(headers["Content-Type"])
if "charset" in params:
encoding = params['charset']
self.content = content
self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
self.parsed = html5lib.parse(
self.content,
encoding=encoding,
namespaceHTMLElements=False,
)
self.url = url
self.headers = headers
self.trusted = trusted
@@ -824,7 +837,10 @@ class HTMLPage(object):
)
return
inst = cls(resp.text, resp.url, resp.headers, trusted=link.trusted)
inst = cls(
resp.content, resp.url, resp.headers,
trusted=link.trusted,
)
except requests.HTTPError as exc:
level = 2 if exc.response.status_code == 404 else 1
cls._handle_fail(req, link, exc, url, level=level)
@@ -955,6 +971,10 @@ class HTMLPage(object):
)
if not url:
continue
try:
url = url.decode("ascii")
except UnicodeDecodeError:
continue
url = self.clean_link(urllib_parse.urljoin(self.base_url, url))
yield Link(url, self, trusted=False, _deprecated_regex=True)

View File

@@ -10,7 +10,7 @@ def test_html_page_should_be_able_to_scrap_rel_links():
Test scraping page looking for url in href
"""
page = HTMLPage(
"""
b"""
<!-- The <th> elements below are a terrible terrible hack for setuptools -->
<li>
<strong>Home Page:</strong>