Merge pull request #261 from romlok/schmontenttype

Try to parse feeds even if content-type says HTML
This commit is contained in:
Eric Le Lay 2017-11-27 21:16:38 +01:00 committed by GitHub
commit fd99473684
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -36,9 +36,11 @@ import urlparse
try:
# Python 2
from rfc822 import mktime_tz
from StringIO import StringIO
except ImportError:
# Python 3
from email.utils import mktime_tz
from io import StringIO
class ExceptionWithData(Exception):
@ -171,25 +173,32 @@ class Fetcher(object):
except HTTPError as e:
return self._check_statuscode(e, e.geturl())
if not is_local and stream.headers.get('content-type', '').startswith('text/html'):
if autodiscovery:
ad = FeedAutodiscovery(url)
ad.feed(stream.read())
if ad._resolved_url:
try:
self._parse_feed(ad._resolved_url, None, None, False)
return Result(NEW_LOCATION, ad._resolved_url)
except Exception as e:
logger.warn('Feed autodiscovery failed', exc_info=True)
data = stream
if autodiscovery and not is_local and stream.headers.get('content-type', '').startswith('text/html'):
# We use StringIO in case the stream needs to be read again
data = StringIO(stream.read())
ad = FeedAutodiscovery(url)
ad.feed(data.read())
if ad._resolved_url:
try:
self._parse_feed(ad._resolved_url, None, None, False)
return Result(NEW_LOCATION, ad._resolved_url)
except Exception as e:
logger.warn('Feed autodiscovery failed', exc_info=True)
# Second, try to resolve the URL
url = self._resolve_url(url)
if url:
return Result(NEW_LOCATION, url)
# Second, try to resolve the URL
url = self._resolve_url(url)
if url:
return Result(NEW_LOCATION, url)
# Reset the stream so podcastparser can give it a go
data.seek(0)
raise InvalidFeed('Got HTML document instead')
feed = podcastparser.parse(url, stream)
try:
feed = podcastparser.parse(url, data)
except ValueError as e:
raise InvalidFeed(u'Could not parse feed: {msg}'.format(msg=e))
if is_local:
feed['headers'] = {}
return Result(UPDATED_FEED, feed)