2009-06-12 00:51:13 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
#
|
|
|
|
# gPodder - A media aggregator and podcast client
|
2018-01-28 19:39:53 +01:00
|
|
|
# Copyright (c) 2005-2018 The gPodder Team
|
2009-06-12 00:51:13 +02:00
|
|
|
#
|
|
|
|
# gPodder is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# gPodder is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
|
|
|
|
#
|
|
|
|
# Generic feed fetching module for aggregators
|
2010-11-22 19:40:29 +01:00
|
|
|
# Thomas Perl <thp@gpodder.org>; 2009-06-11
|
2009-06-12 00:51:13 +02:00
|
|
|
#
|
|
|
|
|
2018-07-24 11:08:10 +02:00
|
|
|
import logging
|
|
|
|
import urllib.parse
|
|
|
|
from html.parser import HTMLParser
|
2016-02-06 17:46:07 +01:00
|
|
|
|
2018-07-24 11:08:10 +02:00
|
|
|
import podcastparser
|
2020-07-11 17:42:26 +02:00
|
|
|
from requests.exceptions import RequestException
|
2020-02-05 08:40:37 +01:00
|
|
|
|
2016-02-06 17:46:07 +01:00
|
|
|
from gpodder import util
|
2009-06-12 00:51:13 +02:00
|
|
|
|
2012-08-15 10:56:04 +02:00
|
|
|
# Module-wide logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)
|
|
|
|
|
2016-02-06 17:46:07 +01:00
|
|
|
|
2012-02-12 11:38:26 +01:00
|
|
|
# This module is Python 3 only (it already relies on ``urllib.parse``,
# ``html.parser`` and argument-less ``super()``), so the historical
# Python 2 fallbacks (``rfc822.mktime_tz``, the ``StringIO`` module)
# were dead code and have been removed.
from email.utils import mktime_tz
from io import StringIO
|
2012-02-12 11:38:26 +01:00
|
|
|
|
2012-08-15 10:56:04 +02:00
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
class ExceptionWithData(Exception):
    """Base exception that carries an additional payload in ``data``."""

    def __init__(self, data):
        super().__init__()
        # Arbitrary payload describing the error (e.g. an HTTP status code)
        self.data = data

    def __str__(self):
        return '{}: {}'.format(self.__class__.__name__, str(self.data))
|
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
# Temporary errors
|
|
|
|
class BadRequest(Exception):
    """Temporary error for HTTP 4xx responses not handled specifically."""
|
2018-02-11 00:22:00 +01:00
|
|
|
|
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
class InternalServerError(Exception):
    """Temporary error for HTTP 5xx server-side responses."""
|
2018-02-11 00:22:00 +01:00
|
|
|
|
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
class WifiLogin(ExceptionWithData):
    """Temporary error with payload; presumably raised (outside this
    module) when a wifi login / captive portal intercepts the request."""
|
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
# Fatal errors
|
|
|
|
class Unsubscribe(Exception):
    """Fatal error (HTTP 403/410): the feed should be unsubscribed."""
|
2018-02-11 00:22:00 +01:00
|
|
|
|
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
class NotFound(Exception):
    """Fatal error: the feed URL returned HTTP 404."""
|
2018-02-11 00:22:00 +01:00
|
|
|
|
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
class InvalidFeed(Exception):
    """Fatal error: the downloaded content could not be parsed as a feed."""
|
2018-02-11 00:22:00 +01:00
|
|
|
|
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
class UnknownStatusCode(ExceptionWithData):
    """Raised for an HTTP status outside the handled ranges; the numeric
    status code is stored as the ``data`` payload."""
|
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
# Authentication error
class AuthenticationRequired(Exception):
    """Raised on HTTP 401; ``url`` is the address needing credentials."""

    def __init__(self, msg, url=None):
        super().__init__(msg)
        # URL that triggered the 401, if known (None otherwise)
        self.url = url
|
2009-06-12 00:51:13 +02:00
|
|
|
|
2018-02-10 11:11:20 +01:00
|
|
|
|
2012-07-10 11:57:59 +02:00
|
|
|
# Successful status codes returned by Fetcher (0, 1, 2)
UPDATED_FEED, NEW_LOCATION, NOT_MODIFIED = range(3)
|
2009-06-12 00:51:13 +02:00
|
|
|
|
2018-02-10 11:11:20 +01:00
|
|
|
|
2012-07-10 11:57:59 +02:00
|
|
|
class Result:
    """Outcome of a fetch: a status constant plus an optional payload.

    ``feed`` holds the parsed feed dict for UPDATED_FEED, or the
    new/current URL for NEW_LOCATION and NOT_MODIFIED.
    """

    def __init__(self, status, feed=None):
        self.status = status
        self.feed = feed
|
2009-06-12 00:51:13 +02:00
|
|
|
|
|
|
|
|
2016-02-06 17:46:07 +01:00
|
|
|
class FeedAutodiscovery(HTMLParser):
    """HTML parser that looks for ``<link rel="alternate">`` feed links.

    After feeding HTML to this parser, ``_resolved_url`` holds the
    absolute URL of the last discovered feed link, or None.
    """

    def __init__(self, base):
        HTMLParser.__init__(self)
        # Base URL used to make relative hrefs absolute
        self._base = base
        # Discovered feed URL (None until a matching <link> is seen)
        self._resolved_url = None

    def handle_starttag(self, tag, attrs):
        if tag != 'link':
            return

        attributes = dict(attrs)
        link_type = attributes.get('type', '')
        rel = attributes.get('rel', '')
        # urljoin tolerates a missing href (returns the base URL)
        href = urllib.parse.urljoin(self._base, attributes.get('href', None))

        if link_type in Fetcher.FEED_TYPES and rel == 'alternate' and href:
            logger.info('Feed autodiscovery: %s', href)
            self._resolved_url = href
|
|
|
|
|
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
class Fetcher(object):
    """Fetch and parse podcast feeds.

    Handles local ``file://`` URLs, conditional HTTP requests
    (ETag / If-Modified-Since), permanent redirects and HTML feed
    autodiscovery, returning a :class:`Result` or raising one of the
    module's exception types.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None

    @staticmethod
    def _check_statuscode(status, url):
        """Map an HTTP status code to a success constant or raise.

        Returns UPDATED_FEED (2xx) or NOT_MODIFIED (304); raises one of
        the module's exceptions for every other status.
        """
        if 200 <= status < 300:
            return UPDATED_FEED
        elif status == 304:
            return NOT_MODIFIED

        # redirects are handled by requests directly
        # => the status should never be 301, 302, 303, 307, 308

        if status == 401:
            raise AuthenticationRequired('authentication required', url)
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif 400 <= status < 500:
            raise BadRequest('bad request')
        elif 500 <= status < 600:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    @staticmethod
    def _podcastparse_feed(url, data_stream):
        """Parse a feed from a file-like object.

        Returns the parsed feed dict (with 'url' set); raises
        InvalidFeed if podcastparser rejects the content.
        """
        try:
            feed = podcastparser.parse(url, data_stream)
            feed['url'] = url
            return feed
        except ValueError as e:
            raise InvalidFeed('Could not parse feed: {msg}'.format(msg=e))

    def _parse_feed(self, url, etag, modified, autodiscovery=True, max_episodes=0):
        """Fetch *url* and return a Result.

        etag/modified enable conditional requests; autodiscovery controls
        whether HTML responses are scanned for feed links.
        NOTE(review): max_episodes is currently unused here — presumably
        intended for podcastparser's episode limit; confirm upstream.
        """
        # handle local file first
        if url.startswith('file://'):
            url = url[len('file://'):]
            # Use a context manager so the file handle is not leaked
            # (the previous code never closed it).
            with open(url) as stream:
                feed = self._podcastparse_feed(url, stream)
            feed['headers'] = {}
            return Result(UPDATED_FEED, feed)

        # remote feed: build conditional-request headers when available
        headers = {}
        if modified is not None:
            headers['If-Modified-Since'] = modified
        if etag is not None:
            headers['If-None-Match'] = etag

        stream = util.urlopen(url, headers)

        responses = stream.history + [stream]
        for i, resp in enumerate(responses):
            if resp.is_permanent_redirect:
                # there should always be a next response when a redirect is encountered
                # If max redirects is reached, TooManyRedirects is raised
                # TODO: since we've got the end contents anyway, modify model.py to accept contents on NEW_LOCATION
                return Result(NEW_LOCATION, responses[i + 1].url)

        res = self._check_statuscode(stream.status_code, stream.url)
        if res == NOT_MODIFIED:
            return Result(res, stream.url)

        if autodiscovery and stream.headers.get('content-type', '').startswith('text/html'):
            # First, try HTML feed autodiscovery on the returned page
            ad = FeedAutodiscovery(url)
            ad.feed(stream.text)  # uses headers, then apparent encoding
            if ad._resolved_url:
                try:
                    # Validate the discovered URL before redirecting to it
                    self._parse_feed(ad._resolved_url, None, None, False)
                    return Result(NEW_LOCATION, ad._resolved_url)
                except Exception:
                    # logger.warn is deprecated since Python 3.3
                    logger.warning('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            url = self._resolve_url(url)
            if url:
                return Result(NEW_LOCATION, url)

        feed = self._podcastparse_feed(url, StringIO(stream.text))
        feed['headers'] = stream.headers
        return Result(UPDATED_FEED, feed)

    def fetch(self, url, etag=None, modified=None, max_episodes=0):
        """Fetch *url*, using etag/modified for conditional requests.

        BUG FIX: max_episodes was previously passed positionally and
        landed in _parse_feed's *autodiscovery* parameter; it is now
        passed by keyword so autodiscovery keeps its default (True).
        """
        return self._parse_feed(url, etag, modified, max_episodes=max_episodes)
|