# -*- coding: utf-8 -*-
#
# gPodder - A media aggregator and podcast client
# Copyright (c) 2005-2018 The gPodder Team
#
# gPodder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# gPodder is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

#
# Generic feed fetching module for aggregators
# Thomas Perl <thp@gpodder.org>; 2009-06-11
#

import logging
import urllib.parse
from html.parser import HTMLParser
from io import BytesIO

from requests.exceptions import RequestException

from gpodder import util, youtube

logger = logging.getLogger(__name__)


class ExceptionWithData(Exception):
    """Base exception with additional payload"""

    def __init__(self, data):
        Exception.__init__(self)
        self.data = data

    def __str__(self):
        return '%s: %s' % (self.__class__.__name__, str(self.data))


# Temporary errors
class BadRequest(Exception): pass


class InternalServerError(Exception): pass


class WifiLogin(ExceptionWithData): pass


# Fatal errors
class Unsubscribe(Exception): pass


class NotFound(Exception): pass


class InvalidFeed(Exception): pass


class UnknownStatusCode(ExceptionWithData): pass


# Authentication error
class AuthenticationRequired(Exception):
    def __init__(self, msg, url=None):
        super().__init__(msg)
        self.url = url


# Successful status codes
UPDATED_FEED, NEW_LOCATION, NOT_MODIFIED = list(range(3))


class Result:
    def __init__(self, status, feed=None):
        self.status = status
        self.feed = feed


class FeedAutodiscovery(HTMLParser):
    def __init__(self, base):
        HTMLParser.__init__(self)
        self._base = base
        self._resolved_url = None

    def handle_starttag(self, tag, attrs):
        if tag == 'link':
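            # A hedged illustration of the markup this reacts to (hypothetical
            # example, not taken from a real page):
            #   <link rel="alternate" type="application/rss+xml" href="/episodes.xml">
            # The href is resolved against the page URL below.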
            attrs = dict(attrs)

            is_feed = attrs.get('type', '') in Fetcher.FEED_TYPES
            is_youtube = 'youtube.com' in self._base
            is_alternate = attrs.get('rel', '') == 'alternate'
            is_canonical = attrs.get('rel', '') == 'canonical'
            url = attrs.get('href', None)
            url = urllib.parse.urljoin(self._base, url)

            if is_feed and is_alternate and url:
                logger.info('Feed autodiscovery: %s', url)
                self._resolved_url = url
            elif is_youtube and is_canonical and url:
                url = youtube.parse_youtube_url(url)
                logger.info('Feed autodiscovery: %s', url)
                self._resolved_url = url


class FetcherFeedData:
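    # Wraps the response body handed to parse_feed(): .text is the decoded
    # body (str), .content the raw bytes.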
    def __init__(self, text, content):
        self.text = text
        self.content = content


class Fetcher(object):
    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def _resolve_url(self, url):
        """Provide additional ways of resolving a URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None
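    # A minimal sketch of such an override (hypothetical subclass and URL
    # scheme, shown for illustration only):
    #
    #     def _resolve_url(self, url):
    #         if url.startswith('http://example.org/podcast/'):
    #             return url.rstrip('/') + '/feed.xml'
    #         return None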
    @staticmethod
    def _check_statuscode(status, url):
        if status >= 200 and status < 300:
            return UPDATED_FEED
        elif status == 304:
            return NOT_MODIFIED
        # redirects are handled by requests directly
        # => the status should never be 301, 302, 303, 307, 308

        if status == 401:
            raise AuthenticationRequired('authentication required', url)
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status >= 400 and status < 500:
            raise BadRequest('bad request')
        elif status >= 500 and status < 600:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def parse_feed(self, url, feed_data, data_stream, headers, status, **kwargs):
        """
        kwargs are passed from Fetcher.fetch
        :param str url: real url
        :param FetcherFeedData feed_data: decoded text and raw bytes of the
            response (None when fetching a local file)
        :param data_stream: file-like object to read from (bytes mode)
        :param dict-like headers: response headers (may be empty)
        :param int status: always UPDATED_FEED for now
        :return Result: Result(status, model.Feed from parsed data_stream)
        """
        raise NotImplementedError("Implement parse_feed()")
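    # A hedged sketch of a subclass implementation (for illustration only; the
    # class name is hypothetical, podcastparser is used merely as an example
    # parser, and the parsed dict would normally be wrapped in a model.Feed
    # object as described above):
    #
    #     import podcastparser
    #
    #     class MyFetcher(Fetcher):
    #         def parse_feed(self, url, feed_data, data_stream, headers, status, **kwargs):
    #             parsed = podcastparser.parse(url, data_stream)
    #             return Result(status, parsed)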
    def fetch(self, url, etag=None, modified=None, autodiscovery=True, **kwargs):
        """ use kwargs to pass extra data to parse_feed in Fetcher subclasses """
        # handle local file first
        if url.startswith('file://'):
            url = url[len('file://'):]
            stream = open(url)
            return self.parse_feed(url, None, stream, {}, UPDATED_FEED, **kwargs)

        # remote feed
        headers = {}
        if modified is not None:
            headers['If-Modified-Since'] = modified
        if etag is not None:
            headers['If-None-Match'] = etag

        stream = util.urlopen(url, headers)

        responses = stream.history + [stream]
        for i, resp in enumerate(responses):
            if resp.is_permanent_redirect:
                # There should always be a next response when a redirect is encountered;
                # if the maximum number of redirects is reached, requests raises TooManyRedirects.
                # TODO: since we've got the end contents anyway, modify model.py to accept contents on NEW_LOCATION
                return Result(NEW_LOCATION, responses[i + 1].url)

        res = self._check_statuscode(stream.status_code, stream.url)
        if res == NOT_MODIFIED:
            return Result(NOT_MODIFIED, stream.url)

        if autodiscovery and stream.headers.get('content-type', '').startswith('text/html'):
            ad = FeedAutodiscovery(url)
            # response_text() will assume utf-8 if no charset is specified
            ad.feed(util.response_text(stream))
            if ad._resolved_url and ad._resolved_url != url:
                try:
                    self.fetch(ad._resolved_url, etag=None, modified=None, autodiscovery=False, **kwargs)
                    return Result(NEW_LOCATION, ad._resolved_url)
                except Exception:
                    logger.warning('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            new_url = self._resolve_url(url)
            if new_url and new_url != url:
                return Result(NEW_LOCATION, new_url)

        # XML documents specify the encoding inline, so it is better to pass the
        # encoded body. Especially since requests will use ISO-8859-1 for
        # content-type 'text/xml' if the server doesn't specify a charset.
        return self.parse_feed(url, FetcherFeedData(stream.text, stream.content), BytesIO(stream.content), stream.headers,
                               UPDATED_FEED, **kwargs)
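# A hedged usage sketch (MyFetcher is the hypothetical subclass sketched next
# to parse_feed() above; util.urlopen performs the actual HTTP request):
#
#     fetcher = MyFetcher()
#     result = fetcher.fetch('http://example.com/feed.xml',
#                            etag=old_etag, modified=old_modified)
#     if result.status == UPDATED_FEED:
#         ...  # result.feed holds the parsed feed
#     elif result.status == NEW_LOCATION:
#         ...  # result.feed holds the new URL to subscribe to
#     elif result.status == NOT_MODIFIED:
#         ...  # nothing changed since the last fetch (etag/modified matched)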