2009-06-12 00:51:13 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
#
|
|
|
|
# gPodder - A media aggregator and podcast client
|
2012-01-09 21:19:24 +01:00
|
|
|
# Copyright (c) 2005-2012 Thomas Perl and the gPodder Team
|
2009-06-12 00:51:13 +02:00
|
|
|
#
|
|
|
|
# gPodder is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# gPodder is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
|
|
|
|
#
|
|
|
|
# Generic feed fetching module for aggregators
|
2010-11-22 19:40:29 +01:00
|
|
|
# Thomas Perl <thp@gpodder.org>; 2009-06-11
|
2009-06-12 00:51:13 +02:00
|
|
|
#
|
|
|
|
|
|
|
|
import feedparser
|
|
|
|
|
2012-02-12 11:38:26 +01:00
|
|
|
try:
|
|
|
|
# Python 2
|
|
|
|
from rfc822 import mktime_tz
|
|
|
|
except ImportError:
|
|
|
|
# Python 3
|
|
|
|
from email.utils import mktime_tz
|
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
def patch_feedparser():
    """Monkey-patch the Universal Feed Parser

    Installs work-arounds for several known feedparser issues. Each
    patch is applied defensively so that a changed feedparser internal
    API does not break importing this module.
    """
    # Detect the 'plain' content type as 'text/plain'
    # http://code.google.com/p/feedparser/issues/detail?id=80
    def mapContentType2(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        # Only install the patch if the stock parser is still broken
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # Narrowed from a bare 'except:' so that SystemExit and
        # KeyboardInterrupt are not silently swallowed
        pass

    # Fix parsing of Media RSS with feedparser, as described here:
    # http://code.google.com/p/feedparser/issues/detail?id=100#c4
    def _start_media_content(self, attrsD):
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    try:
        feedparser._FeedParserMixin._start_media_content = _start_media_content
    except Exception:
        pass

    # Fix problem with the EA.com official podcast
    # https://bugs.gpodder.org/show_bug.cgi?id=588
    if '*/*' not in feedparser.ACCEPT_HEADER.split(','):
        feedparser.ACCEPT_HEADER += ',*/*'

    # Fix problem with YouTube feeds and pubDate/atom:modified
    # https://bugs.gpodder.org/show_bug.cgi?id=1492
    # http://code.google.com/p/feedparser/issues/detail?id=310
    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = feedparser._parse_date(value)
        # YouTube feeds misuse atom:updated; keep the original pubDate there
        overwrite = ('youtube.com' not in self.baseuri)
        self._save('updated_parsed', parsed_value, overwrite=overwrite)

    try:
        feedparser._FeedParserMixin._end_updated = _end_updated
    except Exception:
        pass
|
|
|
|
|
|
|
|
|
2009-06-12 00:51:13 +02:00
|
|
|
# Apply the feedparser patches at import time, so every later call to
# feedparser.parse() in this process benefits from the fixes
patch_feedparser()
|
|
|
|
|
|
|
|
|
|
|
|
class ExceptionWithData(Exception):
    """Base exception that carries an additional payload object"""
    def __init__(self, data):
        super(ExceptionWithData, self).__init__()
        # Arbitrary payload describing the result (e.g. a parsed feed)
        self.data = data

    def __str__(self):
        return self.__class__.__name__ + ': ' + str(self.data)
|
|
|
|
|
|
|
|
|
|
|
|
# Temporary errors
class Offline(Exception):
    """The fetch result carries no HTTP headers - likely no connection"""


class BadRequest(Exception):
    """The server rejected the request (HTTP 400-range status)"""


class InternalServerError(Exception):
    """The server failed to process the request (HTTP 500-range status)"""


class WifiLogin(ExceptionWithData):
    """A captive-portal login page was returned; data is its URL"""
|
|
|
|
|
|
|
|
# Fatal errors
class Unsubscribe(Exception):
    """The feed is gone or forbidden - the subscription should be dropped"""


class NotFound(Exception):
    """The feed URL returned HTTP 404"""


class InvalidFeed(Exception):
    """The fetched document could not be recognized as a feed"""


class UnknownStatusCode(ExceptionWithData):
    """An unexpected HTTP status was received; data is the status code"""
|
|
|
|
|
|
|
|
# Authentication error
class AuthenticationRequired(Exception):
    """The server responded with HTTP 401 - credentials are needed"""
|
|
|
|
|
|
|
|
# Successful parsing of the feed
class UpdatedFeed(ExceptionWithData):
    """The feed was fetched and parsed; data is the feedparser result"""


class NewLocation(ExceptionWithData):
    """The feed has moved; data is the result pointing at the new URL"""


class NotModified(ExceptionWithData):
    """The feed is unchanged since the last fetch (HTTP 304)"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Fetcher(object):
    """Download and parse a podcast feed

    The outcome of a fetch is always communicated by raising one of the
    exception classes defined in this module (UpdatedFeed, NotModified,
    NewLocation, AuthenticationRequired, ...).
    """
    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def __init__(self, user_agent):
        # HTTP User-Agent string handed to feedparser for requests
        self.user_agent = user_agent

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None

    def _autodiscover_feed(self, feed):
        """Try to discover the real feed URL behind a non-feed document

        Raises NewLocation when an alternate <link> element or
        _resolve_url() yields a working feed; every other failure is
        ignored, because autodiscovery is strictly best-effort.
        """
        try:
            # First, try all <link> elements if available
            for link in feed.feed.get('links', ()):
                is_feed = link.get('type', '') in self.FEED_TYPES
                is_alternate = link.get('rel', '') == 'alternate'
                url = link.get('href', None)

                if url and is_feed and is_alternate:
                    try:
                        self._parse_feed(url, None, None, False)
                    except UpdatedFeed:
                        # Found a working feed - handled by the outer except
                        raise
                    except Exception:
                        pass

            # Second, try to resolve the URL
            url = self._resolve_url(feed.href)
            if url:
                self._parse_feed(url, None, None, False)
        except UpdatedFeed as updated:
            # Python 2/3-compatible "as" syntax (was "except UpdatedFeed, updated")
            raise NewLocation(updated.data)
        except Exception:
            pass

    def _check_offline(self, feed):
        """Raise Offline if the result carries no HTTP headers"""
        if not hasattr(feed, 'headers'):
            raise Offline()

    def _check_wifi_login_page(self, feed):
        """Raise WifiLogin if this looks like a captive-portal redirect

        Heuristic: an HTML page behind a 302 redirect that did not
        parse as a feed is assumed to be a Wi-Fi login page.
        """
        html_page = 'text/html' in feed.headers.get('content-type', '')
        if not feed.version and feed.status == 302 and html_page:
            raise WifiLogin(feed.href)

    def _check_valid_feed(self, feed):
        """Raise InvalidFeed if the result is not a usable feed"""
        if feed is None:
            raise InvalidFeed('feed is None')

        if not hasattr(feed, 'status'):
            raise InvalidFeed('feed has no status code')

        # 304 (not modified) and 401 (auth required) responses
        # legitimately arrive without a parsed feed version
        if not feed.version and feed.status != 304 and feed.status != 401:
            raise InvalidFeed('unknown feed type')

    def _normalize_status(self, status):
        """Reduce any HTTP status code to a well-known representative"""
        # Based on Mark Pilgrim's "Atom aggregator behaviour" article
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif status >= 200 and status < 300:
            return 200
        elif status >= 300 and status < 400:
            return 302
        elif status >= 400 and status < 500:
            return 400
        elif status >= 500 and status < 600:
            return 500
        else:
            return status

    def _check_rss_redirect(self, feed):
        """Raise NewLocation if the feed uses the <newlocation> element"""
        new_location = feed.feed.get('newlocation', None)
        if new_location:
            # Reuse the value we already looked up instead of a second
            # attribute access on feed.feed
            feed.href = new_location
            raise NewLocation(feed)

    def _check_statuscode(self, feed):
        """Translate the (normalized) HTTP status into a result exception"""
        status = self._normalize_status(feed.status)
        if status == 200:
            raise UpdatedFeed(feed)
        elif status == 301:
            raise NewLocation(feed)
        elif status == 302:
            raise UpdatedFeed(feed)
        elif status == 304:
            raise NotModified(feed)
        elif status == 400:
            raise BadRequest('bad request')
        elif status == 401:
            raise AuthenticationRequired('authentication required')
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        """Parse the feed and raise the result."""
        # Local file:// URLs bypass all HTTP-level checks
        if url.startswith('file://'):
            is_local = True
            url = url[len('file://'):]
        else:
            is_local = False

        feed = feedparser.parse(url,
                agent=self.user_agent,
                modified=modified,
                etag=etag)

        if is_local:
            if feed.version:
                # Local files have no HTTP headers
                feed.headers = {}
                raise UpdatedFeed(feed)
            else:
                raise InvalidFeed('Not a valid feed file')
        else:
            self._check_offline(feed)
            self._check_wifi_login_page(feed)

            # Only autodiscover when the document did not parse as a feed
            if feed.status != 304 and not feed.version and autodiscovery:
                self._autodiscover_feed(feed)

            self._check_valid_feed(feed)
            self._check_rss_redirect(feed)
            self._check_statuscode(feed)

    def fetch(self, url, etag=None, modified=None):
        """Download a feed, with optional etag and modified values

        This method will always raise an exception that tells
        the calling code the result of the fetch operation. See
        the code for the feedcore module for all the possible
        exception types.
        """
        self._parse_feed(url, etag, modified)
|
|
|
|
|
2012-02-12 11:38:26 +01:00
|
|
|
|
|
|
|
def get_pubdate(entry):
    """Try to determine the real pubDate of a feedparser entry

    This basically takes the updated_parsed value, but also uses some more
    advanced techniques to work around various issues with ugly feeds.

    "published" now also takes precedence over "updated" (with updated used as
    a fallback if published is not set/available). RSS' "pubDate" element is
    "updated", and will only be used if published_parsed is not available.
    """
    # Prefer the pre-parsed "published" timestamp, fall back to "updated"
    parsed = entry.get('published_parsed', None)
    if parsed is None:
        parsed = entry.get('updated_parsed', None)

    if parsed is None:
        # Neither field was parseable - retry on the raw string value
        # See http://code.google.com/p/feedparser/issues/detail?id=327
        raw = entry.get('published', entry.get('updated', None))
        if raw is not None:
            # FIXME: This is kludgy. We should write our own date handler
            # and register it with feedparser.registerDateHandler() and/or
            # wait for feedparser to add support for this bogus date format.
            parsed = feedparser._parse_date(raw.replace(',', ''))

    if parsed is None:
        # Cannot determine pubdate - party like it's 1970!
        return 0

    # Append a zero UTC offset so mktime_tz sees a full 10-tuple
    return mktime_tz(parsed + (0,))
|
|
|
|
|