# -*- coding: utf-8 -*-
#
# gPodder - A media aggregator and podcast client
# Copyright (c) 2005-2012 Thomas Perl and the gPodder Team
#
# gPodder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# gPodder is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
# Generic feed fetching module for aggregators
# Thomas Perl <thp@gpodder.org>; 2009-06-11
#
import feedparser
try:
# Python 2
from rfc822 import mktime_tz
except ImportError:
# Python 3
from email.utils import mktime_tz
def patch_feedparser():
    """Monkey-patch the Universal Feed Parser

    Applies several workarounds for known feedparser issues by
    replacing methods on feedparser._FeedParserMixin and extending
    the Accept header. Each patch is applied best-effort: if the
    feedparser version in use does not expose the expected internals,
    the patch is silently skipped (hence the bare except clauses).
    """
    # Detect the 'plain' content type as 'text/plain'
    # http://code.google.com/p/feedparser/issues/detail?id=80
    def mapContentType2(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        # Only patch if the installed feedparser still has the bug
        # (i.e. it maps 'plain' to itself instead of 'text/plain')
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except:
        pass

    # Fix parsing of Media RSS with feedparser, as described here:
    # http://code.google.com/p/feedparser/issues/detail?id=100#c4
    def _start_media_content(self, attrsD):
        # Collect every <media:content> element's attributes in a list
        # on the current context instead of overwriting previous ones
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    try:
        feedparser._FeedParserMixin._start_media_content = _start_media_content
    except:
        pass

    # Fix problem with the EA.com official podcast
    # https://bugs.gpodder.org/show_bug.cgi?id=588
    if '*/*' not in feedparser.ACCEPT_HEADER.split(','):
        feedparser.ACCEPT_HEADER += ',*/*'

    # Fix problem with YouTube feeds and pubDate/atom:modified
    # https://bugs.gpodder.org/show_bug.cgi?id=1492
    # http://code.google.com/p/feedparser/issues/detail?id=310
    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = feedparser._parse_date(value)
        # For YouTube feeds, do not let atom:updated overwrite an
        # already-saved date (e.g. from pubDate)
        overwrite = ('youtube.com' not in self.baseuri)
        self._save('updated_parsed', parsed_value, overwrite=overwrite)

    try:
        feedparser._FeedParserMixin._end_updated = _end_updated
    except:
        pass


# Apply the patches once at import time
patch_feedparser()
class ExceptionWithData(Exception):
    """Base exception that carries an additional payload

    The payload is stored in the ``data`` attribute and included
    in the string representation of the exception.
    """

    def __init__(self, data):
        Exception.__init__(self)
        self.data = data

    def __str__(self):
        return '{0}: {1}'.format(self.__class__.__name__, str(self.data))


# Temporary errors (a later fetch attempt might succeed)

class Offline(Exception):
    """The response never reached a server (no headers present)"""


class BadRequest(Exception):
    """The server answered with HTTP 400 (bad request)"""


class InternalServerError(Exception):
    """The server answered with HTTP 500 (internal server error)"""


class WifiLogin(ExceptionWithData):
    """Redirected to an HTML page (captive portal); data is its URL"""


# Fatal errors (the feed should not be retried as-is)

class Unsubscribe(Exception):
    """The server answered with HTTP 403 (forbidden) or 410 (gone)"""


class NotFound(Exception):
    """The server answered with HTTP 404 (not found)"""


class InvalidFeed(Exception):
    """The fetched document could not be used as a feed"""


class UnknownStatusCode(ExceptionWithData):
    """Unhandled HTTP status code; data is the status code"""


# Authentication error

class AuthenticationRequired(Exception):
    """The server answered with HTTP 401 (authentication required)"""


# Successful status codes (values must stay stable for callers)
UPDATED_FEED, NEW_LOCATION, NOT_MODIFIED, CUSTOM_FEED = 0, 1, 2, 3
class Result:
    """Outcome of a successful feed fetch

    ``status`` is one of the module-level status constants
    (UPDATED_FEED, NEW_LOCATION, NOT_MODIFIED, CUSTOM_FEED);
    ``feed`` is the parsed feed object, if any.
    """

    def __init__(self, status, feed=None):
        self.status = status
        self.feed = feed
class Fetcher(object):
    """Fetch and parse a podcast feed via feedparser

    Maps the HTTP status of a fetch either to a Result object
    (success) or to one of the exception classes defined in this
    module (temporary, fatal or authentication errors).
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def __init__(self, user_agent):
        # User-Agent string passed to feedparser for HTTP requests
        self.user_agent = user_agent

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None

    def _autodiscover_feed(self, feed):
        """Try to discover a real feed URL for a non-feed document

        Returns a Result for the first <link> candidate that parses
        as a feed, a NEW_LOCATION Result based on _resolve_url(),
        or None if nothing could be discovered.
        """
        # First, try all <link> elements if available
        for link in feed.feed.get('links', ()):
            is_feed = link.get('type', '') in self.FEED_TYPES
            is_alternate = link.get('rel', '') == 'alternate'
            url = link.get('href', None)

            if url and is_feed and is_alternate:
                try:
                    return self._parse_feed(url, None, None, False)
                except Exception:
                    # Bug fix: "except Exception, e" was Python-2-only
                    # syntax (and "e" was unused). Best-effort: a broken
                    # candidate just means we try the next <link>.
                    pass

        # Second, try to resolve the URL
        url = self._resolve_url(feed.href)
        if url:
            result = self._parse_feed(url, None, None, False)
            result.status = NEW_LOCATION
            return result

        return None

    def _check_offline(self, feed):
        # feedparser leaves out "headers" when the request never
        # reached a server, which we interpret as being offline
        if not hasattr(feed, 'headers'):
            raise Offline()

    def _check_wifi_login_page(self, feed):
        # A 302 redirect to an HTML page instead of a feed usually
        # means a captive portal (e.g. hotel/airport wifi login)
        html_page = 'text/html' in feed.headers.get('content-type', '')
        if not feed.version and feed.status == 302 and html_page:
            raise WifiLogin(feed.href)

    def _check_valid_feed(self, feed):
        """Raise InvalidFeed if the parse result is unusable"""
        if feed is None:
            raise InvalidFeed('feed is None')

        if not hasattr(feed, 'status'):
            raise InvalidFeed('feed has no status code')

        # 304 (not modified) and 401 (auth required) responses
        # legitimately carry no parseable feed content
        if not feed.version and feed.status != 304 and feed.status != 401:
            raise InvalidFeed('unknown feed type')

    def _normalize_status(self, status):
        """Collapse uncommon HTTP status codes onto well-known ones

        Based on Mark Pilgrim's "Atom aggregator behaviour" article.
        Explicitly-handled codes are returned unchanged; any other
        code in 200-599 maps to the canonical code of its class, and
        anything else is returned as-is.
        """
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif status >= 200 and status < 300:
            return 200
        elif status >= 300 and status < 400:
            return 302
        elif status >= 400 and status < 500:
            return 400
        elif status >= 500 and status < 600:
            return 500
        else:
            return status

    def _check_rss_redirect(self, feed):
        # RSS-level redirect via a <newLocation> element in the feed
        new_location = feed.feed.get('newlocation', None)
        if new_location:
            feed.href = feed.feed.newlocation
            return Result(NEW_LOCATION, feed)

        return None

    def _check_statuscode(self, feed):
        """Convert the HTTP status into a Result or raise an error"""
        status = self._normalize_status(feed.status)
        if status == 200:
            return Result(UPDATED_FEED, feed)
        elif status == 301:
            return Result(NEW_LOCATION, feed)
        elif status == 302:
            # Temporary redirect: keep using the original URL
            return Result(UPDATED_FEED, feed)
        elif status == 304:
            return Result(NOT_MODIFIED, feed)
        elif status == 400:
            # (was a stray "if" in an elif chain; equivalent, but
            # now consistent with the other branches)
            raise BadRequest('bad request')
        elif status == 401:
            raise AuthenticationRequired('authentication required')
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        """Fetch and validate a feed; returns a Result or raises"""
        if url.startswith('file://'):
            is_local = True
            url = url[len('file://'):]
        else:
            is_local = False

        feed = feedparser.parse(url,
                agent=self.user_agent,
                modified=modified,
                etag=etag)

        if is_local:
            if feed.version:
                # Local files have no HTTP headers
                feed.headers = {}
                return Result(UPDATED_FEED, feed)
            else:
                raise InvalidFeed('Not a valid feed file')
        else:
            self._check_offline(feed)
            self._check_wifi_login_page(feed)

            if feed.status != 304 and not feed.version and autodiscovery:
                # Bug fix: the Result returned by _autodiscover_feed()
                # was previously discarded, so a successfully discovered
                # feed was thrown away and _check_valid_feed() raised
                # InvalidFeed anyway. Return the discovered feed instead;
                # if discovery fails (None), fall through to validation.
                result = self._autodiscover_feed(feed)
                if result is not None:
                    return result

            self._check_valid_feed(feed)

            redirect = self._check_rss_redirect(feed)
            if redirect is not None:
                return redirect

            return self._check_statuscode(feed)

    def fetch(self, url, etag=None, modified=None):
        """Fetch the feed at url; returns a Result or raises"""
        return self._parse_feed(url, etag, modified)
def get_pubdate(entry):
    """Try to determine the real pubDate of a feedparser entry

    This basically takes the updated_parsed value, but also uses some more
    advanced techniques to work around various issues with ugly feeds.

    "published" now also takes precedence over "updated" (with updated used as
    a fallback if published is not set/available). RSS' "pubDate" element is
    "updated", and will only be used if published_parsed is not available.

    Returns a Unix timestamp, or 0 if no date could be determined.
    """
    # Prefer the pre-parsed fields, "published" before "updated"
    for key in ('published_parsed', 'updated_parsed'):
        parsed = entry.get(key, None)
        if parsed is not None:
            # mktime_tz() wants a 10-tuple; append a zero UTC offset
            return mktime_tz(parsed + (0,))

    # See http://code.google.com/p/feedparser/issues/detail?id=327
    raw = entry.get('published', entry.get('updated', None))
    if raw is not None:
        # FIXME: This is kludgy. We should write our own date handler
        # and register it with feedparser.registerDateHandler() and/or
        # wait for feedparser to add support for this bogus date format.
        parsed = feedparser._parse_date(raw.replace(',', ''))
        if parsed is not None:
            return mktime_tz(parsed + (0,))

    # Cannot determine pubdate - party like it's 1970!
    return 0