gpodder/src/gpodder/feedcore.py

# -*- coding: utf-8 -*-
#
# gPodder - A media aggregator and podcast client
# Copyright (c) 2005-2010 Thomas Perl and the gPodder Team
#
# gPodder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# gPodder is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

#
# Generic feed fetching module for aggregators
# Thomas Perl <thpinfo.com>; 2009-06-11
#

import feedparser

def patch_feedparser():
    """Monkey-patch the Universal Feed Parser

    Applies three independent, best-effort fixes to the imported
    feedparser module:

    1. Replace _FeedParserMixin.mapContentType if the installed version
       still returns 'plain' unchanged instead of 'text/plain'.
    2. Install a _start_media_content handler that collects the
       attributes of Media RSS content elements in 'media_content'.
    3. Append a plain '*/*' entry to feedparser.ACCEPT_HEADER if it
       is not already one of the comma-separated entries.

    A failure in one patch is ignored so the others can still be
    applied. KeyboardInterrupt and SystemExit are not swallowed.
    """
    # Detect the 'plain' content type as 'text/plain'
    # http://code.google.com/p/feedparser/issues/detail?id=80
    def mapContentType2(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        # Only replace the method when the installed feedparser still
        # shows the bug, i.e. hands 'plain' back unchanged
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # Best effort: an incompatible feedparser simply stays unpatched
        pass

    # Fix parsing of Media RSS with feedparser, as described here:
    #   http://code.google.com/p/feedparser/issues/detail?id=100#c4
    def _start_media_content(self, attrsD):
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    try:
        feedparser._FeedParserMixin._start_media_content = _start_media_content
    except Exception:
        # Best effort, see above
        pass

    # Fix problem with the EA.com official podcast
    # https://bugs.gpodder.org/show_bug.cgi?id=588
    accept_header = getattr(feedparser, 'ACCEPT_HEADER', None)
    if accept_header is not None:
        # Compare stripped entries so that "a, */*" is recognised as
        # already containing the wildcard; entries that carry a quality
        # value (such as '*/*;q=0.1') intentionally do not count
        accepted = [entry.strip() for entry in accept_header.split(',')]
        if '*/*' not in accepted:
            feedparser.ACCEPT_HEADER = accept_header + ',*/*'

# Apply the patches once, as a side effect of importing this module
patch_feedparser()


class ExceptionWithData(Exception):
    """Exception that carries a payload in its ``data`` attribute.

    The payload is stored on the instance only; it is not forwarded to
    Exception.__init__(), so ``args`` stays empty.
    """
    def __init__(self, data):
        # Base initializer is called without arguments on purpose
        Exception.__init__(self)
        self.data = data

    def __str__(self):
        # Rendered as "<ClassName>: <payload>"
        label = self.__class__.__name__
        payload = str(self.data)
        return label + ': ' + payload


# Temporary errors
class Offline(Exception):
    """The parse result has no 'headers' (see Fetcher._check_offline)."""

class BadRequest(Exception):
    """Status 400, or any other 4xx status that is normalized to 400."""

class InternalServerError(Exception):
    """Status 500, or any other 5xx status that is normalized to 500."""

class WifiLogin(ExceptionWithData):
    """A 302 response that is an HTML page and not a feed.

    Raised by Fetcher._check_wifi_login_page(); ``data`` is feed.href.
    """

# Fatal errors
class Unsubscribe(Exception):
    """Status 403 ('forbidden') or 410 ('resource is gone')."""

class NotFound(Exception):
    """Status 404."""

class InvalidFeed(Exception):
    """The parse result is unusable (see Fetcher._check_valid_feed)."""

class UnknownStatusCode(ExceptionWithData):
    """No handling exists for the status code; ``data`` is that code."""

# Authentication error
class AuthenticationRequired(Exception):
    """Status 401 (raised by Fetcher._check_statuscode)."""

# Successful parsing of the feed
class UpdatedFeed(ExceptionWithData):
    """Status 200 or 302 (after normalization); ``data`` is the parse result."""

class NewLocation(ExceptionWithData):
    """Status 301, or a feed found by autodiscovery; ``data`` is the parse result."""

class NotModified(ExceptionWithData):
    """Status 304; ``data`` is the parse result."""


class Fetcher(object):
    """Download and parse a feed, reporting the outcome as an exception.

    fetch() never returns normally: the result of the operation, be it
    success or failure, is always raised as one of the exception classes
    defined in this module. Subclasses can hook into the process by
    overriding _get_handlers() and _resolve_url().
    """
    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def __init__(self, user_agent):
        """Store the value passed as "agent" to feedparser.parse()."""
        self.user_agent = user_agent

    def _get_handlers(self):
        """Provide additional urllib2 handler objects

        Subclasses can override this method to inject urllib2
        handler objects into the feedparser.parse() call to
        extend the functionality of this Fetcher (for proxies, ..)
        """
        return []

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None

    def _autodiscover_feed(self, feed):
        """Try to locate a real feed starting from a non-feed result.

        Candidates are every <link> of the result that has
        rel="alternate", a type listed in FEED_TYPES and a href,
        followed by whatever _resolve_url() returns for feed.href.
        Each candidate is parsed with autodiscovery switched off.

        Raises NewLocation (carrying the parse result) for the first
        candidate that yields UpdatedFeed. Every other error is
        swallowed on purpose, in which case this method returns None.
        """
        # Function-scope import keeps this method self-contained
        import sys

        try:
            # First, try all <link> elements if available
            for link in feed.feed.get('links', ()):
                is_feed = link.get('type', '') in self.FEED_TYPES
                is_alternate = link.get('rel', '') == 'alternate'
                url = link.get('href', None)

                if url and is_feed and is_alternate:
                    try:
                        self._parse_feed(url, None, None, False)
                    except UpdatedFeed:
                        # Found a working feed: let the outer handler
                        # turn it into NewLocation
                        raise
                    except Exception:
                        # This candidate failed; try the next one
                        pass

            # Second, try to resolve the URL
            url = self._resolve_url(feed.href)
            if url:
                self._parse_feed(url, None, None, False)
        except UpdatedFeed:
            # sys.exc_info() is used instead of "except X, name" or
            # "except X as name" so that this parses on old and new
            # Python versions alike
            updated = sys.exc_info()[1]
            raise NewLocation(updated.data)
        except Exception:
            # Autodiscovery is best effort; the caller goes on to
            # validate the original result
            pass

    def _check_offline(self, feed):
        """Raise Offline if the parse result has no 'headers' attribute."""
        if not hasattr(feed, 'headers'):
            raise Offline()

    def _check_wifi_login_page(self, feed):
        """Raise WifiLogin for a 302 response that is HTML and not a feed.

        A result without a status code is not treated as a login page;
        it is left for _check_valid_feed() to reject.
        """
        html_page = 'text/html' in feed.headers.get('content-type', '')
        # getattr: the status has not been validated at this point
        status = getattr(feed, 'status', None)
        if not feed.version and status == 302 and html_page:
            raise WifiLogin(feed.href)

    def _check_valid_feed(self, feed):
        """Raise InvalidFeed if the parse result cannot be used.

        That is the case if the result is None, has no status code, or
        has no detected feed version while the status is neither 304
        nor 401 (responses that are not expected to contain a feed).
        """
        if feed is None:
            raise InvalidFeed('feed is None')

        if not hasattr(feed, 'status'):
            raise InvalidFeed('feed has no status code')

        if not feed.version and feed.status != 304 and feed.status != 401:
            raise InvalidFeed('unknown feed type')

    def _normalize_status(self, status):
        """Map a status code onto one that _check_statuscode() handles.

        Codes with dedicated handling are returned unchanged; any other
        2xx/3xx/4xx/5xx code becomes 200/302/400/500 respectively.
        Codes outside of 200-599 are returned unchanged.
        """
        # Based on Mark Pilgrim's "Atom aggregator behaviour" article
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif status >= 200 and status < 300:
            return 200
        elif status >= 300 and status < 400:
            return 302
        elif status >= 400 and status < 500:
            return 400
        elif status >= 500 and status < 600:
            return 500
        else:
            return status

    def _check_statuscode(self, feed):
        """Raise the exception that corresponds to the feed's status.

        This method always raises: UpdatedFeed (200, 302), NewLocation
        (301), NotModified (304), BadRequest (400),
        AuthenticationRequired (401), Unsubscribe (403, 410), NotFound
        (404), InternalServerError (500) or UnknownStatusCode for
        anything that is left after normalization.
        """
        status = self._normalize_status(feed.status)
        if status == 200:
            raise UpdatedFeed(feed)
        elif status == 301:
            raise NewLocation(feed)
        elif status == 302:
            raise UpdatedFeed(feed)
        elif status == 304:
            raise NotModified(feed)
        elif status == 400:
            raise BadRequest('bad request')
        elif status == 401:
            raise AuthenticationRequired('authentication required')
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        """Parse the feed and raise the result.

        url is handed to feedparser.parse() together with the
        configured user agent, the etag and modified values and the
        handlers from _get_handlers(). If autodiscovery is true and the
        result is not a feed (and not a 304 response), alternative
        feed URLs are tried via _autodiscover_feed().

        Never returns normally; see _check_statuscode() and the other
        _check_*() methods for the exceptions that can be raised.
        """
        feed = feedparser.parse(url,
                agent=self.user_agent,
                modified=modified,
                etag=etag,
                handlers=self._get_handlers())

        self._check_offline(feed)
        self._check_wifi_login_page(feed)

        # The status is only validated further down, so read it
        # defensively here: a result without one has to end up as
        # InvalidFeed, not as an AttributeError
        status = getattr(feed, 'status', None)
        if status != 304 and not feed.version and autodiscovery:
            self._autodiscover_feed(feed)

        self._check_valid_feed(feed)
        self._check_statuscode(feed)

    def fetch(self, url, etag=None, modified=None):
        """Download a feed, with optional etag and modified values

        This method will always raise an exception that tells
        the calling code the result of the fetch operation. See
        the code for the feedcore module for all the possible
        exception types.
        """
        self._parse_feed(url, etag, modified)
First cut of new feedcore module + related changes This might break a lot, but it makes feed parsing and updating so much cleaner, and also helps with error reporting, because exceptions are bubbled up to the UI. Removed: * Changing feed URL (this introduces too many problems) * Support for FTP proxy servers (who uses FTP for podcasts?) 2009-06-12 00:51:13 +02:00			`# -- coding: utf-8 --`
			`#`
			`# gPodder - A media aggregator and podcast client`
Add 2010 to the years in copyright notice The following command has been used: grep -r '2005-2009 Thomas Perl and the gPodder Team' * \| cut -d: -f1 \| xargs sed -i -e 's/2005-2009 Thomas Perl and the gPodder Team/2005-2010 Thomas Perl and the gPodder Team/g' 2010-01-02 17:35:42 +01:00			`# Copyright (c) 2005-2010 Thomas Perl and the gPodder Team`
First cut of new feedcore module + related changes This might break a lot, but it makes feed parsing and updating so much cleaner, and also helps with error reporting, because exceptions are bubbled up to the UI. Removed: * Changing feed URL (this introduces too many problems) * Support for FTP proxy servers (who uses FTP for podcasts?) 2009-06-12 00:51:13 +02:00			`#`
			`# gPodder is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation; either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# gPodder is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`#`

			`#`
			`# Generic feed fetching module for aggregators`
			`# Thomas Perl <thpinfo.com>; 2009-06-11`
			`#`

			`import feedparser`

			`def patch_feedparser():`
Add EA.com compatibility to feedparser (bug 588) Modify feedparser's default "Accept:" header to also include a plain "/" content type, so that the web server at EA.com will happily return the RSS feed contents instead of an error. Thanks to Romain Janvier for reporting this bug. 2009-10-05 19:27:25 +02:00			`"""Monkey-patch the Universal Feed Parser"""`
			`# Detect the 'plain' content type as 'text/plain'`
			`# http://code.google.com/p/feedparser/issues/detail?id=80`
First cut of new feedcore module + related changes This might break a lot, but it makes feed parsing and updating so much cleaner, and also helps with error reporting, because exceptions are bubbled up to the UI. Removed: * Changing feed URL (this introduces too many problems) * Support for FTP proxy servers (who uses FTP for podcasts?) 2009-06-12 00:51:13 +02:00			`def mapContentType2(self, contentType):`
			`contentType = contentType.lower()`
			`if contentType == 'text' or contentType == 'plain':`
			`contentType = 'text/plain'`
			`elif contentType == 'html':`
			`contentType = 'text/html'`
			`elif contentType == 'xhtml':`
			`contentType = 'application/xhtml+xml'`
			`return contentType`

			`try:`
			`if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':`
			`feedparser._FeedParserMixin.mapContentType = mapContentType2`
			`except:`
			`pass`
Monkey-patch feedparser to support Media RSS (bug 507) Support Media RSS' media_content tag for Media RSS podcasts. See http://code.google.com/p/feedparser/issues/detail?id=100#c4 2009-07-18 16:23:21 +02:00
			`# Fix parsing of Media RSS with feedparser, as described here:`
			`# http://code.google.com/p/feedparser/issues/detail?id=100#c4`
			`def _start_media_content(self, attrsD):`
			`context = self._getContext()`
			`context.setdefault('media_content', [])`
			`context['media_content'].append(attrsD)`

			`try:`
			`feedparser._FeedParserMixin._start_media_content = _start_media_content`
			`except:`
			`pass`
First cut of new feedcore module + related changes This might break a lot, but it makes feed parsing and updating so much cleaner, and also helps with error reporting, because exceptions are bubbled up to the UI. Removed: * Changing feed URL (this introduces too many problems) * Support for FTP proxy servers (who uses FTP for podcasts?) 2009-06-12 00:51:13 +02:00
Add EA.com compatibility to feedparser (bug 588) Modify feedparser's default "Accept:" header to also include a plain "/" content type, so that the web server at EA.com will happily return the RSS feed contents instead of an error. Thanks to Romain Janvier for reporting this bug. 2009-10-05 19:27:25 +02:00			`# Fix problem with the EA.com official podcast`
			`# https://bugs.gpodder.org/show_bug.cgi?id=588`
			`if '*/*' not in feedparser.ACCEPT_HEADER.split(','):`
			`feedparser.ACCEPT_HEADER += ',*/*'`

First cut of new feedcore module + related changes This might break a lot, but it makes feed parsing and updating so much cleaner, and also helps with error reporting, because exceptions are bubbled up to the UI. Removed: * Changing feed URL (this introduces too many problems) * Support for FTP proxy servers (who uses FTP for podcasts?) 2009-06-12 00:51:13 +02:00			`patch_feedparser()`


			`class ExceptionWithData(Exception):`
			`"""Base exception with additional payload"""`
			`def __init__(self, data):`
			`Exception.__init__(self)`
			`self.data = data`

			`def __str__(self):`
			`return '%s: %s' % (self.__class__.__name__, str(self.data))`


			`# Temporary errors`
			`class Offline(Exception): pass`
			`class BadRequest(Exception): pass`
			`class InternalServerError(Exception): pass`
			`class WifiLogin(ExceptionWithData): pass`

			`# Fatal errors`
			`class Unsubscribe(Exception): pass`
			`class NotFound(Exception): pass`
			`class InvalidFeed(Exception): pass`
			`class UnknownStatusCode(ExceptionWithData): pass`

			`# Authentication error`
			`class AuthenticationRequired(Exception): pass`

			`# Successful parsing of the feed`
			`class UpdatedFeed(ExceptionWithData): pass`
			`class NewLocation(ExceptionWithData): pass`
			`class NotModified(ExceptionWithData): pass`



			`class Fetcher(object):`
			`# Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html`
			`FEED_TYPES = ('application/rss+xml',`
			`'application/atom+xml',`
			`'application/rdf+xml',`
			`'application/xml',`
			`'text/xml')`

			`def __init__(self, user_agent):`
			`self.user_agent = user_agent`

			`def _get_handlers(self):`
			`"""Provide additional urllib2 handler objects`

			`Subclasses can override this method to inject urllib2`
			`handler objects into the feedparser.parse() call to`
			`extend the functionality of this Fetcher (for proxies, ..)`
			`"""`
			`return []`

			`def _resolve_url(self, url):`
			`"""Provide additional ways of resolving an URL`

			`Subclasses can override this method to provide more`
			`ways of resolving a given URL to a feed URL. If the`
			`Fetcher is in "autodiscovery" mode, it will try this`
			`method as a last resort for coming up with a feed URL.`
			`"""`
			`return None`

			`def _autodiscover_feed(self, feed):`
			`try:`
			`# First, try all <link> elements if available`
			`for link in feed.feed.get('links', ()):`
			`is_feed = link.get('type', '') in self.FEED_TYPES`
			`is_alternate = link.get('rel', '') == 'alternate'`
			`url = link.get('href', None)`

			`if url and is_feed and is_alternate:`
			`try:`
			`self._parse_feed(url, None, None, False)`
			`except UpdatedFeed, updated:`
			`raise`
			`except Exception:`
			`pass`

			`# Second, try to resolve the URL`
			`url = self._resolve_url(feed.href)`
			`if url:`
			`self._parse_feed(url, None, None, False)`
			`except UpdatedFeed, updated:`
			`raise NewLocation(updated.data)`
			`except Exception, e:`
			`pass`

			`def _check_offline(self, feed):`
			`if not hasattr(feed, 'headers'):`
			`raise Offline()`

			`def _check_wifi_login_page(self, feed):`
			`html_page = 'text/html' in feed.headers.get('content-type', '')`
			`if not feed.version and feed.status == 302 and html_page:`
			`raise WifiLogin(feed.href)`

			`def _check_valid_feed(self, feed):`
			`if feed is None:`
			`raise InvalidFeed('feed is None')`

			`if not hasattr(feed, 'status'):`
			`raise InvalidFeed('feed has no status code')`

Fix adding password-protected feeds (bug 544) Don't require feed.version when the feed could not be retrieve due to authentication request. 2009-09-02 18:46:00 +02:00			`if not feed.version and feed.status != 304 and feed.status != 401:`
First cut of new feedcore module + related changes This might break a lot, but it makes feed parsing and updating so much cleaner, and also helps with error reporting, because exceptions are bubbled up to the UI. Removed: * Changing feed URL (this introduces too many problems) * Support for FTP proxy servers (who uses FTP for podcasts?) 2009-06-12 00:51:13 +02:00			`raise InvalidFeed('unknown feed type')`

			`def _normalize_status(self, status):`
			`# Based on Mark Pilgrim's "Atom aggregator behaviour" article`
			`if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):`
			`return status`
			`elif status >= 200 and status < 300:`
			`return 200`
			`elif status >= 300 and status < 400:`
			`return 302`
			`elif status >= 400 and status < 500:`
			`return 400`
			`elif status >= 500 and status < 600:`
			`return 500`
			`else:`
			`return status`

			`def _check_statuscode(self, feed):`
			`status = self._normalize_status(feed.status)`
			`if status == 200:`
			`raise UpdatedFeed(feed)`
			`elif status == 301:`
			`raise NewLocation(feed)`
			`elif status == 302:`
			`raise UpdatedFeed(feed)`
			`elif status == 304:`
			`raise NotModified(feed)`
			`elif status == 400:`
			`raise BadRequest('bad request')`
			`elif status == 401:`
			`raise AuthenticationRequired('authentication required')`
			`elif status == 403:`
			`raise Unsubscribe('forbidden')`
			`elif status == 404:`
			`raise NotFound('not found')`
			`elif status == 410:`
			`raise Unsubscribe('resource is gone')`
			`elif status == 500:`
			`raise InternalServerError('internal server error')`
			`else:`
			`raise UnknownStatusCode(status)`

			`def _parse_feed(self, url, etag, modified, autodiscovery=True):`
			`"""Parse the feed and raise the result."""`
			`feed = feedparser.parse(url,`
			`agent=self.user_agent,`
			`modified=modified,`
			`etag=etag,`
			`handlers=self._get_handlers())`

			`self._check_offline(feed)`
			`self._check_wifi_login_page(feed)`

			`if feed.status != 304 and not feed.version and autodiscovery:`
			`self._autodiscover_feed(feed)`

			`self._check_valid_feed(feed)`
			`self._check_statuscode(feed)`

			`def fetch(self, url, etag=None, modified=None):`
			`"""Download a feed, with optional etag an modified values`

			`This method will always raise an exception that tells`
			`the calling code the result of the fetch operation. See`
			`the code for the feedcore module for all the possible`
			`exception types.`
			`"""`
			`self._parse_feed(url, etag, modified)`