Port from feedparser to podcastparser

Thomas Perl 2016-02-06 17:46:07 +01:00
parent 28788865fb
commit 4189cf48fb
7 changed files with 107 additions and 348 deletions

README
View File

@@ -31,7 +31,7 @@
[ DEPENDENCIES ]
- Python 2.7 or newer http://python.org/
- Feedparser 5.1.2 or newer http://code.google.com/p/feedparser/
- Podcastparser 0.6.0 or newer http://gpodder.org/podcastparser/
- mygpoclient 1.7 or newer http://gpodder.org/mygpoclient/
- Python D-Bus bindings
@@ -39,7 +39,7 @@
the dummy (no-op) D-Bus module provided in "tools/fake-dbus-module/".
For quick testing, you can use the script tools/localdepends.py to
install local copies of feedparser and mygpoclient into "src/" from
install local copies of podcastparser and mygpoclient into "src/" from
PyPI. With this, you get a self-contained gPodder CLI codebase.

View File

@@ -37,12 +37,12 @@ import locale
# Check if real hard dependencies are available
try:
import feedparser
import podcastparser
except ImportError:
print """
Error: Module "feedparser" (python-feedparser) not found.
The feedparser module can be downloaded from
http://code.google.com/p/feedparser/
Error: Module "podcastparser" (python-podcastparser) not found.
The podcastparser module can be downloaded from
http://gpodder.org/podcastparser/
From a source checkout, you can download local copies of all
CLI dependencies for debugging (will be placed into "src/"):
@@ -50,7 +50,7 @@ except ImportError:
python tools/localdepends.py
"""
sys.exit(1)
del feedparser
del podcastparser
try:
import mygpoclient
@@ -58,7 +58,7 @@ except ImportError:
print """
Error: Module "mygpoclient" (python-mygpoclient) not found.
The mygpoclient module can be downloaded from
http://thp.io/2010/mygpoclient/
http://gpodder.org/mygpoclient/
From a source checkout, you can download local copies of all
CLI dependencies for debugging (will be placed into "src/"):

View File

@@ -22,11 +22,17 @@
# Thomas Perl <thp@gpodder.org>; 2009-06-11
#
import feedparser
import podcastparser
from gpodder import util
import logging
logger = logging.getLogger(__name__)
from urllib2 import HTTPError
from HTMLParser import HTMLParser
import urlparse
try:
# Python 2
from rfc822 import mktime_tz
@@ -35,75 +41,6 @@ except ImportError:
from email.utils import mktime_tz
# Version check to avoid bug 1648
feedparser_version = tuple(int(x) if x.isdigit() else x
for x in feedparser.__version__.split('.'))
feedparser_miniumum_version = (5, 1, 2)
if feedparser_version < feedparser_miniumum_version:
installed_version = feedparser.__version__
required_version = '.'.join(str(x) for x in feedparser_miniumum_version)
logger.warn('Your feedparser is too old. Installed: %s, recommended: %s',
installed_version, required_version)
def patch_feedparser():
"""Monkey-patch the Universal Feed Parser"""
# Detect the 'plain' content type as 'text/plain'
# http://code.google.com/p/feedparser/issues/detail?id=80
def mapContentType2(self, contentType):
contentType = contentType.lower()
if contentType == 'text' or contentType == 'plain':
contentType = 'text/plain'
elif contentType == 'html':
contentType = 'text/html'
elif contentType == 'xhtml':
contentType = 'application/xhtml+xml'
return contentType
try:
if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
feedparser._FeedParserMixin.mapContentType = mapContentType2
except:
pass
# Fix parsing of Media RSS with feedparser, as described here:
# http://code.google.com/p/feedparser/issues/detail?id=100#c4
def _start_media_content(self, attrsD):
context = self._getContext()
context.setdefault('media_content', [])
context['media_content'].append(attrsD)
try:
feedparser._FeedParserMixin._start_media_content = _start_media_content
except:
pass
# Fix problem with the EA.com official podcast
# https://bugs.gpodder.org/show_bug.cgi?id=588
if '*/*' not in feedparser.ACCEPT_HEADER.split(','):
feedparser.ACCEPT_HEADER += ',*/*'
# Fix problem with YouTube feeds and pubDate/atom:modified
# https://bugs.gpodder.org/show_bug.cgi?id=1492
# http://code.google.com/p/feedparser/issues/detail?id=310
def _end_updated(self):
value = self.pop('updated')
parsed_value = feedparser._parse_date(value)
overwrite = ('youtube.com' not in self.baseuri)
try:
self._save('updated_parsed', parsed_value, overwrite=overwrite)
except TypeError, te:
logger.warn('Your feedparser version is too old: %s', te)
try:
feedparser._FeedParserMixin._end_updated = _end_updated
except:
pass
patch_feedparser()
class ExceptionWithData(Exception):
"""Base exception with additional payload"""
def __init__(self, data):
@@ -114,7 +51,6 @@ class ExceptionWithData(Exception):
return '%s: %s' % (self.__class__.__name__, str(self.data))
# Temporary errors
class Offline(Exception): pass
class BadRequest(Exception): pass
class InternalServerError(Exception): pass
class WifiLogin(ExceptionWithData): pass
@@ -137,6 +73,26 @@ class Result:
self.feed = feed
class FeedAutodiscovery(HTMLParser):
def __init__(self, base):
HTMLParser.__init__(self)
self._base = base
self._resolved_url = None
def handle_starttag(self, tag, attrs):
if tag == 'link':
attrs = dict(attrs)
is_feed = attrs.get('type', '') in Fetcher.FEED_TYPES
is_alternate = attrs.get('rel', '') == 'alternate'
url = attrs.get('href', None)
url = urlparse.urljoin(self._base, url)
if is_feed and is_alternate and url:
logger.info('Feed autodiscovery: %s', url)
self._resolved_url = url
class Fetcher(object):
# Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
FEED_TYPES = ('application/rss+xml',
@@ -145,9 +101,6 @@ class Fetcher(object):
'application/xml',
'text/xml')
def __init__(self, user_agent):
self.user_agent = user_agent
def _resolve_url(self, url):
"""Provide additional ways of resolving an URL
@@ -158,45 +111,6 @@
"""
return None
def _autodiscover_feed(self, feed):
# First, try all <link> elements if available
for link in feed.feed.get('links', ()):
is_feed = link.get('type', '') in self.FEED_TYPES
is_alternate = link.get('rel', '') == 'alternate'
url = link.get('href', None)
if url and is_feed and is_alternate:
try:
return self._parse_feed(url, None, None, False)
except Exception, e:
pass
# Second, try to resolve the URL
url = self._resolve_url(feed.href)
if url:
result = self._parse_feed(url, None, None, False)
result.status = NEW_LOCATION
return result
def _check_offline(self, feed):
if not hasattr(feed, 'headers'):
raise Offline()
def _check_wifi_login_page(self, feed):
html_page = 'text/html' in feed.headers.get('content-type', '')
if not feed.version and feed.status == 302 and html_page:
raise WifiLogin(feed.href)
def _check_valid_feed(self, feed):
if feed is None:
raise InvalidFeed('feed is None')
if not hasattr(feed, 'status'):
raise InvalidFeed('feed has no status code')
if not feed.version and feed.status != 304 and feed.status != 401:
raise InvalidFeed('unknown feed type')
def _normalize_status(self, status):
# Based on Mark Pilgrim's "Atom aggregator behaviour" article
if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
@@ -212,16 +126,9 @@ class Fetcher(object):
else:
return status
def _check_rss_redirect(self, feed):
new_location = feed.feed.get('newlocation', None)
if new_location:
feed.href = feed.feed.newlocation
return Result(NEW_LOCATION, feed)
def _check_statuscode(self, response, feed):
status = self._normalize_status(response.getcode())
return None
def _check_statuscode(self, feed):
status = self._normalize_status(feed.status)
if status == 200:
return Result(UPDATED_FEED, feed)
elif status == 301:
@@ -247,69 +154,43 @@
raise UnknownStatusCode(status)
def _parse_feed(self, url, etag, modified, autodiscovery=True):
headers = {}
if modified is not None:
headers['If-Modified-Since'] = modified
if etag is not None:
headers['If-None-Match'] = etag
if url.startswith('file://'):
is_local = True
url = url[len('file://'):]
stream = open(url)
else:
is_local = False
try:
stream = util.urlopen(url, headers)
except HTTPError as e:
return self._check_statuscode(e, e.geturl())
feed = feedparser.parse(url,
agent=self.user_agent,
modified=modified,
etag=etag)
if stream.headers.get('content-type', '').startswith('text/html'):
if autodiscovery:
ad = FeedAutodiscovery(url)
ad.feed(stream.read())
if ad._resolved_url:
try:
self._parse_feed(ad._resolved_url, None, None, False)
return Result(NEW_LOCATION, ad._resolved_url)
except Exception as e:
logger.warn('Feed autodiscovery failed', exc_info=True)
if is_local:
if feed.version:
feed.headers = {}
return Result(UPDATED_FEED, feed)
else:
raise InvalidFeed('Not a valid feed file')
else:
self._check_offline(feed)
self._check_wifi_login_page(feed)
# Second, try to resolve the URL
url = self._resolve_url(url)
if url:
return Result(NEW_LOCATION, url)
if feed.status != 304 and not feed.version and autodiscovery:
feed = self._autodiscover_feed(feed).feed
raise InvalidFeed('Got HTML document instead')
self._check_valid_feed(feed)
redirect = self._check_rss_redirect(feed)
if redirect is not None:
return redirect
return self._check_statuscode(feed)
feed = podcastparser.parse(url, stream)
return self._check_statuscode(stream, feed)
def fetch(self, url, etag=None, modified=None):
return self._parse_feed(url, etag, modified)
def get_pubdate(entry):
"""Try to determine the real pubDate of a feedparser entry
This basically takes the updated_parsed value, but also uses some more
advanced techniques to work around various issues with ugly feeds.
"published" now also takes precedence over "updated" (with updated used as
a fallback if published is not set/available). RSS' "pubDate" element is
"updated", and will only be used if published_parsed is not available.
If parsing the date into seconds since epoch returns an error (date is
before epoch or after the end of time), epoch is used as fallback.
This fixes https://bugs.gpodder.org/show_bug.cgi?id=2023
"""
pubdate = entry.get('published_parsed', None)
if pubdate is None:
pubdate = entry.get('updated_parsed', None)
if pubdate is None:
# Cannot determine pubdate - party like it's 1970!
return 0
try:
pubtimeseconds = mktime_tz(pubdate + (0,))
return pubtimeseconds
except(OverflowError,ValueError):
logger.warn('bad pubdate %s is before epoch or after end of time (2038)',pubdate)
return 0

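The rewritten _parse_feed() above shows the shape of the new API: gPodder now opens the URL itself (podcastparser does no HTTP handling) and passes the stream to podcastparser.parse(), which returns a plain dict instead of feedparser's result object. A minimal sketch of that flow, using only the keys the diff above already relies on (the feed URL is hypothetical):

    import urllib2
    import podcastparser

    url = 'http://example.com/feed.xml'  # hypothetical feed URL
    request = urllib2.Request(url, headers={'User-agent': 'gPodder'})
    stream = urllib2.urlopen(request)

    # parse() takes the feed URL and a file-like stream and returns a dict
    feed = podcastparser.parse(url, stream)

    print feed['title']
    for episode in feed['episodes']:
        print episode['title'], episode['published']
        for enclosure in episode['enclosures']:
            print ' ', enclosure['url'], enclosure['mime_type'], enclosure['file_size']
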
View File

@@ -44,7 +44,7 @@ import time
import datetime
import hashlib
import feedparser
import podcastparser
import collections
import string
@@ -60,12 +60,9 @@ class gPodderFetcher(feedcore.Fetcher):
"""
custom_handlers = []
def __init__(self):
feedcore.Fetcher.__init__(self, gpodder.user_agent)
def fetch_channel(self, channel):
etag = channel.http_etag
modified = feedparser._parse_date(channel.http_last_modified)
modified = podcastparser.parse_pubdate(channel.http_last_modified)
# If we have a username or password, rebuild the url with them included
# Note: using a HTTPBasicAuthHandler would be pain because we need to
# know the realm. It can be done, but I think this method works, too
@@ -144,145 +141,54 @@ class PodcastEpisode(PodcastModelObject):
youtube.is_video_link(self.link))
@classmethod
def from_feedparser_entry(cls, entry, channel):
def from_podcastparser_entry(cls, entry, channel):
episode = cls(channel)
episode.guid = entry.get('id', '')
episode.guid = entry['guid']
episode.title = entry['title']
episode.link = entry['link']
episode.description = entry['description']
episode.total_time = entry['total_time']
episode.published = entry['published']
episode.payment_url = entry['payment_url']
# Replace multi-space and newlines with single space (Maemo bug 11173)
episode.title = re.sub('\s+', ' ', entry.get('title', ''))
episode.link = entry.get('link', '')
if 'content' in entry and len(entry['content']) and \
entry['content'][0].get('type', '') == 'text/html':
episode.description = entry['content'][0].value
else:
episode.description = entry.get('summary', '')
audio_available = any(enclosure['mime_type'].startswith('audio/') for enclosure in entry['enclosures'])
video_available = any(enclosure['mime_type'].startswith('video/') for enclosure in entry['enclosures'])
# Fallback to subtitle if summary is not available
if not episode.description:
episode.description = entry.get('subtitle', '')
try:
total_time = 0
# Parse iTunes-specific podcast duration metadata
itunes_duration = entry.get('itunes_duration', '')
if itunes_duration:
total_time = util.parse_time(itunes_duration)
# Parse time from YouTube descriptions if it's a YouTube feed
if youtube.is_youtube_guid(episode.guid):
result = re.search(r'Time:<[^>]*>\n<[^>]*>([:0-9]*)<',
episode.description)
if result:
youtube_duration = result.group(1)
total_time = util.parse_time(youtube_duration)
episode.total_time = total_time
except:
pass
episode.published = feedcore.get_pubdate(entry)
enclosures = entry.get('enclosures', [])
media_rss_content = entry.get('media_content', [])
audio_available = any(e.get('type', '').startswith('audio/') \
for e in enclosures + media_rss_content)
video_available = any(e.get('type', '').startswith('video/') \
for e in enclosures + media_rss_content)
# XXX: Make it possible for hooks/extensions to override this by
# giving them a list of enclosures and the "self" object (podcast)
# and letting them sort and/or filter the list of enclosures to
# get the desired enclosure picked by the algorithm below.
filter_and_sort_enclosures = lambda x: x
payment_info = [link['href'] for link in entry.get('links', [])
if link['rel'] == 'payment']
if payment_info:
episode.payment_url = payment_info[0]
# Enclosures
for e in filter_and_sort_enclosures(enclosures):
episode.mime_type = e.get('type', 'application/octet-stream')
if episode.mime_type == '':
# See Maemo bug 10036
logger.warn('Fixing empty mimetype in ugly feed')
episode.mime_type = 'application/octet-stream'
if '/' not in episode.mime_type:
continue
for enclosure in entry['enclosures']:
episode.mime_type = enclosure['mime_type']
# Skip images in feeds if audio or video is available (bug 979)
# This must (and does) also look in Media RSS enclosures (bug 1430)
if episode.mime_type.startswith('image/') and \
(audio_available or video_available):
if episode.mime_type.startswith('image/') and (audio_available or video_available):
continue
# If we have audio or video available later on, skip
# 'application/octet-stream' data types (fixes Linux Outlaws)
if episode.mime_type == 'application/octet-stream' and \
(audio_available or video_available):
if episode.mime_type == 'application/octet-stream' and (audio_available or video_available):
continue
episode.url = util.normalize_feed_url(e.get('href', ''))
episode.url = util.normalize_feed_url(enclosure['url'])
if not episode.url:
continue
try:
episode.file_size = int(e.length) or -1
except:
episode.file_size = -1
episode.file_size = enclosure['file_size']
return episode
# Media RSS content
for m in filter_and_sort_enclosures(media_rss_content):
episode.mime_type = m.get('type', 'application/octet-stream')
if '/' not in episode.mime_type:
continue
# Skip images in Media RSS if we have audio/video (bug 1444)
if episode.mime_type.startswith('image/') and \
(audio_available or video_available):
continue
episode.url = util.normalize_feed_url(m.get('url', ''))
if not episode.url:
continue
try:
episode.file_size = int(m.get('filesize', 0)) or -1
except:
episode.file_size = -1
try:
episode.total_time = int(m.get('duration', 0)) or 0
except:
episode.total_time = 0
# Brute-force detection of the episode link
episode.url = util.normalize_feed_url(entry['link'])
if not episode.url:
return None
if any(mod.is_video_link(episode.url) for mod in (youtube, vimeo, escapist_videos)):
return episode
# Brute-force detection of any links
for l in entry.get('links', ()):
episode.url = util.normalize_feed_url(l.get('href', ''))
if not episode.url:
continue
# Check if we can resolve this link to a audio/video file
filename, extension = util.filename_from_url(episode.url)
file_type = util.file_type_by_extension(extension)
if ( youtube.is_video_link(episode.url) or \
vimeo.is_video_link(episode.url) or \
escapist_videos.is_video_link(episode.url) ):
return episode
# Check if we can resolve this link to a audio/video file
filename, extension = util.filename_from_url(episode.url)
file_type = util.file_type_by_extension(extension)
if file_type is None and hasattr(l, 'type'):
extension = util.extension_from_mimetype(l.type)
file_type = util.file_type_by_extension(extension)
# The link points to a audio or video file - use it!
if file_type is not None:
return episode
# The link points to a audio or video file - use it!
if file_type is not None:
return episode
return None
@@ -1009,29 +915,11 @@ class PodcastChannel(PodcastModelObject):
self.remove_unreachable_episodes(existing, seen_guids, max_episodes)
def _consume_updated_feed(self, feed, max_episodes=0):
# Cover art URL
cover_url = None
if hasattr(feed.feed, 'image'):
for attribute in ('href', 'url'):
new_value = getattr(feed.feed.image, attribute, None)
if new_value is not None:
cover_url = new_value
elif hasattr(feed.feed, 'icon'):
cover_url = feed.feed.icon
# Payment URL information
payment_info = [link['href'] for link in feed.feed.get('links', [])
if link['rel'] == 'payment']
if payment_info:
payment_url = payment_info[0]
else:
payment_url = None
self._consume_metadata(feed.feed.get('title', self.url),
feed.feed.get('link', self.link),
feed.feed.get('subtitle', self.description),
cover_url,
payment_url)
self._consume_metadata(feed.get('title', self.url),
feed.get('link', self.link),
feed.get('description', ''),
feed.get('cover_url', None),
feed.get('payment_url', None))
# Load all episodes to update them properly.
existing = self.get_all_episodes()
@@ -1040,7 +928,7 @@ class PodcastChannel(PodcastModelObject):
# because if the feed lists items in ascending order and has >
# max_episodes old episodes, new episodes will not be shown.
# See also: gPodder Bug 1186
entries = sorted(feed.entries, key=feedcore.get_pubdate, reverse=True)
entries = sorted(feed.get('episodes', []), key=lambda episode: episode['published'], reverse=True)
# We can limit the maximum number of entries that gPodder will parse
if max_episodes > 0 and len(entries) > max_episodes:
@@ -1060,18 +948,8 @@
# Search all entries for new episodes
for entry in entries:
episode = self.EpisodeClass.from_feedparser_entry(entry, self)
episode = self.EpisodeClass.from_podcastparser_entry(entry, self)
if episode is not None:
if not episode.title:
logger.warn('Using filename as title for %s', episode.url)
basename = os.path.basename(episode.url)
episode.title, ext = os.path.splitext(basename)
# Maemo bug 12073
if not episode.guid:
logger.warn('Using download URL as GUID for %s', episode.title)
episode.guid = episode.url
seen_guids.add(episode.guid)
else:
continue
@@ -1140,12 +1018,14 @@ class PodcastChannel(PodcastModelObject):
elif result.status == feedcore.UPDATED_FEED:
self._consume_updated_feed(result.feed, max_episodes)
elif result.status == feedcore.NEW_LOCATION:
url = result.feed.href
url = result.feed
logger.info('New feed location: %s => %s', self.url, url)
if url in set(x.url for x in self.model.get_podcasts()):
raise Exception('Already subscribed to ' + url)
self.url = url
self._consume_updated_feed(result.feed, max_episodes)
# With the updated URL, fetch the feed again
self.update(max_episodes)
return
elif result.status == feedcore.NOT_MODIFIED:
pass

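In model.py the HTTP Last-Modified handling switches from feedparser._parse_date() to podcastparser.parse_pubdate(), which takes a date string and returns seconds since the epoch. A small sketch of that call, with a made-up header value, assuming the single-argument form used in the diff above:

    import podcastparser

    last_modified = 'Sat, 06 Feb 2016 17:46:07 +0100'  # hypothetical Last-Modified value
    timestamp = podcastparser.parse_pubdate(last_modified)
    print timestamp  # Unix timestamp (an int); unparseable input should fall back to 0
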
View File

@@ -62,8 +62,6 @@ import webbrowser
import mimetypes
import itertools
import feedparser
import StringIO
import xml.dom.minidom

View File

@@ -24,7 +24,7 @@ tmp_dir = tempfile.mkdtemp()
MODULES = [
# Module name, Regex-file chooser (1st group = location in "src/")
('feedparser', r'feedparser-[0-9.]+/feedparser/(feedparser.py)'),
('podcastparser', r'podcastparser-[0-9.]+/(podcastparser.py)'),
('mygpoclient', r'mygpoclient-[0-9.]+/(mygpoclient/[^/]*\.py)')
]

View File

@@ -252,7 +252,7 @@ int main(int argc, char** argv)
// decref GtkModule
#endif
// XXX: Test for feedparser, mygpoclient, dbus
// XXX: Test for podcastparser, mygpoclient, dbus
MainPy = (void*)PyFile_FromString(MAIN_MODULE, "r");
if (MainPy == NULL) { BAILOUT("Cannot load main file") }