Cleanup feed parsing, remove some special-cases

Thomas Perl 2012-12-28 15:34:20 +01:00
parent 0733bfd248
commit 5b552131f8
2 changed files with 61 additions and 106 deletions

View File

@@ -299,15 +299,6 @@ def get_pubdate(entry):
if pubdate is None:
pubdate = entry.get('updated_parsed', None)
if pubdate is None:
# See http://code.google.com/p/feedparser/issues/detail?id=327
updated = entry.get('published', entry.get('updated', None))
if updated is not None:
# FIXME: This is kludgy. We should write our own date handler
# and register it with feedparser.registerDateHandler() and/or
# wait for feedparser to add support for this bogus date format.
pubdate = feedparser._parse_date(updated.replace(',', ''))
if pubdate is None:
# Cannot determine pubdate - party like it's 1970!
return 0

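This hunk drops the special case referencing feedparser issue 327, which re-parsed the raw 'published'/'updated' string with commas stripped; after the change, get_pubdate relies only on the *_parsed fields and returns 0 when neither is present. A minimal sketch of the remaining lookup order, assuming feedparser's struct_time-style *_parsed values and using calendar.timegm as a stand-in for whatever conversion the real function applies:

import calendar
import time

def get_pubdate_sketch(entry):
    # Prefer 'published_parsed', fall back to 'updated_parsed'.
    pubdate = entry.get('published_parsed', None)
    if pubdate is None:
        pubdate = entry.get('updated_parsed', None)
    if pubdate is None:
        # Cannot determine pubdate - party like it's 1970!
        return 0
    # feedparser's *_parsed values are UTC struct_time tuples.
    return calendar.timegm(pubdate)

entry = {'updated_parsed': time.gmtime(1356705260)}   # hypothetical entry
print(get_pubdate_sketch(entry))                      # 1356705260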
View File

@@ -188,6 +188,10 @@ class PodcastEpisode(PodcastModelObject):
else:
episode.description = entry.get('summary', '')
# Fallback to subtitle if summary is not available
if not episode.description:
episode.description = entry.get('subtitle', '')
try:
total_time = 0
@@ -208,10 +212,6 @@ class PodcastEpisode(PodcastModelObject):
except:
pass
# Fallback to subtitle if summary is not available
if not episode.description:
episode.description = entry.get('subtitle', '')
episode.published = feedcore.get_pubdate(entry)
enclosures = entry.get('enclosures', [])
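These two hunks move the same three lines: the subtitle fallback now runs right after the summary lookup instead of after the duration parsing, so the description is settled in one place. A minimal sketch of the resulting precedence (ignoring the itunes/content branch above), with a hypothetical entry:

# Hypothetical entry with no 'summary' field:
entry = {'subtitle': 'Short teaser text'}

description = entry.get('summary', '')
if not description:
    # Fallback to subtitle if summary is not available
    description = entry.get('subtitle', '')

print(description)   # 'Short teaser text'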
@@ -315,14 +315,6 @@ class PodcastEpisode(PodcastModelObject):
if file_type is not None:
return episode
# Scan MP3 links in description text
mp3s = re.compile(r'http://[^"]*\.mp3')
for content in entry.get('content', ()):
html = content.value
for match in mp3s.finditer(html):
episode.url = match.group(0)
return episode
return None
def __init__(self, channel):
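The removed block was a last-resort special case: if no enclosure or usable link yielded a download URL, the entry's content HTML was scanned for the first http://...mp3 URL. A self-contained illustration of what that regex matched, on a hypothetical snippet:

import re

mp3s = re.compile(r'http://[^"]*\.mp3')
html = '<p><a href="http://example.com/pod/ep42.mp3">Listen</a></p>'   # hypothetical content
match = mp3s.search(html)
if match is not None:
    print(match.group(0))   # http://example.com/pod/ep42.mp3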
@@ -764,15 +756,6 @@ class PodcastEpisode(PodcastModelObject):
else:
return '-'
def is_duplicate(self, episode):
if self.title == episode.title and self.published == episode.published:
logger.warn('Possible duplicate detected: %s', self.title)
return True
return False
def duplicate_id(self):
return hash((self.title, self.published))
def update_from(self, episode):
for k in ('title', 'url', 'description', 'link', 'published', 'guid', 'file_size', 'payment_url'):
setattr(self, k, getattr(episode, k))
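is_duplicate() and duplicate_id() are dropped: they flagged episodes sharing the same (title, published) pair as duplicates even when their GUIDs differed, and the update code merged such pairs via update_from(). Only GUID-based matching survives this commit. A tiny sketch of the removed key, with hypothetical values:

# Two hypothetical episodes that differ only in GUID:
a = {'title': 'Episode 42', 'published': 1356705260, 'guid': 'guid-a'}
b = {'title': 'Episode 42', 'published': 1356705260, 'guid': 'guid-b'}

def duplicate_id(episode):
    # Same shape as the removed method: hash of (title, published).
    return hash((episode['title'], episode['published']))

print(duplicate_id(a) == duplicate_id(b))   # True -> previously treated as duplicates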
@@ -1009,70 +992,68 @@ class PodcastChannel(PodcastModelObject):
self.title = self.title[len(VIMEO_PREFIX):] + ' on Vimeo'
# End YouTube- and Vimeo-specific title FIX
def _consume_custom_feed(self, custom_feed, max_episodes=0):
self._consume_updated_title(custom_feed.get_title())
self.link = custom_feed.get_link()
self.description = custom_feed.get_description()
self.cover_url = custom_feed.get_image()
def _consume_metadata(self, title, link, description, cover_url,
payment_url):
self._consume_updated_title(title)
self.link = link
self.description = description
self.cover_url = cover_url
self.payment_url = payment_url
self.save()
def _consume_custom_feed(self, custom_feed, max_episodes=0):
self._consume_metadata(custom_feed.get_title(),
custom_feed.get_link(),
custom_feed.get_description(),
custom_feed.get_image(),
None)
existing = self.get_all_episodes()
existing_guids = [episode.guid for episode in existing]
assert self.children is not None
# Insert newly-found episodes into the database + local cache
new_episodes, seen_guids = custom_feed.get_new_episodes(self,
existing_guids)
new_episodes, seen_guids = custom_feed.get_new_episodes(self, existing_guids)
self.children.extend(new_episodes)
self.remove_unreachable_episodes(existing, seen_guids, max_episodes)
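_consume_metadata() is the new shared sink for channel metadata: both the custom-feed path and the feedparser path funnel title, link, description, cover art and payment URL through it, and the save() happens there. A runnable stand-in showing the call shape, with hypothetical values and a fake channel object:

class FakeChannel(object):
    # Minimal stand-in so the sketch runs on its own.
    def save(self):
        print('saved: %r' % vars(self))

def consume_metadata_sketch(channel, title, link, description, cover_url, payment_url):
    # Mirrors the new helper: assign all metadata fields, then persist once.
    channel.title = title            # the real code routes this through _consume_updated_title()
    channel.link = link
    channel.description = description
    channel.cover_url = cover_url
    channel.payment_url = payment_url
    channel.save()

consume_metadata_sketch(FakeChannel(), 'Example Podcast',
                        'http://example.com/feed', 'A show about examples',
                        'http://example.com/cover.jpg', None)   # custom feeds pass payment_url=None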
def _consume_updated_feed(self, feed, max_episodes=0):
#self.parse_error = feed.get('bozo_exception', None)
self._consume_updated_title(feed.feed.get('title', self.url))
self.link = feed.feed.get('link', self.link)
self.description = feed.feed.get('subtitle', self.description)
# read the flattr auto-url, if exists
payment_info = [link['href'] for link in feed.feed.get('links', [])
if link['rel'] == 'payment']
if payment_info:
self.payment_url = sorted(payment_info, key=get_payment_priority)[0]
# Cover art URL
if hasattr(feed.feed, 'image'):
for attribute in ('href', 'url'):
new_value = getattr(feed.feed.image, attribute, None)
if new_value is not None:
self.cover_url = new_value
cover_url = new_value
elif hasattr(feed.feed, 'icon'):
cover_url = feed.feed.icon
if hasattr(feed.feed, 'icon'):
self.cover_url = feed.feed.icon
# Payment URL (Flattr auto-payment) information
payment_info = [link['href'] for link in feed.feed.get('links', [])
if link['rel'] == 'payment']
if payment_info:
payment_url = sorted(payment_info, key=get_payment_priority)[0]
else:
payment_url = None
self.save()
self._consume_metadata(feed.feed.get('title', self.url),
feed.feed.get('link', self.link),
feed.feed.get('subtitle', self.description),
cover_url,
payment_url)
# Load all episodes to update them properly.
existing = self.get_all_episodes()
try:
# We have to sort the entries in descending chronological order,
# because if the feed lists items in ascending order and has >
# max_episodes old episodes, new episodes will not be shown.
# See also: gPodder Bug 1186
entries = sorted(feed.entries, key=feedcore.get_pubdate,
reverse=True)
except Exception, e:
logger.warn('Could not sort episodes: %s', e, exc_info=True)
entries = feed.entries
# We have to sort the entries in descending chronological order,
# because if the feed lists items in ascending order and has >
# max_episodes old episodes, new episodes will not be shown.
# See also: gPodder Bug 1186
entries = sorted(feed.entries, key=feedcore.get_pubdate, reverse=True)
# We can limit the maximum number of entries that gPodder will parse
if max_episodes > 0 and len(entries) > max_episodes:
entries = entries[:max_episodes]
# Title + PubDate hashes for existing episodes
existing_dupes = dict((e.duplicate_id(), e) for e in existing)
# GUID-based existing episode list
existing_guids = dict((e.guid, e) for e in existing)
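The try/except around the sort is gone, so entries are always ordered newest-first via feedcore.get_pubdate() before the max_episodes cap is applied; sorting first matters because a feed listed oldest-first with more than max_episodes old items would otherwise push the new ones past the cap (gPodder bug 1186). A small sketch of that ordering-then-capping, using plain integers in place of real pubdates:

# Hypothetical pubdates, listed oldest-first as some feeds do:
entries = [{'pubdate': 100}, {'pubdate': 200}, {'pubdate': 300}, {'pubdate': 400}]
max_episodes = 2

entries = sorted(entries, key=lambda e: e['pubdate'], reverse=True)
if max_episodes > 0 and len(entries) > max_episodes:
    entries = entries[:max_episodes]

print([e['pubdate'] for e in entries])   # [400, 300] -> the newest items survive the cap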
@@ -1087,27 +1068,20 @@ class PodcastChannel(PodcastModelObject):
# Search all entries for new episodes
for entry in entries:
try:
episode = self.EpisodeClass.from_feedparser_entry(entry, self)
if episode is not None:
if not episode.title:
logger.warn('Using filename as title for %s',
episode.url)
basename = os.path.basename(episode.url)
episode.title, ext = os.path.splitext(basename)
episode = self.EpisodeClass.from_feedparser_entry(entry, self)
if episode is not None:
if not episode.title:
logger.warn('Using filename as title for %s', episode.url)
basename = os.path.basename(episode.url)
episode.title, ext = os.path.splitext(basename)
# Maemo bug 12073
if not episode.guid:
logger.warn('Using download URL as GUID for %s',
episode.title)
episode.guid = episode.url
# Maemo bug 12073
if not episode.guid:
logger.warn('Using download URL as GUID for %s', episode.title)
episode.guid = episode.url
seen_guids.add(episode.guid)
except Exception, e:
logger.error('Skipping episode: %s', e, exc_info=True)
continue
if episode is None:
seen_guids.add(episode.guid)
else:
continue
# Detect (and update) existing episode based on GUIDs
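The loop body is now wrapped in a single try/except, so a failure while building one episode only skips that entry; within it, a missing title falls back to the filename part of the download URL and a missing GUID falls back to the URL itself (Maemo bug 12073). A minimal sketch of the filename fallback, on a hypothetical URL:

import os

url = 'http://example.com/pod/episode-42.mp3'   # hypothetical download URL
basename = os.path.basename(url)                # 'episode-42.mp3'
title, ext = os.path.splitext(basename)         # ('episode-42', '.mp3')
print(title)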
@@ -1117,20 +1091,6 @@ class PodcastChannel(PodcastModelObject):
existing_episode.save()
continue
# Detect (and update) existing episode based on duplicate ID
existing_episode = existing_dupes.get(episode.duplicate_id(), None)
if existing_episode:
if existing_episode.is_duplicate(episode):
existing_episode.update_from(episode)
existing_episode.save()
continue
new_episodes += 1
# Only allow a certain number of new episodes per update
if (self.download_strategy == PodcastChannel.STRATEGY_LATEST and
new_episodes > 1):
episode.is_new = False
# Workaround for bug 340: If the episode has been
# published earlier than one week before the most
# recent existing episode, do not mark it as new.
@@ -1138,12 +1098,16 @@
logger.debug('Episode with old date: %s', episode.title)
episode.is_new = False
episode.save()
if episode.is_new:
new_episodes += 1
# This episode is new - if we already loaded the
# list of "children" episodes, add it to this list
if self.children is not None:
self.children.append(episode)
# Only allow a certain number of new episodes per update
if (self.download_strategy == PodcastChannel.STRATEGY_LATEST and
new_episodes > 1):
episode.is_new = False
episode.save()
self.children.append(episode)
self.remove_unreachable_episodes(existing, seen_guids, max_episodes)
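The bookkeeping at the end of the loop is reordered: new_episodes is only counted for episodes still marked new after the old-date check, the STRATEGY_LATEST rule (at most one new episode per update) is applied at that point, and the episode is appended to self.children without the previous None guard. A compact sketch of the STRATEGY_LATEST accounting, with hypothetical flags:

# Hypothetical episodes processed newest-first; all initially marked new:
STRATEGY_LATEST = True          # stand-in for the download_strategy check
new_episodes = 0

for is_new in [True, True, True]:
    if is_new:
        new_episodes += 1
        # Only allow a certain number of new episodes per update
        if STRATEGY_LATEST and new_episodes > 1:
            is_new = False
    print(is_new)   # True, False, False -> only the newest stays flagged as new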