Auto-discover RSS/Atom feeds from HTML pages (bug 201)

For text/html content types, try to discover the correct
RSS/Atom feed URL by looking at the <link> meta elements
and getting the feed URL from them. If that fails, or if
there are no <link> meta elements, make sure that the UI
code can display the "This URL is a website" message box
that will allow the user to open a web browser window to
search for the feed URL manually.

Based on a patch by Justin Forest.
This commit is contained in:
Thomas Perl 2008-09-30 22:07:06 +02:00
parent ad90b685c3
commit 097ab04f62
3 changed files with 47 additions and 2 deletions

View File

@ -73,6 +73,10 @@ class Cache:
caching.
"""
# Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
SUPPORTED_FEED_TYPES = ('application/rss+xml', 'application/atom+xml',
'application/rdf+xml', 'application/xml', 'text/xml')
def __init__(self, timeToLiveSeconds=3600):
"""
Arguments:
@ -112,6 +116,32 @@ class Cache:
etag=etag,
)
content_type = parsed_result.headers.get('content-type', '').lower()
# TODO: Also detect OPML feeds and other content types here
if content_type.startswith('text/html'):
log('%s looks like a webpage - trying feed autodiscovery.', url, sender=self)
if not hasattr(parsed_result.feed, 'links'):
return (False, None)
try:
found_alternate_feed = False
for link in parsed_result.feed.links:
if hasattr(link, 'type') and hasattr(link, 'href') and hasattr(link, 'rel'):
if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
log('Found alternate feed link: %s', link.href, sender=self)
parsed_result = feedparser.parse(link.href,
agent=self.user_agent,
modified=modified,
etag=etag,
)
found_alternate_feed = True
break
# We have not found a valid feed - abort here!
if not found_alternate_feed:
return (False, None)
except:
log('Error while trying to get feed URL from webpage', sender=self, traceback=True)
updated = False
status = parsed_result.get('status', None)

View File

@ -95,7 +95,8 @@ class podcastChannel(object):
return tmp[0]
elif create:
tmp = podcastChannel(url)
tmp.update()
if not tmp.update():
return None
tmp.save()
db.force_last_new(tmp)
return tmp
@ -111,6 +112,13 @@ class podcastChannel(object):
def update(self):
(updated, c) = self.fc.fetch(self.url, self)
if c is None:
return False
if self.url != c.url:
log('Updating channel URL from %s to %s', self.url, c.url, sender=self)
self.url = c.url
# update the cover if it's not there
self.update_cover()
@ -118,7 +126,7 @@ class podcastChannel(object):
# feedcache says the feed hasn't changed, return old
if not updated:
log('Channel %s is up to date', self.url)
return
return True
# Save etag and last-modified for later reuse
if c.headers.get('etag'):
@ -168,6 +176,7 @@ class podcastChannel(object):
# Now we can flush the updates.
db.commit()
return True
def update_cover(self, force=False):
if self.cover_file is None or not os.path.exists(self.cover_file) or force:

View File

@ -112,11 +112,17 @@ def normalize_feed_url( url):
This will also normalize feed:// and itpc:// to http://
Also supported are phobos.apple.com links (iTunes podcast)
and itms:// links (iTunes podcast direct link).
If no URL scheme is defined (e.g. "curry.com"), we will
simply assume the user intends to add a http:// feed.
"""
if not url or len( url) < 8:
return None
if not '://' in url:
url = 'http://' + url
if url.startswith('itms://'):
url = parse_itunes_xml(url)