Auto-discover RSS/Atom feeds from HTML pages (bug 201)
For text/html content types, try to discover the correct RSS/Atom feed URL by looking at the <link> meta elements and getting the feed URL from that. If that fails, or if there are no <link> meta elements, make sure that the UI code can display the "This URL is a website" message box that will allow the user to open a web browser window to search for the feed URL manually. Based on a patch by Justin Forest.
This commit is contained in:
parent
ad90b685c3
commit
097ab04f62
3 changed files with 47 additions and 2 deletions
|
@ -73,6 +73,10 @@ class Cache:
|
|||
caching.
|
||||
"""
|
||||
|
||||
# Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
|
||||
SUPPORTED_FEED_TYPES = ('application/rss+xml', 'application/atom+xml',
|
||||
'application/rdf+xml', 'application/xml', 'text/xml')
|
||||
|
||||
def __init__(self, timeToLiveSeconds=3600):
|
||||
"""
|
||||
Arguments:
|
||||
|
@ -112,6 +116,32 @@ class Cache:
|
|||
etag=etag,
|
||||
)
|
||||
|
||||
content_type = parsed_result.headers.get('content-type', '').lower()
|
||||
# TODO: Also detect OPML feeds and other content types here
|
||||
if content_type.startswith('text/html'):
|
||||
log('%s looks like a webpage - trying feed autodiscovery.', url, sender=self)
|
||||
if not hasattr(parsed_result.feed, 'links'):
|
||||
return (False, None)
|
||||
try:
|
||||
found_alternate_feed = False
|
||||
for link in parsed_result.feed.links:
|
||||
if hasattr(link, 'type') and hasattr(link, 'href') and hasattr(link, 'rel'):
|
||||
if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
|
||||
log('Found alternate feed link: %s', link.href, sender=self)
|
||||
parsed_result = feedparser.parse(link.href,
|
||||
agent=self.user_agent,
|
||||
modified=modified,
|
||||
etag=etag,
|
||||
)
|
||||
found_alternate_feed = True
|
||||
break
|
||||
|
||||
# We have not found a valid feed - abort here!
|
||||
if not found_alternate_feed:
|
||||
return (False, None)
|
||||
except:
|
||||
log('Error while trying to get feed URL from webpage', sender=self, traceback=True)
|
||||
|
||||
updated = False
|
||||
status = parsed_result.get('status', None)
|
||||
|
||||
|
|
|
@ -95,7 +95,8 @@ class podcastChannel(object):
|
|||
return tmp[0]
|
||||
elif create:
|
||||
tmp = podcastChannel(url)
|
||||
tmp.update()
|
||||
if not tmp.update():
|
||||
return None
|
||||
tmp.save()
|
||||
db.force_last_new(tmp)
|
||||
return tmp
|
||||
|
@ -111,6 +112,13 @@ class podcastChannel(object):
|
|||
def update(self):
|
||||
(updated, c) = self.fc.fetch(self.url, self)
|
||||
|
||||
if c is None:
|
||||
return False
|
||||
|
||||
if self.url != c.url:
|
||||
log('Updating channel URL from %s to %s', self.url, c.url, sender=self)
|
||||
self.url = c.url
|
||||
|
||||
# update the cover if it's not there
|
||||
self.update_cover()
|
||||
|
||||
|
@ -118,7 +126,7 @@ class podcastChannel(object):
|
|||
# feedcache says the feed hasn't changed, return old
|
||||
if not updated:
|
||||
log('Channel %s is up to date', self.url)
|
||||
return
|
||||
return True
|
||||
|
||||
# Save etag and last-modified for later reuse
|
||||
if c.headers.get('etag'):
|
||||
|
@ -168,6 +176,7 @@ class podcastChannel(object):
|
|||
|
||||
# Now we can flush the updates.
|
||||
db.commit()
|
||||
return True
|
||||
|
||||
def update_cover(self, force=False):
|
||||
if self.cover_file is None or not os.path.exists(self.cover_file) or force:
|
||||
|
|
|
@ -112,11 +112,17 @@ def normalize_feed_url( url):
|
|||
This will also normalize feed:// and itpc:// to http://
|
||||
Also supported are phobos.apple.com links (iTunes podcast)
|
||||
and itms:// links (iTunes podcast direct link).
|
||||
|
||||
If no URL scheme is defined (e.g. "curry.com"), we will
|
||||
simply assume the user intends to add a http:// feed.
|
||||
"""
|
||||
|
||||
if not url or len( url) < 8:
|
||||
return None
|
||||
|
||||
if not '://' in url:
|
||||
url = 'http://' + url
|
||||
|
||||
if url.startswith('itms://'):
|
||||
url = parse_itunes_xml(url)
|
||||
|
||||
|
|
Loading…
Reference in a new issue