Better file extension guessing for URLs

git-svn-id: svn://svn.berlios.de/gpodder/trunk@461 b0d088ad-0a06-0410-aad2-9ed5178a7e87
This commit is contained in:
Thomas Perl 2007-11-11 13:29:02 +00:00
parent 5da37feed7
commit 9069d10158
2 changed files with 38 additions and 3 deletions

View File

@ -1,3 +1,12 @@
Sun, 11 Nov 2007 14:24:17 +0100 <thp@perli.net>
Better file extension guessing for URLs
* src/gpodder/util.py: Improve file_extension_from_url() by adding
additional checks for known good extensions and recurse into the query
string if it looks like a URL; this should fix compatibility problems
for feeds with strange URLs; should provide more reliable guessing;
thanks to Nicolas Quienot <niqooo@gmail.com> for the bug report
Fri, 09 Nov 2007 10:05:36 +0100 <thp@perli.net>
Fix format_filesize() usage in episodes selector and podcastItem

View File

@ -245,10 +245,34 @@ def file_extension_from_url( url):
Extracts the (lowercase) file name extension (with dot)
from a URL, e.g. http://server.com/file.MP3?download=yes
will result in the string ".mp3" being returned.
This function will also try to best-guess the "real"
extension for a media file (audio, video, torrent) by
trying to match an extension to these types and recurse
into the query string to find better matches, if the
original extension does not resolve to a known type.
http://my.net/redirect.php?my.net/file.ogg => ".ogg"
http://server/get.jsp?file=/episode0815.MOV => ".mov"
"""
path = urlparse.urlparse( url)[2]
filename = urllib.unquote( os.path.basename( path))
return os.path.splitext( filename)[1].lower()
(scheme, netloc, path, para, query, fragid) = urlparse.urlparse(url)
filename = os.path.basename( urllib.unquote(path))
(filename, extension) = os.path.splitext(filename)
if file_type_by_extension(extension) != None:
# We have found a valid extension (audio, video, torrent)
return extension.lower()
# If the query string looks like a possible URL, try that first
if len(query.strip()) > 0 and query.find('/') != -1:
query_url = '://'.join((scheme, urllib.unquote(query)))
query_extension = file_extension_from_url(query_url)
if file_type_by_extension(query_extension) != None:
return query_extension
# No exact match found, simply return the original extension
return extension.lower()
def file_type_by_extension( extension):
@ -270,6 +294,8 @@ def file_type_by_extension( extension):
if extension[0] == '.':
extension = extension[1:]
extension = extension.lower()
for type in types:
if extension in types[type]:
return type