Thu, 20 Mar 2008 11:07:48 +0100 <thp@perli.net>
Performance-improving patch from Nick (nikosapi) to html entity replacement

* src/gpodder/util.py: Merge patch from Nick (nikosapi.org) to really improve performance of the HTML tag stripper/entity replacement code by using regular expressions and some more intelligent code for converting numeric entities to Unicode characters

git-svn-id: svn://svn.berlios.de/gpodder/trunk@618 b0d088ad-0a06-0410-aad2-9ed5178a7e87
This commit is contained in:
parent
5b69a14181
commit
7012d63894
|
@ -1,3 +1,11 @@
|
|||
Thu, 20 Mar 2008 11:07:48 +0100 <thp@perli.net>
|
||||
Performance-improving patch from Nick (nikosapi) to html entity replacement
|
||||
|
||||
* src/gpodder/util.py: Merge patch from Nick (nikosapi.org) to really
|
||||
improve performance of the HTML tag stripper/entity replacement code
|
||||
by using regular expressions and some more intelligent code for
|
||||
converting numeric entities to Unicode characters
|
||||
|
||||
Thu, 20 Mar 2008 11:06:32 +0100 <thp@perli.net>
|
||||
Add demo code for displaying a splash screen
|
||||
|
||||
|
|
|
@ -42,7 +42,7 @@ import stat
|
|||
|
||||
import re
|
||||
import subprocess
|
||||
import htmlentitydefs
|
||||
from htmlentitydefs import entitydefs
|
||||
import time
|
||||
import locale
|
||||
import gzip
|
||||
|
@ -300,24 +300,28 @@ def delete_file( path):
|
|||
pass
|
||||
|
||||
|
||||
def remove_html_tags( html):
|
||||
|
||||
def remove_html_tags(html):
|
||||
"""
|
||||
Remove HTML tags from a string and replace numeric and
|
||||
named entities with the corresponding character, so the
|
||||
HTML text can be displayed in a simple text view.
|
||||
"""
|
||||
# strips html from a string (fix for <description> tags containing html)
|
||||
rexp = re.compile( "<[^>]*>")
|
||||
stripstr = rexp.sub( '', html)
|
||||
# replaces numeric entities with entity names
|
||||
dict = htmlentitydefs.codepoint2name
|
||||
for key in dict.keys():
|
||||
stripstr = stripstr.replace( '&#'+str(key)+';', '&'+unicode( dict[key], 'iso-8859-1')+';')
|
||||
# strips html entities
|
||||
dict = htmlentitydefs.entitydefs
|
||||
for key in dict.keys():
|
||||
stripstr = stripstr.replace( '&'+unicode(key,'iso-8859-1')+';', unicode(dict[key], 'iso-8859-1'))
|
||||
return stripstr
|
||||
# If we wanted more speed, we could make these global
|
||||
re_strip_tags = re.compile('<[^>]*>')
|
||||
re_unicode_entities = re.compile('&#(\d{2,4});')
|
||||
re_html_entities = re.compile('&(.{2,8});')
|
||||
|
||||
# Remove all HTML/XML tags from the string
|
||||
result = re_strip_tags.sub('', html)
|
||||
|
||||
# Convert numeric XML entities to their unicode character
|
||||
result = re_unicode_entities.sub(lambda x: unichr(int(x.group(1))), result)
|
||||
|
||||
# Convert named HTML entities to their unicode character
|
||||
result = re_html_entities.sub(lambda x: unicode(entitydefs.get(x.group(1),''), 'iso-8859-1'), result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def torrent_filename( filename):
|
||||
|
|
Loading…
Reference in New Issue