Thu, 20 Mar 2008 11:07:48 +0100 <thp@perli.net>

Performance-improving patch from Nick (nikosapi) for HTML entity replacement

	* src/gpodder/util.py: Merge patch from Nick (nikosapi.org) that greatly
	improves the performance of the HTML tag stripper/entity replacement
	code by using regular expressions and a smarter conversion of numeric
	entities to Unicode characters



git-svn-id: svn://svn.berlios.de/gpodder/trunk@618 b0d088ad-0a06-0410-aad2-9ed5178a7e87
This commit is contained in:
Thomas Perl 2008-03-20 10:08:58 +00:00
parent 5b69a14181
commit 7012d63894
2 changed files with 26 additions and 14 deletions

View File

@ -1,3 +1,11 @@
Thu, 20 Mar 2008 11:07:48 +0100 <thp@perli.net>
Performance-improving patch from Nick (nikosapi) to html entity replacement
* src/gpodder/util.py: Merge patch from Nick (nikosapi.org) to really
improve performance of the HTML tag stripper/entity replacement code
by using regular expressions and some more intelligent code for
converting numeric entities to Unicode characters
Thu, 20 Mar 2008 11:06:32 +0100 <thp@perli.net>
Add demo code for displaying a splash screen

View File

@ -42,7 +42,7 @@ import stat
import re
import subprocess
import htmlentitydefs
from htmlentitydefs import entitydefs
import time
import locale
import gzip
@ -300,24 +300,28 @@ def delete_file( path):
pass
def remove_html_tags( html):
def remove_html_tags(html):
"""
Remove HTML tags from a string and replace numeric and
named entities with the corresponding character, so the
HTML text can be displayed in a simple text view.
"""
# strips html from a string (fix for <description> tags containing html)
rexp = re.compile( "<[^>]*>")
stripstr = rexp.sub( '', html)
# replaces numeric entities with entity names
dict = htmlentitydefs.codepoint2name
for key in dict.keys():
stripstr = stripstr.replace( '&#'+str(key)+';', '&'+unicode( dict[key], 'iso-8859-1')+';')
# strips html entities
dict = htmlentitydefs.entitydefs
for key in dict.keys():
stripstr = stripstr.replace( '&'+unicode(key,'iso-8859-1')+';', unicode(dict[key], 'iso-8859-1'))
return stripstr
# If we wanted more speed, we could make these module-level globals
re_strip_tags = re.compile('<[^>]*>')
re_unicode_entities = re.compile('&#(\d{2,4});')
re_html_entities = re.compile('&(.{2,8});')
# Remove all HTML/XML tags from the string
result = re_strip_tags.sub('', html)
# Convert numeric XML entities to their unicode character
result = re_unicode_entities.sub(lambda x: unichr(int(x.group(1))), result)
# Convert named HTML entities to their unicode character
result = re_html_entities.sub(lambda x: unicode(entitydefs.get(x.group(1),''), 'iso-8859-1'), result)
return result
def torrent_filename( filename):