#This file is part of html2plaintext. The COPYRIGHT file at the top level of
#this repository contains the full copyright notices and license terms.
import re
from BeautifulSoup import BeautifulSoup, SoupStrainer
import htmlentitydefs
def html2plaintext(html, body_id=None, encoding='ascii'):
"""Convert the HTML to plain text.

html: the markup to convert.
body_id: when not None, a SoupStrainer(id=body_id) is built for it.
encoding: not referenced in the visible part of this function --
    presumably handed to the HTML parser; TODO confirm.

Returns the text with every remaining '<...>' tag replaced by a space,
each line left-stripped, a numbered '[n] url' list appended for the
entries of url_index, and the result passed through unescape().
"""
urls = []
# Build a strainer matching id=body_id when a body id was given.
if body_id is not None:
strainer = SoupStrainer(id=body_id)
else:
# NOTE(review): the text from the next line onwards looks truncated -- the
# string literal opened there is never closed, and the code that defines
# url_index (used further down) is not visible. Confirm against upstream.
if html.count('
', '*').replace('', '*')
# Swap markup for plain-text markers ('*', '**', '/').
# NOTE(review): the search strings on these lines are empty or broken across
# lines; the tag names appear to have been stripped -- restore from upstream.
html = html.replace('', '*').replace('', '*')
html = html.replace('', '*').replace('
', '*')
html = html.replace('', '**').replace('
', '**')
html = html.replace('', '**').replace('
', '**')
html = html.replace('', '/').replace('', '/')
# Flatten the existing newlines to spaces before the replacements below
# insert '\n' explicitly.
html = html.replace('\n', ' ')
html = html.replace('
', '\n')
html = html.replace(' ', ' ')
html = html.replace('', '\n\n')
html = re.sub('
', '\n', html)
# Collapse double spaces into one (single pass).
html = html.replace(' ' * 2, ' ')
# Replacement callback: every tag still present becomes a single space.
def desperate_fixer(g):
return ' '
html = re.sub('<.*?>', desperate_fixer, html)
html = u'\n'.join([x.lstrip() for x in html.splitlines()]) # lstrip lines
# Append the URLs as a numbered list, separated from the body by a blank line.
for i, url in enumerate(url_index):
if i == 0:
html += u'\n\n'
html += u'[%s] %s\n' % (i + 1, url)
# Entity handling runs last, on the already tag-free text.
html = unescape(html)
return html
def unescape(text):
    """Replace HTML character references and named entities in *text*.

    Handles decimal references (``&#65;``), hexadecimal references
    (``&#x41;``) and named entities (``&amp;``).  Anything that cannot be
    decoded (unknown name, malformed or out-of-range number) is left in the
    text unchanged.

    text: the string to process.
    Returns the string with every decodable reference replaced by the
    character it denotes.
    """
    # Resolve the entity table and the code-point-to-character function
    # locally so the function works on both Python 2 and Python 3.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference: hex when prefixed with "&#x".
            try:
                if ref[:3] == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or not a valid code point: keep the text.
                pass
        else:
            # Named entity: strip the leading "&" and trailing ";".
            try:
                return to_char(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    # "&", an optional "#", one or more word characters, then ";".
    return re.sub(r"&#?\w+;", fixup, text)