#This file is part of html2plaintext.  The COPYRIGHT file at the top level of
#this repository contains the full copyright notices and license terms.
import re
from BeautifulSoup import BeautifulSoup, SoupStrainer
import htmlentitydefs


def html2plaintext(html, body_id=None, encoding='ascii'):
    """Convert the HTML to plain text"""
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        if html.count('<body'):
            strainer = SoupStrainer('body')
        else:
            strainer = None

    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)

    # Collect every link so it can later be replaced by a numbered reference
    for link in soup.findAll('a'):
        title = link.renderContents()
        for url in [x[1] for x in link.attrs if x[0] == 'href']:
            urls.append(dict(url=url, tag=str(link), title=title))

    html = soup.__str__()

    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or 'http://' + d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    # Map common inline markup onto plain-text conventions
    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')

    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')

    # The only line breaks kept are the ones produced by <br> and </p>
    html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    html = html.replace('&nbsp;', ' ')
    html = html.replace('</p>', '\n\n')
    html = re.sub(r'<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')

    # Any tag that was not handled above is simply replaced by a space
    def desperate_fixer(g):
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    html = u'\n'.join([x.lstrip() for x in html.splitlines()])  # lstrip lines

    # Append the collected URLs as numbered footnotes
    for i, url in enumerate(url_index):
        if i == 0:
            html += u'\n\n'
        html += u'[%s] %s\n' % (i + 1, url)

    html = unescape(html)

    return html


def unescape(text):
    """Replace HTML/XML character references and named entities in *text*
    with the corresponding unicode characters."""
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)
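

# Minimal usage sketch.  The sample markup below is illustrative only (an
# assumption, not taken from this repository's tests).
if __name__ == '__main__':
    sample = ('<html><body><h1>Title</h1>'
              '<p>Hello <b>world</b>, see '
              '<a href="http://example.com/">the example site</a>.</p>'
              '</body></html>')
    # Prints the plain-text version: the heading and bold text wrapped in
    # asterisks, and the link collected as a numbered footnote at the end.
    print html2plaintext(sample)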