Merge pull request #277 from vossad01/html_description-migration

Data migration of `description` column during migration to v7 schema
This commit is contained in:
Adam Voss 2017-04-18 10:31:14 -05:00 committed by GitHub
commit f14c465d81
3 changed files with 24 additions and 3 deletions

View File

@ -147,11 +147,11 @@ class PodcastEpisode(PodcastModelObject):
episode.description = entry['description']
if entry.get('description_html'):
episode.description_html = entry['description_html']
# XXX: That's not a very well-informed heuristic to check
# if the description already contains HTML. Better ideas?
# TODO: This really should be handled in podcastparser and not here.
elif '<' in entry['description']:
elif util.is_html(entry['description']):
episode.description_html = entry['description']
episode.description = util.remove_html_tags(entry['description'])
episode.total_time = entry['total_time']
episode.published = entry['published']
episode.payment_url = entry['payment_url']

View File

@ -28,6 +28,8 @@ import shutil
import logging
logger = logging.getLogger(__name__)
from gpodder import util
EpisodeColumns = (
'podcast_id',
'title',
@ -108,6 +110,9 @@ UPGRADE_SQL = [
# Version 7: Add HTML description
(6, 7, """
ALTER TABLE episode ADD COLUMN description_html TEXT NOT NULL DEFAULT ''
UPDATE episode SET description_html=description WHERE is_html(description)
UPDATE episode SET description=remove_html_tags(description_html) WHERE is_html(description)
UPDATE podcast SET http_last_modified=NULL, http_etag=NULL
"""),
]
@ -194,6 +199,9 @@ def upgrade(db, filename):
initialize_database(db)
return
db.create_function('is_html', 1, util.is_html)
db.create_function('remove_html_tags', 1, util.remove_html_tags)
version = db.execute('SELECT version FROM version').fetchone()[0]
if version == CURRENT_VERSION:
return

View File

@ -595,6 +595,19 @@ def delete_file(filename):
pass
# Compiled once at import time: is_html() is also registered as an SQLite
# user function and is then invoked once per row during schema upgrades.
# Matches "<name>" / "<name/>" or "<name" followed by whitespace and,
# eventually, a closing ">" (DOTALL lets attributes span several lines).
_HTML_OPEN_TAG_RE = re.compile(r'<[a-z][a-z0-9]*(?:\s.*?>|\/?>)',
                               re.IGNORECASE | re.DOTALL)


def is_html(text):
    """Heuristically tell whether text contains HTML markup

    The check looks for something resembling an opening tag: a "<"
    immediately followed by a tag name, then either ">" / "/>" or
    whitespace with a ">" somewhere after it. A bare "<" used as a
    comparison operator does not count.

    text: the string to inspect; None or an empty string is treated
          as "not HTML" (SQLite passes NULL column values as None)

    Returns True if an opening tag was found, False otherwise.

    >>> is_html('<h1>HELLO</h1>')
    True
    >>> is_html('a < b < c')
    False
    >>> is_html(None)
    False
    """
    if not text:
        # Nothing to search; also avoids a TypeError from re on None
        return False

    return bool(_HTML_OPEN_TAG_RE.search(text))
def remove_html_tags(html):
"""
Remove HTML tags from a string and replace numeric and