Merge pull request #277 from vossad01/html_description-migration

Data migration of `description` column during migration to v7 schema
2017-04-18 10:31:14 -05:00 · 2017-04-18 10:31:14 -05:00 · f14c465d81
parent 1fa31c39c2 f456365a14
commit f14c465d81
3 changed files with 24 additions and 3 deletions
--- a/src/gpodder/model.py
+++ b/src/gpodder/model.py
@ -147,11 +147,11 @@ class PodcastEpisode(PodcastModelObject):
        episode.description = entry['description']
        if entry.get('description_html'):
            episode.description_html = entry['description_html']
-        # XXX: That's not a very well-informed heuristic to check
-        # if the description already contains HTML. Better ideas?
        # TODO: This really should be handled in podcastparser and not here.
-        elif '<' in entry['description']:
+        elif util.is_html(entry['description']):
            episode.description_html = entry['description']
+            episode.description = util.remove_html_tags(entry['description'])
+
        episode.total_time = entry['total_time']
        episode.published = entry['published']
        episode.payment_url = entry['payment_url']
--- a/src/gpodder/schema.py
+++ b/src/gpodder/schema.py
@ -28,6 +28,8 @@ import shutil
 import logging
 logger = logging.getLogger(__name__)

+from gpodder import util
+
 EpisodeColumns = (
    'podcast_id',
    'title',
@ -108,6 +110,9 @@ UPGRADE_SQL = [
        # Version 7: Add HTML description
        (6, 7, """
        ALTER TABLE episode ADD COLUMN description_html TEXT NOT NULL DEFAULT ''
+        UPDATE episode SET description_html=description WHERE is_html(description)
+        UPDATE episode SET description=remove_html_tags(description_html) WHERE is_html(description)
+        UPDATE podcast SET http_last_modified=NULL, http_etag=NULL
        """),
 ]

@ -194,6 +199,9 @@ def upgrade(db, filename):
        initialize_database(db)
        return

+    db.create_function('is_html', 1, util.is_html)
+    db.create_function('remove_html_tags', 1, util.remove_html_tags)
+
    version = db.execute('SELECT version FROM version').fetchone()[0]
    if version == CURRENT_VERSION:
        return
--- a/src/gpodder/util.py
+++ b/src/gpodder/util.py
@ -595,6 +595,19 @@ def delete_file(filename):
        pass


+# Compiled once at import time. Matches something shaped like an opening
+# tag: "<", a tag name, then either whitespace followed (lazily) by the
+# next ">", or an optional "/" directly followed by ">".
+# Raw string: "\s" and "\/" are regex escapes, not string escapes.
+_HTML_TAG_RE = re.compile(r'<[a-z][a-z0-9]*(?:\s.*?>|\/?>)', re.IGNORECASE | re.DOTALL)
+
+
+def is_html(text):
+    """Heuristically tell whether text contains HTML markup.
+
+    The check only looks for something shaped like an opening tag, so a
+    bare "<" in plain text does not count:
+    >>> is_html('<h1>HELLO</h1>')
+    True
+    >>> is_html('a < b < c')
+    False
+
+    Missing text is never HTML:
+    >>> is_html(None)
+    False
+    """
+    if text is None:
+        # schema.upgrade() registers this as a SQL function via
+        # db.create_function(); a NULL column value arrives here as None.
+        return False
+    return bool(_HTML_TAG_RE.search(text))
+
+
 def remove_html_tags(html):
    """
    Remove HTML tags from a string and replace numeric and