schema v7: update description and description_html fields

1. initialize description_html from description when description used to contain HTML
2. strip HTML tags from description
3. reset each podcast's HTTP cache validators (http_last_modified, http_etag) to force a re-fetch of the feed and refresh description/description_html
This commit is contained in:
Eric Le Lay 2017-04-17 19:19:25 +02:00 committed by Adam Voss
parent 1fa31c39c2
commit 4ab407007e
2 changed files with 21 additions and 0 deletions

View File

@ -28,6 +28,8 @@ import shutil
import logging
logger = logging.getLogger(__name__)
from gpodder import util
EpisodeColumns = (
'podcast_id',
'title',
@ -108,6 +110,9 @@ UPGRADE_SQL = [
# Version 7: Add HTML description
(6, 7, """
ALTER TABLE episode ADD COLUMN description_html TEXT NOT NULL DEFAULT ''
UPDATE episode SET description_html=description WHERE is_html(description)
UPDATE episode SET description=remove_html_tags(description_html) WHERE is_html(description)
UPDATE podcast SET http_last_modified=NULL, http_etag=NULL
"""),
]
@ -194,6 +199,9 @@ def upgrade(db, filename):
initialize_database(db)
return
db.create_function('is_html', 1, util.is_html)
db.create_function('remove_html_tags', 1, util.remove_html_tags)
version = db.execute('SELECT version FROM version').fetchone()[0]
if version == CURRENT_VERSION:
return

View File

@ -595,6 +595,19 @@ def delete_file(filename):
pass
# Matches an opening or self-closing tag: "<name>", "<name/>" or
# "<name attributes...>".  DOTALL lets the attribute part wrap over several
# lines; the lazy ".*?" stops at the first ">" instead of scanning to the
# last one.  Compiled once because is_html() runs once per row when it is
# used as an SQL function.
_HTML_OPEN_TAG_RE = re.compile(r'<[a-z][a-z0-9]*(?:\s.*?>|/?>)',
                               re.IGNORECASE | re.DOTALL)


def is_html(text):
    """Heuristically tell if text is HTML

    By looking for an open tag (more or less:)
    >>> is_html('<h1>HELLO</h1>')
    True
    >>> is_html('a < b < c')
    False
    >>> is_html('first line<br/>second line')
    True
    >>> is_html('')
    False

    text may be None or empty, in which case False is returned.
    Returns True if something that looks like an opening HTML tag is
    found anywhere in text, False otherwise.
    """
    # None (e.g. a NULL value) and '' cannot contain a tag; answer False
    # instead of letting the regex engine raise TypeError on None.
    if not text:
        return False
    return _HTML_OPEN_TAG_RE.search(text) is not None
def remove_html_tags(html):
"""
Remove HTML tags from a string and replace numeric and