Initial work on statistics-based update optimizations

For feeds that have been updated within the last 24 hours, we
now try to intelligently skip feed updates based on the
expected publishing date of the next episode of each podcast.

In addition, when the last feed update was less than 10 minutes
ago, the update is skipped as well.
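
Below is a condensed sketch of the skip decision described above
(not part of the commit itself); the attribute names
updated_timestamp and release_expected are taken from the
model.py hunk further down, and all timestamps are Unix seconds:

import time

def should_skip_update(channel):
    # Sketch only: mirrors the heuristic added to PodcastChannel.update()
    # below; 'channel' is assumed to expose the new updated_timestamp and
    # release_expected attributes (both Unix timestamps).
    now = time.time()
    if channel.updated_timestamp > now - 60*60*24:
        # Updated within the last 24 hours, so trust the statistics
        if channel.release_expected > now:
            return True   # the next episode is not expected yet
        if channel.updated_timestamp > now - 60*10:
            return True   # the last fetch is less than 10 minutes old
    return False
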
Thomas Perl 2009-06-12 02:44:04 +02:00
parent 4c7088dfa3
commit a6b0187797
4 changed files with 132 additions and 5 deletions

src/gpodder/corestats.py (new file, +79 lines)

@@ -0,0 +1,79 @@
#
# corestats.py (COREy STATS)
# Copyright (c) 2006-2007, Corey Goldberg (corey@goldb.org)
# http://www.goldb.org/corestats.html
#
# statistical calculation class
# for processing numeric sequences
#
# license: GNU LGPL
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# Slightly modified for use in gPodder
#
class Stats(object):

    def __init__(self, sequence):
        # sequence of numbers we will process
        # convert all items to floats for numerical processing
        self.sequence = [float(item) for item in sequence]

    def sum(self):
        if len(self.sequence) < 1:
            return None
        else:
            return sum(self.sequence)

    def count(self):
        return len(self.sequence)

    def min(self):
        if len(self.sequence) < 1:
            return None
        else:
            return min(self.sequence)

    def max(self):
        if len(self.sequence) < 1:
            return None
        else:
            return max(self.sequence)

    def avg(self):
        if len(self.sequence) < 1:
            return None
        else:
            return sum(self.sequence) / len(self.sequence)

    def median(self):
        if len(self.sequence) < 1:
            return None
        else:
            self.sequence.sort()
            return self.sequence[len(self.sequence) // 2]

    def stdev(self):
        if len(self.sequence) < 1:
            return None
        else:
            avg = self.avg()
            sdsq = sum([(i - avg) ** 2 for i in self.sequence])
            stdev = (sdsq / (len(self.sequence) - 1)) ** .5
            return stdev

    def percentile(self, percentile):
        if len(self.sequence) < 1:
            value = None
        elif percentile >= 100:
            raise ValueError('percentile must be < 100')
        else:
            element_idx = int(len(self.sequence) * (percentile / 100.0))
            self.sequence.sort()
            value = self.sequence[element_idx]
        return value
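
As a quick usage illustration (not part of the commit), the class can be
fed the gaps between episode releases; the interval values below are
invented:

from gpodder.corestats import Stats

# Invented gaps between episode releases, in seconds (roughly 6-8 days)
intervals = [604800, 518400, 691200, 604800]

stats = Stats(intervals)
print stats.avg()     # mean interval
print stats.stdev()   # sample standard deviation
print stats.median()  # middle value (note: sorts the sequence in place)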


@@ -174,7 +174,10 @@ class Storage(object):
("deleted", "INTEGER"),
("channel_is_locked", "INTEGER"),
("foldername", "TEXT"),
("auto_foldername", "INTEGER")
("auto_foldername", "INTEGER"),
("release_expected", "INTEGER"),
("release_deviation", "INTEGER"),
("updated_timestamp", "INTEGER"),
))
self.upgrade_table("episodes", (
@@ -276,7 +279,10 @@ class Storage(object):
etag,
channel_is_locked,
foldername,
auto_foldername
auto_foldername,
release_expected,
release_deviation,
updated_timestamp
FROM
channels
WHERE
@@ -307,6 +313,9 @@ class Storage(object):
'channel_is_locked': row[14],
'foldername': row[15],
'auto_foldername': row[16],
'release_expected': row[17],
'release_deviation': row[18],
'updated_timestamp': row[19],
}
if row[0] in stats:
@@ -360,10 +369,10 @@ class Storage(object):
self.log("save_channel((%s)%s)", c.id or "new", c.url)
if c.id is None:
cur.execute("INSERT INTO channels (url, title, override_title, link, description, image, pubDate, sync_to_devices, device_playlist_name, username, password, last_modified, etag, channel_is_locked, foldername, auto_foldername) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (c.url, c.title, c.override_title, c.link, c.description, c.image, self.__mktime__(c.pubDate), c.sync_to_devices, c.device_playlist_name, c.username, c.password, c.last_modified, c.etag, c.channel_is_locked, c.foldername, c.auto_foldername, ))
cur.execute("INSERT INTO channels (url, title, override_title, link, description, image, pubDate, sync_to_devices, device_playlist_name, username, password, last_modified, etag, channel_is_locked, foldername, auto_foldername, release_expected, release_deviation, updated_timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (c.url, c.title, c.override_title, c.link, c.description, c.image, self.__mktime__(c.pubDate), c.sync_to_devices, c.device_playlist_name, c.username, c.password, c.last_modified, c.etag, c.channel_is_locked, c.foldername, c.auto_foldername, c.release_expected, c.release_deviation, c.updated_timestamp))
self.channel_map[c.url] = cur.lastrowid
else:
cur.execute("UPDATE channels SET url = ?, title = ?, override_title = ?, link = ?, description = ?, image = ?, pubDate = ?, sync_to_devices = ?, device_playlist_name = ?, username = ?, password = ?, last_modified = ?, etag = ?, channel_is_locked = ?, foldername = ?, auto_foldername = ?, deleted = 0 WHERE id = ?", (c.url, c.title, c.override_title, c.link, c.description, c.image, self.__mktime__(c.pubDate), c.sync_to_devices, c.device_playlist_name, c.username, c.password, c.last_modified, c.etag, c.channel_is_locked, c.foldername, c.auto_foldername, c.id, ))
cur.execute("UPDATE channels SET url = ?, title = ?, override_title = ?, link = ?, description = ?, image = ?, pubDate = ?, sync_to_devices = ?, device_playlist_name = ?, username = ?, password = ?, last_modified = ?, etag = ?, channel_is_locked = ?, foldername = ?, auto_foldername = ?, release_expected = ?, release_deviation = ?, updated_timestamp = ?, deleted = 0 WHERE id = ?", (c.url, c.title, c.override_title, c.link, c.description, c.image, self.__mktime__(c.pubDate), c.sync_to_devices, c.device_playlist_name, c.username, c.password, c.last_modified, c.etag, c.channel_is_locked, c.foldername, c.auto_foldername, c.release_expected, c.release_deviation, c.updated_timestamp, c.id, ))
cur.close()
self.lock.release()


@@ -118,7 +118,7 @@ app_authors = [
'Alain Tauch', 'Alex Ghitza', 'Alistair Sutton', 'Anders Kvist', 'Andrei Dolganov', 'Andrew Bennett', 'Andy Busch',
'Antonio Roversi', 'Aravind Seshadri', 'Atte André Jensen', 'audioworld',
'Bastian Staeck', 'Bernd Schlapsi', 'Bill Barnard', 'Bill Peters', 'Bjørn Rasmussen', 'Camille Moncelier', 'Casey Watson',
'Carlos Moffat', 'Chris Arnold', 'Chris Moffitt', 'Clark Burbidge', 'Cory Albrecht', 'daggpod', 'Daniel Ramos',
'Carlos Moffat', 'Chris Arnold', 'Chris Moffitt', 'Clark Burbidge', 'Corey Goldberg', 'Cory Albrecht', 'daggpod', 'Daniel Ramos',
'David Spreen', 'Doug Hellmann', 'Edouard Pellerin', 'Fabio Fiorentini', 'FFranci72', 'Florian Richter', 'Frank Harper',
'Franz Seidl', 'FriedBunny', 'Gerrit Sangel', 'Gilles Lehoux', 'Götz Waschk',
'Haim Roitgrund', 'Heinz Erhard', 'Hex', 'Holger Bauer', 'Holger Leskien', 'Iwan van der Kleijn', 'Jens Thiele',
@@ -2156,6 +2156,7 @@ class gPodder(BuilderWidget, dbus.service.Object):
break
except Exception, e:
util.idle_add(self.show_message, _('There has been an error updating %s: %s') % (saxutils.escape(channel.url), saxutils.escape(str(e))), _('Error while updating feed'))
log('Error: %s', str(e), sender=self, traceback=True)
# By the time we get here the update may have already been cancelled
if not self.feed_cache_update_cancelled:


@@ -40,6 +40,7 @@ from gpodder import draw
from gpodder import libtagupdate
from gpodder import dumbshelve
from gpodder import resolver
from gpodder import corestats
from gpodder.liblogger import log
from gpodder.libgpodder import gl
@@ -245,10 +246,24 @@ class PodcastChannel(PodcastModelObject):
        db.purge(gl.config.max_episodes_per_feed, self.id)

    def _update_etag_modified(self, feed):
        self.updated_timestamp = time.time()
        self.calculate_publish_behaviour()
        self.etag = feed.headers.get('etag', self.etag)
        self.last_modified = feed.headers.get('last-modified', self.last_modified)

    def update(self):
        if self.updated_timestamp > time.time() - 60*60*24:
            # If we have updated in the last 24 hours, do some optimizations
            if self.release_expected > time.time():
                hours = (self.release_expected-time.time())/(60*60)
                log('Expecting a release in %.2f hours - skipping %s', hours, self.title, sender=self)
                return

            # If we have updated in the last 10 minutes, skip the update
            if self.updated_timestamp > time.time() - 60*10:
                log('Last update still too recent - skipping %s', self.title, sender=self)
                return

        try:
            self.feed_fetcher.fetch_channel(self)
        except feedcore.UpdatedFeed, updated:
@@ -332,6 +347,29 @@ class PodcastChannel(PodcastModelObject):
        self.channel_is_locked = False
        self.release_expected = time.time()
        self.release_deviation = 0
        self.updated_timestamp = 0

    def calculate_publish_behaviour(self):
        episodes = db.load_episodes(self, factory=self.episode_factory, limit=30)
        if len(episodes) < 3:
            return

        deltas = []
        latest = max(e.pubDate for e in episodes)
        for index in range(len(episodes)-1):
            if episodes[index].pubDate != 0 and episodes[index+1].pubDate != 0:
                deltas.append(episodes[index].pubDate - episodes[index+1].pubDate)

        if len(deltas) > 1:
            stats = corestats.Stats(deltas)
            self.release_expected = min([latest+stats.stdev(), latest+(stats.min()+stats.avg())*.5])
            self.release_deviation = stats.stdev()
        else:
            self.release_expected = latest
            self.release_deviation = 0

    def request_save_dir_size(self):
        if not self.__save_dir_size_set:
            self.update_save_dir_size()
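
To make the heuristic concrete, here is a small standalone walk-through
(not part of the commit) of how release_expected is derived, reusing the
Stats class added above; the pubDate values are invented:

from gpodder.corestats import Stats

# Invented pubDates of the most recent episodes, newest first (Unix timestamps)
pubdates = [1244700000, 1244100000, 1243500000, 1242800000]

latest = max(pubdates)
deltas = [pubdates[i] - pubdates[i+1] for i in range(len(pubdates)-1)
          if pubdates[i] != 0 and pubdates[i+1] != 0]

stats = Stats(deltas)
# The next episode is expected either one standard deviation after the
# latest one, or halfway between the shortest and the average gap,
# whichever comes first.
release_expected = min(latest + stats.stdev(),
                       latest + (stats.min() + stats.avg()) * .5)
release_deviation = stats.stdev()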