Initial work on statistics-based update optimizations
Currently, for feeds updated in the last 24 hours, we try to intelligently skip feed updates based on the expected publishing date of the next episode for each podcast. Also, when the last feed update was only 10 minutes ago, the update is skipped.
This commit is contained in:
parent
4c7088dfa3
commit
a6b0187797
|
@ -0,0 +1,79 @@
|
|||
|
||||
#
|
||||
# corestats.py (COREy STATS)
|
||||
# Copyright (c) 2006-2007, Corey Goldberg (corey@goldb.org)
|
||||
# http://www.goldb.org/corestats.html
|
||||
#
|
||||
# statistical calculation class
|
||||
# for processing numeric sequences
|
||||
#
|
||||
# license: GNU LGPL
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# Slightly modified for use in gPodder
|
||||
#
|
||||
|
||||
class Stats(object):
    """Statistical calculations over a sequence of numbers.

    Based on corestats.py (COREy STATS) by Corey Goldberg,
    http://www.goldb.org/corestats.html (GNU LGPL), slightly
    modified for use in gPodder.

    Methods that need at least one data point return None for an
    empty sequence instead of raising.
    """

    def __init__(self, sequence):
        # Convert all items to floats up front so that integer input
        # still gets true division and float results everywhere.
        self.sequence = [float(item) for item in sequence]

    def sum(self):
        """Return the sum of the sequence, or None if it is empty."""
        if not self.sequence:
            return None
        return sum(self.sequence)

    def count(self):
        """Return the number of items in the sequence."""
        return len(self.sequence)

    def min(self):
        """Return the smallest value, or None if the sequence is empty."""
        if not self.sequence:
            return None
        return min(self.sequence)

    def max(self):
        """Return the largest value, or None if the sequence is empty."""
        if not self.sequence:
            return None
        return max(self.sequence)

    def avg(self):
        """Return the arithmetic mean, or None if the sequence is empty."""
        if not self.sequence:
            return None
        return sum(self.sequence) / len(self.sequence)

    def median(self):
        """Return the upper median, or None if the sequence is empty.

        Uses sorted() instead of sorting self.sequence in place, so
        calling this method no longer reorders the stored data as a
        side effect.
        """
        if not self.sequence:
            return None
        return sorted(self.sequence)[len(self.sequence) // 2]

    def stdev(self):
        """Return the sample standard deviation (n-1 denominator).

        Returns None for sequences with fewer than two items; the
        previous guard (len < 1) let a single-element sequence fall
        through to a division by zero.
        """
        if len(self.sequence) < 2:
            return None
        avg = self.avg()
        sdsq = sum((i - avg) ** 2 for i in self.sequence)
        return (sdsq / (len(self.sequence) - 1)) ** .5

    def percentile(self, percentile):
        """Return the value at the given percentile (0 <= p < 100).

        Returns None for an empty sequence. Raises ValueError for
        out-of-range percentiles; previously a negative percentile
        silently indexed from the end of the sorted sequence.
        """
        if not self.sequence:
            return None
        if percentile >= 100:
            raise ValueError('percentile must be < 100')
        if percentile < 0:
            raise ValueError('percentile must be >= 0')
        element_idx = int(len(self.sequence) * (percentile / 100.0))
        return sorted(self.sequence)[element_idx]
||||
|
|
@ -174,7 +174,10 @@ class Storage(object):
|
|||
("deleted", "INTEGER"),
|
||||
("channel_is_locked", "INTEGER"),
|
||||
("foldername", "TEXT"),
|
||||
("auto_foldername", "INTEGER")
|
||||
("auto_foldername", "INTEGER"),
|
||||
("release_expected", "INTEGER"),
|
||||
("release_deviation", "INTEGER"),
|
||||
("updated_timestamp", "INTEGER"),
|
||||
))
|
||||
|
||||
self.upgrade_table("episodes", (
|
||||
|
@ -276,7 +279,10 @@ class Storage(object):
|
|||
etag,
|
||||
channel_is_locked,
|
||||
foldername,
|
||||
auto_foldername
|
||||
auto_foldername,
|
||||
release_expected,
|
||||
release_deviation,
|
||||
updated_timestamp
|
||||
FROM
|
||||
channels
|
||||
WHERE
|
||||
|
@ -307,6 +313,9 @@ class Storage(object):
|
|||
'channel_is_locked': row[14],
|
||||
'foldername': row[15],
|
||||
'auto_foldername': row[16],
|
||||
'release_expected': row[17],
|
||||
'release_deviation': row[18],
|
||||
'updated_timestamp': row[19],
|
||||
}
|
||||
|
||||
if row[0] in stats:
|
||||
|
@ -360,10 +369,10 @@ class Storage(object):
|
|||
self.log("save_channel((%s)%s)", c.id or "new", c.url)
|
||||
|
||||
if c.id is None:
|
||||
cur.execute("INSERT INTO channels (url, title, override_title, link, description, image, pubDate, sync_to_devices, device_playlist_name, username, password, last_modified, etag, channel_is_locked, foldername, auto_foldername) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (c.url, c.title, c.override_title, c.link, c.description, c.image, self.__mktime__(c.pubDate), c.sync_to_devices, c.device_playlist_name, c.username, c.password, c.last_modified, c.etag, c.channel_is_locked, c.foldername, c.auto_foldername, ))
|
||||
cur.execute("INSERT INTO channels (url, title, override_title, link, description, image, pubDate, sync_to_devices, device_playlist_name, username, password, last_modified, etag, channel_is_locked, foldername, auto_foldername, release_expected, release_deviation, updated_timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (c.url, c.title, c.override_title, c.link, c.description, c.image, self.__mktime__(c.pubDate), c.sync_to_devices, c.device_playlist_name, c.username, c.password, c.last_modified, c.etag, c.channel_is_locked, c.foldername, c.auto_foldername, c.release_expected, c.release_deviation, c.updated_timestamp))
|
||||
self.channel_map[c.url] = cur.lastrowid
|
||||
else:
|
||||
cur.execute("UPDATE channels SET url = ?, title = ?, override_title = ?, link = ?, description = ?, image = ?, pubDate = ?, sync_to_devices = ?, device_playlist_name = ?, username = ?, password = ?, last_modified = ?, etag = ?, channel_is_locked = ?, foldername = ?, auto_foldername = ?, deleted = 0 WHERE id = ?", (c.url, c.title, c.override_title, c.link, c.description, c.image, self.__mktime__(c.pubDate), c.sync_to_devices, c.device_playlist_name, c.username, c.password, c.last_modified, c.etag, c.channel_is_locked, c.foldername, c.auto_foldername, c.id, ))
|
||||
cur.execute("UPDATE channels SET url = ?, title = ?, override_title = ?, link = ?, description = ?, image = ?, pubDate = ?, sync_to_devices = ?, device_playlist_name = ?, username = ?, password = ?, last_modified = ?, etag = ?, channel_is_locked = ?, foldername = ?, auto_foldername = ?, release_expected = ?, release_deviation = ?, updated_timestamp = ?, deleted = 0 WHERE id = ?", (c.url, c.title, c.override_title, c.link, c.description, c.image, self.__mktime__(c.pubDate), c.sync_to_devices, c.device_playlist_name, c.username, c.password, c.last_modified, c.etag, c.channel_is_locked, c.foldername, c.auto_foldername, c.release_expected, c.release_deviation, c.updated_timestamp, c.id, ))
|
||||
|
||||
cur.close()
|
||||
self.lock.release()
|
||||
|
|
|
@ -118,7 +118,7 @@ app_authors = [
|
|||
'Alain Tauch', 'Alex Ghitza', 'Alistair Sutton', 'Anders Kvist', 'Andrei Dolganov', 'Andrew Bennett', 'Andy Busch',
|
||||
'Antonio Roversi', 'Aravind Seshadri', 'Atte André Jensen', 'audioworld',
|
||||
'Bastian Staeck', 'Bernd Schlapsi', 'Bill Barnard', 'Bill Peters', 'Bjørn Rasmussen', 'Camille Moncelier', 'Casey Watson',
|
||||
'Carlos Moffat', 'Chris Arnold', 'Chris Moffitt', 'Clark Burbidge', 'Cory Albrecht', 'daggpod', 'Daniel Ramos',
|
||||
'Carlos Moffat', 'Chris Arnold', 'Chris Moffitt', 'Clark Burbidge', 'Corey Goldberg', 'Cory Albrecht', 'daggpod', 'Daniel Ramos',
|
||||
'David Spreen', 'Doug Hellmann', 'Edouard Pellerin', 'Fabio Fiorentini', 'FFranci72', 'Florian Richter', 'Frank Harper',
|
||||
'Franz Seidl', 'FriedBunny', 'Gerrit Sangel', 'Gilles Lehoux', 'Götz Waschk',
|
||||
'Haim Roitgrund', 'Heinz Erhard', 'Hex', 'Holger Bauer', 'Holger Leskien', 'Iwan van der Kleijn', 'Jens Thiele',
|
||||
|
@ -2156,6 +2156,7 @@ class gPodder(BuilderWidget, dbus.service.Object):
|
|||
break
|
||||
except Exception, e:
|
||||
util.idle_add(self.show_message, _('There has been an error updating %s: %s') % (saxutils.escape(channel.url), saxutils.escape(str(e))), _('Error while updating feed'))
|
||||
log('Error: %s', str(e), sender=self, traceback=True)
|
||||
|
||||
# By the time we get here the update may have already been cancelled
|
||||
if not self.feed_cache_update_cancelled:
|
||||
|
|
|
@ -40,6 +40,7 @@ from gpodder import draw
|
|||
from gpodder import libtagupdate
|
||||
from gpodder import dumbshelve
|
||||
from gpodder import resolver
|
||||
from gpodder import corestats
|
||||
|
||||
from gpodder.liblogger import log
|
||||
from gpodder.libgpodder import gl
|
||||
|
@ -245,10 +246,24 @@ class PodcastChannel(PodcastModelObject):
|
|||
db.purge(gl.config.max_episodes_per_feed, self.id)
|
||||
|
||||
def _update_etag_modified(self, feed):
    # Bookkeeping after a successful feed fetch: record the fetch
    # time so update() can skip needless re-fetches later.
    self.updated_timestamp = time.time()
    # Recompute the expected release date of the next episode from
    # this channel's recent publishing history.
    self.calculate_publish_behaviour()
    # Remember the HTTP cache validators; keep the previous values
    # when the server did not send the corresponding header.
    self.etag = feed.headers.get('etag', self.etag)
    self.last_modified = feed.headers.get('last-modified', self.last_modified)
||||
def update(self):
|
||||
if self.updated_timestamp > time.time() - 60*60*24:
|
||||
# If we have updated in the last 24 hours, do some optimizations
|
||||
if self.release_expected > time.time():
|
||||
hours = (self.release_expected-time.time())/(60*60)
|
||||
log('Expecting a release in %.2f hours - skipping %s', hours, self.title, sender=self)
|
||||
return
|
||||
|
||||
# If we have updated in the last 10 minutes, skip the update
|
||||
if self.updated_timestamp > time.time() - 60*10:
|
||||
log('Last update still too recent - skipping %s', self.title, sender=self)
|
||||
return
|
||||
|
||||
try:
|
||||
self.feed_fetcher.fetch_channel(self)
|
||||
except feedcore.UpdatedFeed, updated:
|
||||
|
@ -332,6 +347,29 @@ class PodcastChannel(PodcastModelObject):
|
|||
|
||||
self.channel_is_locked = False
|
||||
|
||||
self.release_expected = time.time()
|
||||
self.release_deviation = 0
|
||||
self.updated_timestamp = 0
|
||||
|
||||
def calculate_publish_behaviour(self):
    # Estimate when the next episode of this channel should appear,
    # based on the intervals between recent episode publish dates.
    # Stores the estimate in self.release_expected (unix timestamp)
    # and its spread in self.release_deviation (seconds).
    episodes = db.load_episodes(self, factory=self.episode_factory, limit=30)
    if len(episodes) < 3:
        # Too few data points for a meaningful estimate; leave the
        # previous release_expected/release_deviation untouched.
        return

    # Gaps (in seconds) between consecutive episodes; entries with a
    # zero (i.e. unknown) pubDate are excluded from the statistics.
    deltas = []
    latest = max(e.pubDate for e in episodes)
    for index in range(len(episodes)-1):
        if episodes[index].pubDate != 0 and episodes[index+1].pubDate != 0:
            deltas.append(episodes[index].pubDate - episodes[index+1].pubDate)

    if len(deltas) > 1:
        stats = corestats.Stats(deltas)
        # Expect the next release at the earlier of two heuristics:
        # latest + stdev of the gaps, or latest + the mean of the
        # smallest and average gap. Taking the min keeps the skip
        # optimization conservative (prefer updating too early over
        # missing a new episode).
        self.release_expected = min([latest+stats.stdev(), latest+(stats.min()+stats.avg())*.5])
        self.release_deviation = stats.stdev()
    else:
        # Only one usable gap: fall back to "could release any time
        # from the latest episode onwards", with no deviation data.
        self.release_expected = latest
        self.release_deviation = 0
||||
def request_save_dir_size(self):
|
||||
if not self.__save_dir_size_set:
|
||||
self.update_save_dir_size()
|
||||
|
|
Loading…
Reference in New Issue