From 20dd397e9ee01031cc11be0d473056ed204f0f22 Mon Sep 17 00:00:00 2001
From: auouymous
Date: Sun, 27 Feb 2022 02:52:06 -0700
Subject: [PATCH] Memoize youtube channel ID and feed data.

Internal youtube support and the youtube-dl extension both cause the
youtube feed URL to be fetched three times per update. Caching the feed
data from feedcore lets the internal support load the feed only once.
The lru_cache removes one of the youtube-dl fetches; not perfect, but
two fetches are better than three. I saw a 40% decrease in update times
when using the internal youtube code.

Raising an exception from get_channel_id_url() prevents get_cover() and
get_channel_desc() from attempting to fetch a None URL, and produces
more accurate error messages.

The lru_cache on get_youtube_id() saves about 1 ms per youtube channel
when updating, which adds up with a lot of channels and might save more
on slower devices.
---
 src/gpodder/feedcore.py | 13 ++++++++++---
 src/gpodder/model.py    |  7 ++++---
 src/gpodder/youtube.py  | 20 ++++++++++++++------
 tests/test_feedcore.py  |  5 +++--
 4 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/src/gpodder/feedcore.py b/src/gpodder/feedcore.py
index 2d536f6f..39cacc3f 100644
--- a/src/gpodder/feedcore.py
+++ b/src/gpodder/feedcore.py
@@ -110,6 +110,12 @@ class FeedAutodiscovery(HTMLParser):
         self._resolved_url = url
 
 
+class FetcherFeedData:
+    def __init__(self, text, content):
+        self.text = text
+        self.content = content
+
+
 class Fetcher(object):
     # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
     FEED_TYPES = ('application/rss+xml',
@@ -152,7 +158,7 @@ class Fetcher(object):
         else:
             raise UnknownStatusCode(status)
 
-    def parse_feed(self, url, data_stream, headers, status, **kwargs):
+    def parse_feed(self, url, feed_data, data_stream, headers, status, **kwargs):
         """
         kwargs are passed from Fetcher.fetch
         :param str url: real url
@@ -169,7 +175,7 @@
         if url.startswith('file://'):
             url = url[len('file://'):]
             stream = open(url)
-            return self.parse_feed(url, stream, {}, UPDATED_FEED, **kwargs)
+            return self.parse_feed(url, None, stream, {}, UPDATED_FEED, **kwargs)
 
         # remote feed
         headers = {}
@@ -210,4 +216,5 @@ class Fetcher(object):
         # xml documents specify the encoding inline so better pass encoded body.
         # Especially since requests will use ISO-8859-1 for content-type 'text/xml'
         # if the server doesn't specify a charset.
-        return self.parse_feed(url, BytesIO(stream.content), stream.headers, UPDATED_FEED, **kwargs)
+        return self.parse_feed(url, FetcherFeedData(stream.text, stream.content), BytesIO(stream.content), stream.headers,
+                               UPDATED_FEED, **kwargs)
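The FetcherFeedData change above is the heart of the memoization: the body
downloaded by the single HTTP request is kept alongside the parsed result
instead of being discarded, so later consumers can read it without a second
fetch. Below is a minimal, self-contained sketch of that pattern, not
gPodder's real API: FetcherFeedData matches the class in the diff, while
fetch_once(), parse_feed() and the downloader callable are illustrative
stand-ins.

    from io import BytesIO


    class FetcherFeedData:
        # Duck-types the two attributes of a requests Response that the
        # downstream helpers need: decoded text and raw bytes.
        def __init__(self, text, content):
            self.text = text
            self.content = content


    def parse_feed(url, feed_data, data_stream):
        # The parser reads the stream; feed_data rides along so callers can
        # reuse the body later without another network request.
        return {'url': url, 'first_line': data_stream.readline()}


    def fetch_once(url, downloader):
        # The single network round-trip; everything after this reuses the body.
        text, content = downloader(url)
        feed_data = FetcherFeedData(text, content)
        return parse_feed(url, feed_data, BytesIO(content)), feed_data


    if __name__ == '__main__':
        fake_downloader = lambda u: ('<rss/>', b'<rss/>')
        parsed, feed_data = fetch_once('https://example.org/feed', fake_downloader)
        print(parsed['first_line'], feed_data.text)  # b'<rss/>' <rss/>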
diff --git a/src/gpodder/model.py b/src/gpodder/model.py
index 1e130543..e8a37ca5 100644
--- a/src/gpodder/model.py
+++ b/src/gpodder/model.py
@@ -110,13 +110,13 @@ class PodcastParserFeed(Feed):
     def get_link(self):
         vid = youtube.get_youtube_id(self.feed['url'])
         if vid is not None:
-            self.feed['link'] = youtube.get_channel_id_url(self.feed['url'])
+            self.feed['link'] = youtube.get_channel_id_url(self.feed['url'], self.fetcher.feed_data)
         return self.feed.get('link')
 
     def get_description(self):
         vid = youtube.get_youtube_id(self.feed['url'])
         if vid is not None:
-            self.feed['description'] = youtube.get_channel_desc(self.feed['url'])
+            self.feed['description'] = youtube.get_channel_desc(self.feed['url'], self.fetcher.feed_data)
         return self.feed.get('description')
 
     def get_cover_url(self):
@@ -215,7 +215,8 @@ class gPodderFetcher(feedcore.Fetcher):
             url = vimeo.get_real_channel_url(url)
         return url
 
-    def parse_feed(self, url, data_stream, headers, status, max_episodes=0, **kwargs):
+    def parse_feed(self, url, feed_data, data_stream, headers, status, max_episodes=0, **kwargs):
+        self.feed_data = feed_data
         try:
             feed = podcastparser.parse(url, data_stream)
             feed['url'] = url
diff --git a/src/gpodder/youtube.py b/src/gpodder/youtube.py
index 14099b7d..5816bc3c 100644
--- a/src/gpodder/youtube.py
+++ b/src/gpodder/youtube.py
@@ -26,6 +26,7 @@ import logging
 import re
 import urllib
 import xml.etree.ElementTree
+from functools import lru_cache
 from html.parser import HTMLParser
 from urllib.parse import parse_qs
 
@@ -366,6 +367,7 @@ def get_real_download_url(url, allow_partial, preferred_fmt_ids=None):
     return url, duration
 
 
+@lru_cache(1)
 def get_youtube_id(url):
     r = re.compile(r'http[s]?://(?:[a-z]+\.)?youtube\.com/watch\?v=([^&]*)', re.IGNORECASE).match(url)
     if r is not None:
@@ -427,10 +429,14 @@ def get_real_channel_url(url):
     return for_each_feed_pattern(return_user_feed, url, url)
 
 
-def get_channel_id_url(url):
+@lru_cache(1)
+def get_channel_id_url(url, feed_data=None):
     if 'youtube.com' in url:
         try:
-            req = util.urlopen(url)
+            if feed_data is None:
+                req = util.urlopen(url)
+            else:
+                req = feed_data
             # video page may contain corrupt HTML/XML, search for tag to avoid exception
             m = re.search(r'<meta itemprop="channelId" content="([^"]+)">', req.text)
             if m:
@@ -445,8 +451,10 @@
         except Exception:
             logger.warning('Could not retrieve youtube channel id.', exc_info=True)
 
+    raise Exception('Could not retrieve youtube channel id.')
 
-def get_cover(url):
+
+def get_cover(url, feed_data=None):
     if 'youtube.com' in url:
 
         class YouTubeHTMLCoverParser(HTMLParser):
@@ -471,7 +479,7 @@
                     self.url.append(attribute_dict['src'])
 
         try:
-            channel_url = get_channel_id_url(url)
+            channel_url = get_channel_id_url(url, feed_data)
             html_data = util.response_text(util.urlopen(channel_url))
             parser = YouTubeHTMLCoverParser()
             parser.feed(html_data)
@@ -523,7 +531,7 @@ def get_gdpr_consent_url(html_data):
     raise YouTubeError('No acceptable GDPR consent URL')
 
 
-def get_channel_desc(url):
+def get_channel_desc(url, feed_data=None):
     if 'youtube.com' in url:
 
         class YouTubeHTMLDesc(HTMLParser):
@@ -542,7 +550,7 @@
                     self.description = attribute_dict['content']
 
         try:
-            channel_url = get_channel_id_url(url)
+            channel_url = get_channel_id_url(url, feed_data)
             html_data = util.response_text(util.urlopen(channel_url))
             parser = YouTubeHTMLDesc()
             parser.feed(html_data)
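Two techniques appear in the youtube.py hunks: functools.lru_cache memoizes
a pure URL helper so repeated calls during one update cost nothing, and
get_channel_id_url() now raises on failure instead of silently returning
None. A runnable sketch of both, using only the standard library:
extract_video_id() is a hypothetical stand-in for get_youtube_id(), the
watch-URL regex is quoted from the diff, and the raise mirrors the one the
patch adds to get_channel_id_url().

    import re
    from functools import lru_cache

    WATCH_RE = re.compile(r'http[s]?://(?:[a-z]+\.)?youtube\.com/watch\?v=([^&]*)', re.IGNORECASE)


    @lru_cache(1)  # size-1 cache: only the most recent url is remembered
    def extract_video_id(url):
        m = WATCH_RE.match(url)
        if m is not None:
            return m.group(1)
        # Fail fast, as get_channel_id_url() now does: raising here keeps
        # callers from trying to fetch a None URL and names the real problem.
        raise ValueError('could not extract a video id from %r' % url)


    if __name__ == '__main__':
        url = 'https://www.youtube.com/watch?v=abc123&t=9s'
        print(extract_video_id(url))          # miss: the regex actually runs
        print(extract_video_id(url))          # hit: answer comes from the cache
        print(extract_video_id.cache_info())  # CacheInfo(hits=1, misses=1, ...)

One caveat of this approach: lru_cache keys on its arguments, so the new
feed_data argument must be hashable; a plain class instance such as
FetcherFeedData hashes by identity, which satisfies that.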
diff --git a/tests/test_feedcore.py b/tests/test_feedcore.py
index 957a32ff..13402424 100644
--- a/tests/test_feedcore.py
+++ b/tests/test_feedcore.py
@@ -25,10 +25,11 @@ from gpodder.feedcore import Fetcher, Result, NEW_LOCATION, NOT_MODIFIED, UPDATED_FEED
 
 
 class MyFetcher(Fetcher):
-    def parse_feed(self, url, data_stream, headers, status, **kwargs):
+    def parse_feed(self, url, feed_data, data_stream, headers, status, **kwargs):
         return Result(status, {
             'parse_feed': {
                 'url': url,
+                'feed_data': feed_data,
                 'data_stream': data_stream,
                 'headers': headers,
                 'extra_args': dict(**kwargs),
@@ -112,4 +113,4 @@ def test_temporary_error_retry(httpserver):
     assert res.status == UPDATED_FEED
     args = res.feed['parse_feed']
     assert args['headers']['content-type'] == 'text/xml'
-    assert args['url'] == httpserver.url_for('/feed')
\ No newline at end of file
+    assert args['url'] == httpserver.url_for('/feed')
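The tests above only thread the new feed_data argument through; they do not
exercise the cache itself. To see why a cache of size 1 suits this workload,
here is a quick stand-alone demonstration (expensive_lookup() is
hypothetical, not from the patch): within one channel's update the same URL
is requested back-to-back, so only the most recent entry needs to survive,
and moving on to the next channel simply evicts it.

    from functools import lru_cache

    calls = []


    @lru_cache(1)
    def expensive_lookup(url):
        calls.append(url)  # stands in for a network fetch; cache hits skip it
        return url.upper()


    # Per-channel bursts: the same url repeats before the next channel starts.
    for url in ['a', 'a', 'a', 'b', 'b', 'a']:
        expensive_lookup(url)

    print(calls)  # ['a', 'b', 'a'] - one miss per burst; repeats cost nothing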