Memoize YouTube channel ID and feed data.

Internal YouTube support and the youtube-dl extension both cause the
YouTube feed URL to be fetched three times per update. Caching the feed
data from feedcore lets the internal support load the feed only once.
The lru_cache removes one of the youtube-dl fetches; not perfect, but
two fetches are better than three. I saw a 40% decrease in update times
when using the internal YouTube code.
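
The pattern, roughly: fetch once, keep both the decoded text and the raw
bytes, and hand that object to anything that would otherwise re-download
the URL. A minimal sketch, assuming requests-style responses; CachingFetcher
and its fetch() are illustrative stand-ins for feedcore.Fetcher, only
FetcherFeedData matches the diff below:

    import requests


    class FetcherFeedData:
        def __init__(self, text, content):
            self.text = text        # decoded body, for regex/HTML scraping
            self.content = content  # raw bytes, for the feed parser


    class CachingFetcher:
        """Illustrative stand-in for feedcore.Fetcher."""

        def __init__(self):
            self.feed_data = None

        def fetch(self, url):
            stream = requests.get(url, timeout=10)
            # Remember the body once; channel-metadata lookups can then be
            # handed self.feed_data instead of hitting the network again.
            self.feed_data = FetcherFeedData(stream.text, stream.content)
            return self.feed_data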

Raising an exception from get_channel_id_url() prevents get_cover() and
get_channel_desc() from attempting to fetch a None URL, and produces
more accurate error messages.
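
The failure path, sketched; this is simplified from the diff below, and
fetch_page_text plus the channel-URL format are illustrative assumptions
(the real code goes through util.urlopen):

    import logging
    import re

    logger = logging.getLogger(__name__)


    def fetch_page_text(url):
        # Hypothetical helper standing in for util.urlopen(url).text.
        import urllib.request
        with urllib.request.urlopen(url) as resp:
            return resp.read().decode('utf-8', errors='replace')


    def get_channel_id_url(url, feed_data=None):
        if 'youtube.com' in url:
            try:
                text = feed_data.text if feed_data is not None else fetch_page_text(url)
                m = re.search(r'<meta itemprop="channelId" content="([^"]+)">', text)
                if m:
                    return 'https://www.youtube.com/channel/' + m.group(1)
            except Exception:
                logger.warning('Could not retrieve youtube channel id.', exc_info=True)
        # Raise instead of returning None, so get_cover()/get_channel_desc()
        # fail with a clear message rather than trying to fetch a None URL.
        raise Exception('Could not retrieve youtube channel id.')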

The lru_cache on get_youtube_id() saves about 1 ms per YouTube channel
when updating, which adds up across many channels and may save more on
slower devices.
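
For illustration, the memoization is plain functools.lru_cache with a
single slot; repeated calls with the same URL during one update are
answered from the cache. The regex is simplified to the watch-URL form
shown in the diff (the real function checks several URL shapes):

    import re
    from functools import lru_cache


    @lru_cache(1)  # one slot: an update asks about the same URL repeatedly
    def get_youtube_id(url):
        r = re.compile(r'http[s]?://(?:[a-z]+\.)?youtube\.com/watch\?v=([^&]*)', re.IGNORECASE).match(url)
        return r.group(1) if r is not None else None


    get_youtube_id('https://www.youtube.com/watch?v=dQw4w9WgXcQ')  # computed
    get_youtube_id('https://www.youtube.com/watch?v=dQw4w9WgXcQ')  # cache hit
    print(get_youtube_id.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=1, currsize=1)
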
auouymous 2022-02-27 02:52:06 -07:00
parent be64fdda43
commit 20dd397e9e
4 changed files with 31 additions and 14 deletions

src/gpodder/feedcore.py

@@ -110,6 +110,12 @@ class FeedAutodiscovery(HTMLParser):
         self._resolved_url = url
 
 
+class FetcherFeedData:
+    def __init__(self, text, content):
+        self.text = text
+        self.content = content
+
+
 class Fetcher(object):
     # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
     FEED_TYPES = ('application/rss+xml',
@@ -152,7 +158,7 @@ class Fetcher(object):
         else:
             raise UnknownStatusCode(status)
 
-    def parse_feed(self, url, data_stream, headers, status, **kwargs):
+    def parse_feed(self, url, feed_data, data_stream, headers, status, **kwargs):
         """
         kwargs are passed from Fetcher.fetch
         :param str url: real url
@@ -169,7 +175,7 @@ class Fetcher(object):
         if url.startswith('file://'):
             url = url[len('file://'):]
             stream = open(url)
-            return self.parse_feed(url, stream, {}, UPDATED_FEED, **kwargs)
+            return self.parse_feed(url, None, stream, {}, UPDATED_FEED, **kwargs)
 
         # remote feed
         headers = {}
@@ -210,4 +216,5 @@ class Fetcher(object):
         # xml documents specify the encoding inline so better pass encoded body.
         # Especially since requests will use ISO-8859-1 for content-type 'text/xml'
         # if the server doesn't specify a charset.
-        return self.parse_feed(url, BytesIO(stream.content), stream.headers, UPDATED_FEED, **kwargs)
+        return self.parse_feed(url, FetcherFeedData(stream.text, stream.content), BytesIO(stream.content), stream.headers,
+                               UPDATED_FEED, **kwargs)

src/gpodder/model.py

@@ -110,13 +110,13 @@ class PodcastParserFeed(Feed):
     def get_link(self):
         vid = youtube.get_youtube_id(self.feed['url'])
         if vid is not None:
-            self.feed['link'] = youtube.get_channel_id_url(self.feed['url'])
+            self.feed['link'] = youtube.get_channel_id_url(self.feed['url'], self.fetcher.feed_data)
         return self.feed.get('link')
 
     def get_description(self):
         vid = youtube.get_youtube_id(self.feed['url'])
         if vid is not None:
-            self.feed['description'] = youtube.get_channel_desc(self.feed['url'])
+            self.feed['description'] = youtube.get_channel_desc(self.feed['url'], self.fetcher.feed_data)
         return self.feed.get('description')
 
     def get_cover_url(self):
@@ -215,7 +215,8 @@ class gPodderFetcher(feedcore.Fetcher):
             url = vimeo.get_real_channel_url(url)
         return url
 
-    def parse_feed(self, url, data_stream, headers, status, max_episodes=0, **kwargs):
+    def parse_feed(self, url, feed_data, data_stream, headers, status, max_episodes=0, **kwargs):
+        self.feed_data = feed_data
         try:
             feed = podcastparser.parse(url, data_stream)
             feed['url'] = url

src/gpodder/youtube.py

@@ -26,6 +26,7 @@ import logging
 import re
 import urllib
 import xml.etree.ElementTree
+from functools import lru_cache
 from html.parser import HTMLParser
 from urllib.parse import parse_qs
 
@@ -366,6 +367,7 @@ def get_real_download_url(url, allow_partial, preferred_fmt_ids=None):
     return url, duration
 
 
+@lru_cache(1)
 def get_youtube_id(url):
     r = re.compile(r'http[s]?://(?:[a-z]+\.)?youtube\.com/watch\?v=([^&]*)', re.IGNORECASE).match(url)
     if r is not None:
@@ -427,10 +429,14 @@ def get_real_channel_url(url):
     return for_each_feed_pattern(return_user_feed, url, url)
 
 
-def get_channel_id_url(url):
+@lru_cache(1)
+def get_channel_id_url(url, feed_data=None):
     if 'youtube.com' in url:
         try:
-            req = util.urlopen(url)
+            if feed_data is None:
+                req = util.urlopen(url)
+            else:
+                req = feed_data
             # video page may contain corrupt HTML/XML, search for tag to avoid exception
             m = re.search(r'<meta itemprop="channelId" content="([^"]+)">', req.text)
             if m:
@@ -445,8 +451,10 @@ def get_channel_id_url(url):
         except Exception:
             logger.warning('Could not retrieve youtube channel id.', exc_info=True)
 
+    raise Exception('Could not retrieve youtube channel id.')
+
 
-def get_cover(url):
+def get_cover(url, feed_data=None):
     if 'youtube.com' in url:
 
         class YouTubeHTMLCoverParser(HTMLParser):
@@ -471,7 +479,7 @@ def get_cover(url):
                     self.url.append(attribute_dict['src'])
 
         try:
-            channel_url = get_channel_id_url(url)
+            channel_url = get_channel_id_url(url, feed_data)
             html_data = util.response_text(util.urlopen(channel_url))
             parser = YouTubeHTMLCoverParser()
             parser.feed(html_data)
@@ -523,7 +531,7 @@ def get_gdpr_consent_url(html_data):
     raise YouTubeError('No acceptable GDPR consent URL')
 
 
-def get_channel_desc(url):
+def get_channel_desc(url, feed_data=None):
     if 'youtube.com' in url:
 
         class YouTubeHTMLDesc(HTMLParser):
@@ -542,7 +550,7 @@ def get_channel_desc(url):
                     self.description = attribute_dict['content']
 
         try:
-            channel_url = get_channel_id_url(url)
+            channel_url = get_channel_id_url(url, feed_data)
             html_data = util.response_text(util.urlopen(channel_url))
             parser = YouTubeHTMLDesc()
             parser.feed(html_data)

tests/test_feedcore.py

@@ -25,10 +25,11 @@ from gpodder.feedcore import Fetcher, Result, NEW_LOCATION, NOT_MODIFIED, UPDATED_FEED
 
 
 class MyFetcher(Fetcher):
-    def parse_feed(self, url, data_stream, headers, status, **kwargs):
+    def parse_feed(self, url, feed_data, data_stream, headers, status, **kwargs):
         return Result(status, {
             'parse_feed': {
                 'url': url,
+                'feed_data': feed_data,
                 'data_stream': data_stream,
                 'headers': headers,
                 'extra_args': dict(**kwargs),
@@ -112,4 +113,4 @@ def test_temporary_error_retry(httpserver):
     assert res.status == UPDATED_FEED
     args = res.feed['parse_feed']
     assert args['headers']['content-type'] == 'text/xml'
-    assert args['url'] == httpserver.url_for('/feed')
\ No newline at end of file
+    assert args['url'] == httpserver.url_for('/feed')