diff --git a/src/gpodder/feedcore.py b/src/gpodder/feedcore.py index 2d536f6f..39cacc3f 100644 --- a/src/gpodder/feedcore.py +++ b/src/gpodder/feedcore.py @@ -110,6 +110,12 @@ class FeedAutodiscovery(HTMLParser): self._resolved_url = url +class FetcherFeedData: + def __init__(self, text, content): + self.text = text + self.content = content + + class Fetcher(object): # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html FEED_TYPES = ('application/rss+xml', @@ -152,7 +158,7 @@ class Fetcher(object): else: raise UnknownStatusCode(status) - def parse_feed(self, url, data_stream, headers, status, **kwargs): + def parse_feed(self, url, feed_data, data_stream, headers, status, **kwargs): """ kwargs are passed from Fetcher.fetch :param str url: real url @@ -169,7 +175,7 @@ class Fetcher(object): if url.startswith('file://'): url = url[len('file://'):] stream = open(url) - return self.parse_feed(url, stream, {}, UPDATED_FEED, **kwargs) + return self.parse_feed(url, None, stream, {}, UPDATED_FEED, **kwargs) # remote feed headers = {} @@ -210,4 +216,5 @@ class Fetcher(object): # xml documents specify the encoding inline so better pass encoded body. # Especially since requests will use ISO-8859-1 for content-type 'text/xml' # if the server doesn't specify a charset. 
- return self.parse_feed(url, BytesIO(stream.content), stream.headers, UPDATED_FEED, **kwargs) + return self.parse_feed(url, FetcherFeedData(stream.text, stream.content), BytesIO(stream.content), stream.headers, + UPDATED_FEED, **kwargs) diff --git a/src/gpodder/model.py b/src/gpodder/model.py index 1e130543..e8a37ca5 100644 --- a/src/gpodder/model.py +++ b/src/gpodder/model.py @@ -110,13 +110,13 @@ class PodcastParserFeed(Feed): def get_link(self): vid = youtube.get_youtube_id(self.feed['url']) if vid is not None: - self.feed['link'] = youtube.get_channel_id_url(self.feed['url']) + self.feed['link'] = youtube.get_channel_id_url(self.feed['url'], self.fetcher.feed_data) return self.feed.get('link') def get_description(self): vid = youtube.get_youtube_id(self.feed['url']) if vid is not None: - self.feed['description'] = youtube.get_channel_desc(self.feed['url']) + self.feed['description'] = youtube.get_channel_desc(self.feed['url'], self.fetcher.feed_data) return self.feed.get('description') def get_cover_url(self): @@ -215,7 +215,8 @@ class gPodderFetcher(feedcore.Fetcher): url = vimeo.get_real_channel_url(url) return url - def parse_feed(self, url, data_stream, headers, status, max_episodes=0, **kwargs): + def parse_feed(self, url, feed_data, data_stream, headers, status, max_episodes=0, **kwargs): + self.feed_data = feed_data try: feed = podcastparser.parse(url, data_stream) feed['url'] = url diff --git a/src/gpodder/youtube.py b/src/gpodder/youtube.py index 14099b7d..5816bc3c 100644 --- a/src/gpodder/youtube.py +++ b/src/gpodder/youtube.py @@ -26,6 +26,7 @@ import logging import re import urllib import xml.etree.ElementTree +from functools import lru_cache from html.parser import HTMLParser from urllib.parse import parse_qs @@ -366,6 +367,7 @@ def get_real_download_url(url, allow_partial, preferred_fmt_ids=None): return url, duration +@lru_cache(1) def get_youtube_id(url): r = re.compile(r'http[s]?://(?:[a-z]+\.)?youtube\.com/watch\?v=([^&]*)', 
re.IGNORECASE).match(url) if r is not None: @@ -427,10 +429,14 @@ def get_real_channel_url(url): return for_each_feed_pattern(return_user_feed, url, url) -def get_channel_id_url(url): +@lru_cache(1) +def get_channel_id_url(url, feed_data=None): if 'youtube.com' in url: try: - req = util.urlopen(url) + if feed_data is None: + req = util.urlopen(url) + else: + req = feed_data # video page may contain corrupt HTML/XML, search for tag to avoid exception m = re.search(r'<meta itemprop="channelId" content="([^"]+)">', req.text) if m: @@ -445,8 +451,10 @@ def get_channel_id_url(url): except Exception: logger.warning('Could not retrieve youtube channel id.', exc_info=True) + raise Exception('Could not retrieve youtube channel id.') -def get_cover(url): + +def get_cover(url, feed_data=None): if 'youtube.com' in url: class YouTubeHTMLCoverParser(HTMLParser): @@ -471,7 +479,7 @@ def get_cover(url): self.url.append(attribute_dict['src']) try: - channel_url = get_channel_id_url(url) + channel_url = get_channel_id_url(url, feed_data) html_data = util.response_text(util.urlopen(channel_url)) parser = YouTubeHTMLCoverParser() parser.feed(html_data) @@ -523,7 +531,7 @@ def get_gdpr_consent_url(html_data): raise YouTubeError('No acceptable GDPR consent URL') -def get_channel_desc(url): +def get_channel_desc(url, feed_data=None): if 'youtube.com' in url: class YouTubeHTMLDesc(HTMLParser): @@ -542,7 +550,7 @@ def get_channel_desc(url): self.description = attribute_dict['content'] try: - channel_url = get_channel_id_url(url) + channel_url = get_channel_id_url(url, feed_data) html_data = util.response_text(util.urlopen(channel_url)) parser = YouTubeHTMLDesc() parser.feed(html_data) diff --git a/tests/test_feedcore.py b/tests/test_feedcore.py index 957a32ff..13402424 100644 --- a/tests/test_feedcore.py +++ b/tests/test_feedcore.py @@ -25,10 +25,11 @@ from gpodder.feedcore import Fetcher, Result, NEW_LOCATION, NOT_MODIFIED, UPDATE class MyFetcher(Fetcher): - def parse_feed(self, url, data_stream, headers, status, **kwargs): + def 
parse_feed(self, url, feed_data, data_stream, headers, status, **kwargs): return Result(status, { 'parse_feed': { 'url': url, + 'feed_data': feed_data, 'data_stream': data_stream, 'headers': headers, 'extra_args': dict(**kwargs), @@ -112,4 +113,4 @@ def test_temporary_error_retry(httpserver): assert res.status == UPDATED_FEED args = res.feed['parse_feed'] assert args['headers']['content-type'] == 'text/xml' - assert args['url'] == httpserver.url_for('/feed') \ No newline at end of file + assert args['url'] == httpserver.url_for('/feed')