# -*- coding: utf-8 -*-
#
# gPodder - A media aggregator and podcast client
# Copyright (c) 2005-2018 The gPodder Team
#
# gPodder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# gPodder is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# gpodder.youtube - YouTube and related magic
# Justin Forest 2008-10-13
#

import io
import json
import logging
import re
import urllib
import xml.etree.ElementTree
from functools import lru_cache
from html.parser import HTMLParser
from urllib.parse import parse_qs

import gpodder
from gpodder import registry, util

logger = logging.getLogger(__name__)

_ = gpodder.gettext

# http://en.wikipedia.org/wiki/YouTube#Quality_and_formats
# https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py#L447

# adaptive audio formats
#   140  MP4   128k
#   251  WebM  160k
#   250  WebM   70k
#   249  WebM   50k

# Each name below is a list of format ids in order of preference. An entry is
# either a plain integer id or a 'video+audio' string whose second half is one
# of the adaptive audio ids listed above.
#
# NOTE: statement order matters in this section. `a = [...] + b` copies the
# contents b has at that moment, while `a += b` extends a in place, so later
# in-place extensions of b are not reflected in lists built from it earlier.

# formats and fallbacks of same quality: WebM -> MP4 -> FLV
flv_240 = [5]
flv_270 = [6]
flv_360 = [34]
flv_480 = [35]
mp4_144 = ['160+140']
mp4_240 = ['133+140'] + flv_240
mp4_360 = [18, '134+140'] + flv_360
mp4_480 = ['135+140'] + flv_480
mp4_720 = [22, '136+140']
mp4_1080 = [37, '137+140']
mp4_1440 = ['264+140']
mp4_2160 = ['266+140']
mp4_3072 = [38]
mp4_4320 = ['138+140']
webm_144 = ['278+250'] + mp4_144
webm_240 = ['242+250'] + mp4_240
webm_360 = [43, '243+251'] + mp4_360
webm_480 = [44, '244+251'] + mp4_480
webm_720 = [45, '247+251'] + mp4_720
webm_1080 = [46, '248+251'] + mp4_1080
webm_1440 = ['271+251'] + mp4_1440
webm_2160 = ['313+251'] + mp4_2160
webm_4320 = ['272+251'] + mp4_4320

# fallbacks to lower quality
# (each list is extended in place with the next lower quality of its family)
webm_240 += webm_144
webm_360 += flv_270 + webm_240
webm_480 += webm_360
webm_720 += webm_480
webm_1080 += webm_720
webm_1440 += webm_1080
webm_2160 += webm_1440
webm_4320 += mp4_3072 + webm_2160
mp4_240 += mp4_144
mp4_360 += flv_270 + mp4_240
mp4_480 += mp4_360
mp4_720 += mp4_480
mp4_1080 += mp4_720
mp4_1440 += mp4_1080
mp4_2160 += mp4_1440
mp4_3072 += mp4_2160
mp4_4320 += mp4_3072
flv_270 += flv_240
flv_360 += flv_270
flv_480 += flv_360

# format id, (preferred ids, path(?), description)
# video bitrate, audio bitrate
# The list is ordered WebM, then MP4, then FLV, highest resolution first
# within each group.
formats = [
    # WebM VP8, VP9 or VP9 HFR video, Vorbis or Opus audio
    # Fallback to MP4 or FLV
    (272, (webm_4320, '272/7680x4320/99/0/0', 'WebM 4320p 8K (7680x4320) youtube-dl')),  # N/A, 160 kbps
    (313, (webm_2160, '313/3840x2160/99/0/0', 'WebM 2160p 4K (3840x2160) youtube-dl')),  # N/A, 160 kbps
    (271, (webm_1440, '271/2560x1440/99/0/0', 'WebM 1440p (2560x1440) youtube-dl')),  # N/A, 160 kbps
    (46, (webm_1080, '46/1920x1080/99/0/0', 'WebM 1080p (1920x1080) youtube-dl')),  # N/A, 192 kbps
    (45, (webm_720, '45/1280x720/99/0/0', 'WebM 720p (1280x720) youtube-dl')),  # 2.0 Mbps, 192 kbps
    (44, (webm_480, '44/854x480/99/0/0', 'WebM 480p (854x480) youtube-dl')),  # 1.0 Mbps, 128 kbps
    (43, (webm_360, '43/640x360/99/0/0', 'WebM 360p (640x360)')),  # 0.5 Mbps, 128 kbps
    (242, (webm_240, '242/426x240/99/0/0', 'WebM 240p (426x240) youtube-dl')),  # N/A, 70 kbps
    (278, (webm_144, '278/256x144/99/0/0', 'WebM 144p (256x144) youtube-dl')),  # N/A, 70 kbps

    # MP4 H.264 video, AAC audio
    # Fallback to FLV
    (138, (mp4_4320, '138/7680x4320/9/0/115', 'MP4 4320p 8K (7680x4320) youtube-dl')),  # N/A, 128 kbps
    (38, (mp4_3072, '38/4096x3072/9/0/115', 'MP4 3072p 4K (4096x3072)')),  # 5.0 - 3.5 Mbps, 192 kbps
    (266, (mp4_2160, '266/3840x2160/9/0/115', 'MP4 2160p 4K (3840x2160) youtube-dl')),  # N/A, 128 kbps
    (264, (mp4_1440, '264/2560x1440/9/0/115', 'MP4 1440p (2560x1440) youtube-dl')),  # N/A, 128 kbps
    (37, (mp4_1080, '37/1920x1080/9/0/115', 'MP4 1080p (1920x1080) youtube-dl')),  # 4.3 - 3.0 Mbps, 192 kbps
    (22, (mp4_720, '22/1280x720/9/0/115', 'MP4 720p (1280x720)')),  # 2.9 - 2.0 Mbps, 192 kbps
    (135, (mp4_480, '135/854x480/9/0/115', 'MP4 480p (854x480) youtube-dl')),  # N/A, 128 kbps
    (18, (mp4_360, '18/640x360/9/0/115', 'MP4 360p (640x360)')),  # 0.5 Mbps, 96 kbps
    (133, (mp4_240, '133/426x240/9/0/115', 'MP4 240p (426x240) youtube-dl')),  # N/A, 128 kbps
    (160, (mp4_144, '160/256x144/9/0/115', 'MP4 144p (256x144) youtube-dl')),  # N/A, 128 kbps

    # FLV H.264 video, AAC audio
    # Fallback to FLV 6 or 5
    (35, (flv_480, '35/854x480/9/0/115', 'FLV 480p (854x480)')),  # 1 - 0.80 Mbps, 128 kbps
    (34, (flv_360, '34/640x360/9/0/115', 'FLV 360p (640x360)')),  # 0.50 Mbps, 128 kbps

    # FLV Sorenson H.263 video, MP3 audio
    (6, (flv_270, '6/480x270/7/0/0', 'FLV 270p (480x270)')),  # 0.80 Mbps, 64 kbps
    (5, (flv_240, '5/320x240/7/0/0', 'FLV 240p (320x240)')),  # 0.25 Mbps, 64 kbps
]
# Lookup by format id -> (preferred ids, path, description)
formats_dict = dict(formats)

# streaming formats and fallbacks to lower quality
hls_144 = [91]
hls_240 = [92] + hls_144
hls_360 = [93] + hls_240
hls_480 = [94] + hls_360
hls_720 = [95] + hls_480
hls_1080 = [96] + hls_720

# Same tuple layout as `formats`: format id, (preferred ids, path, description)
hls_formats = [
    (96, (hls_1080, '9/1920x1080/9/0/115', 'MP4 1080p (1920x1080)')),  # N/A, 256 kbps
    (95, (hls_720, '9/1280x720/9/0/115', 'MP4 720p (1280x720)')),  # N/A, 256 kbps
    (94, (hls_480, '9/854x480/9/0/115', 'MP4 480p (854x480)')),  # N/A, 128 kbps
    (93, (hls_360, '9/640x360/9/0/115', 'MP4 360p (640x360)')),  # N/A, 128 kbps
    (92, (hls_240, '9/426x240/9/0/115', 'MP4 240p (426x240)')),  # N/A, 48 kbps
    (91, (hls_144, '9/256x144/9/0/115', 'MP4 144p (256x144)')),  # N/A, 48 kbps
]
# Lookup by streaming format id -> (preferred ids, path, description)
hls_formats_dict = dict(hls_formats)

# Base URL of the per-channel/user/playlist video feed
CHANNEL_VIDEOS_XML = 'https://www.youtube.com/feeds/videos.xml'
# Watch page URL prefix; the video id is appended after 'v='
WATCH_ENDPOINT = 'https://www.youtube.com/watch?bpctr=9999999999&has_verified=1&v='

# The page may contain "};" sequences inside the initial player response.
# Use a greedy match with script end tag, and fallback to a non-greedy match without.
@lru_cache(1)
def get_channel_id_url(url, feed_data=None):
    """
    Resolve the channel page URL that belongs to a YouTube page or feed.

    The channel ID is taken from the first of these sources that yields one:
    a ``channel_id=...">`` link in the raw response text, the feed-level
    ``yt:channelId`` element, or the ``yt:channelId`` element of an entry.

    The most recent result is memoized by lru_cache, so feed_data has to be
    hashable.

    @param url: address of a YouTube page or feed
    @param feed_data: optional, already fetched response for url; only its
                      ``text`` and ``content`` attributes are read
    @return: 'https://www.youtube.com/channel/<channel id>'
    @raise Exception: if url is not a YouTube URL or no channel ID was found
    """
    error_message = 'Could not retrieve YouTube channel ID for URL %s.' % url

    if 'youtube.com' in url:
        try:
            if feed_data is None:
                r = util.urlopen(url)
                if not r.ok:
                    raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
            else:
                r = feed_data

            channel_id = None
            # video page may contain corrupt HTML/XML, search for tag to avoid exception
            m = re.search(r'channel_id=([^"]+)">', r.text)
            if m:
                channel_id = m.group(1)
            else:
                xml_data = xml.etree.ElementTree.parse(io.BytesIO(r.content))
                element = xml_data.find("{http://www.youtube.com/xml/schemas/2015}channelId")
                # find() returns None when the feed has no such element at all
                if element is not None:
                    channel_id = element.text
                if not channel_id:
                    # check entries if feed has an empty channelId
                    m = re.search(r'<yt:channelId>([^<]+)</yt:channelId>', r.text)
                    if m:
                        channel_id = m.group(1)

            if not channel_id:
                raise Exception(error_message)

            return 'https://www.youtube.com/channel/{}'.format(channel_id)
        except Exception:
            # Logged with traceback here; the generic error below is what
            # callers get to see.
            logger.warning('Could not retrieve YouTube channel ID for URL %s.', url, exc_info=True)

    raise Exception(error_message)
def get_cover(url, feed_data=None):
    """
    Look up the cover art (channel avatar) of a YouTube channel.

    The channel page is fetched and scanned for the ``image_src`` link,
    falling back to the channel header profile image.

    @param url: address of a YouTube page or feed
    @param feed_data: optional, already fetched response for url, handed
                      on to get_channel_id_url()
    @return: the image URL, or None if url is not a YouTube URL, no image
             was found or the lookup failed (failures are logged)
    """
    if 'youtube.com' not in url:
        return None

    class YouTubeHTMLCoverParser(HTMLParser):
        """This custom html parser searches for the youtube channel thumbnail/avatar"""

        def __init__(self):
            super().__init__()
            self.preferred = []
            self.fallback = []

        @property
        def url(self):
            # Preferred images always sort before fallback images, no matter
            # in which order the tags appeared in the document.
            return self.preferred + self.fallback

        def handle_starttag(self, tag, attributes):
            attribute_dict = dict(attributes)

            # Look for 900x900px image first.
            if tag == 'link' and attribute_dict.get('rel') == 'image_src':
                if attribute_dict.get('href'):
                    self.preferred.append(attribute_dict['href'])

            # Fallback to image that may only be 100x100px.
            elif tag == 'img' and attribute_dict.get('class') == "channel-header-profile-image":
                if attribute_dict.get('src'):
                    self.fallback.append(attribute_dict['src'])

    try:
        channel_url = get_channel_id_url(url, feed_data)
        r = util.urlopen(channel_url)
        if not r.ok:
            # Report the URL that was actually requested
            raise YouTubeError('Youtube "%s": %d %s' % (channel_url, r.status_code, r.reason))
        html_data = util.response_text(r)
        parser = YouTubeHTMLCoverParser()
        parser.feed(html_data)
        candidates = parser.url
        if candidates:
            logger.debug('Youtube cover art for {} is: {}'.format(url, candidates))
            return candidates[0]
    except Exception:
        # Best effort: a missing cover must not break the caller
        logger.warning('Could not retrieve cover art', exc_info=True)

    return None
def get_gdpr_consent_url(html_data):
    """
    Creates the URL for automatically accepting GDPR consents

    EU GDPR redirects to a form that needs to be posted to be redirected to
    a get request with the form data as input to the youtube video URL. This
    extracts that form data from the GDPR form and builds up the URL the
    posted form results.

    Forms and inputs that lack the attributes looked at here (action, type,
    name) are skipped instead of aborting the extraction.

    @param html_data: HTML text of the consent page
    @return: the consent URL carrying the hidden form fields as query
    @raise YouTubeError: if the page cannot be parsed or contains no
                         consent form
    """
    class ConsentHTML(HTMLParser):
        def __init__(self):
            super().__init__()
            self.url = ''
            self.consentForm = False

        def handle_starttag(self, tag, attributes):
            attribute_dict = dict(attributes)
            if tag == 'form':
                if attribute_dict.get('action') == 'https://consent.youtube.com/s':
                    self.consentForm = True
                    self.url = 'https://consent.google.com/s?'
            # Get GDPR form elements
            elif self.consentForm and tag == 'input' and attribute_dict.get('type') == 'hidden':
                name = attribute_dict.get('name')
                if name:
                    # An attribute without a value is reported as None
                    value = attribute_dict.get('value') or ''
                    self.url += '&' + name + '=' + urllib.parse.quote_plus(value)

        def handle_endtag(self, tag):
            if tag == 'form':
                self.consentForm = False

    try:
        parser = ConsentHTML()
        parser.feed(html_data)
    except Exception as e:
        raise YouTubeError('Could not retrieve GDPR accepted consent URL') from e

    if parser.url:
        logger.debug('YouTube GDPR accept consent URL is: %s', parser.url)
        return parser.url

    logger.debug('YouTube GDPR accepted consent URL could not be resolved.')
    raise YouTubeError('No acceptable GDPR consent URL')
def get_channel_desc(url, feed_data=None):
    """
    Look up the description of a YouTube channel.

    The channel page is fetched and its ``<meta name="description">`` tag
    is read.

    @param url: address of a YouTube page or feed
    @param feed_data: optional, already fetched response for url, handed
                      on to get_channel_id_url()
    @return: the description, a translated placeholder text if the channel
             page provides none, or None if url is not a YouTube URL or the
             lookup failed (failures are logged)
    """
    if 'youtube.com' not in url:
        return None

    class YouTubeHTMLDesc(HTMLParser):
        """This custom html parser searches for the YouTube channel description."""

        def __init__(self):
            super().__init__()
            self.description = ''

        def handle_starttag(self, tag, attributes):
            attribute_dict = dict(attributes)

            # Get YouTube channel description.
            if tag == 'meta' and attribute_dict.get('name') == "description":
                content = attribute_dict.get('content')
                # A tag without (or with empty) content must neither raise
                # nor wipe out a description found earlier.
                if content:
                    self.description = content

    try:
        channel_url = get_channel_id_url(url, feed_data)
        r = util.urlopen(channel_url)
        if not r.ok:
            # Report the URL that was actually requested
            raise YouTubeError('Youtube "%s": %d %s' % (channel_url, r.status_code, r.reason))
        html_data = util.response_text(r)
        parser = YouTubeHTMLDesc()
        parser.feed(html_data)
        if parser.description:
            logger.debug('YouTube description for %s is: %s', url, parser.description)
            return parser.description

        logger.debug('YouTube description for %s is not provided.', url)
        return _('No description available')
    except Exception:
        # Best effort: a missing description must not break the caller
        logger.warning('Could not retrieve YouTube channel description for %s.', url, exc_info=True)

    return None
def parse_youtube_url(url):
    """
    Youtube Channel Links are parsed into youtube feed links
    >>> parse_youtube_url("https://www.youtube.com/channel/CHANNEL_ID")
    'https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID'

    Youtube User Links are parsed into youtube feed links
    >>> parse_youtube_url("https://www.youtube.com/user/USERNAME")
    'https://www.youtube.com/feeds/videos.xml?user=USERNAME'

    Youtube Playlist Links are parsed into youtube feed links
    >>> parse_youtube_url("https://www.youtube.com/playlist?list=PLAYLIST_ID")
    'https://www.youtube.com/feeds/videos.xml?playlist_id=PLAYLIST_ID'

    Feed links are returned unchanged
    >>> parse_youtube_url("https://www.youtube.com/feeds/videos.xml?user=USERNAME")
    'https://www.youtube.com/feeds/videos.xml?user=USERNAME'

    >>> parse_youtube_url(None) is None
    True

    Any other YouTube URL is looked up with get_channel_id_url() and the
    channel found there is turned into its feed link.

    @param url: the path to the channel, user or playlist
    @return: the feed url if successful or the given url if not
    """
    if url is None:
        return url

    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    logger.debug("Analyzing URL: {}".format(" ".join([scheme, netloc, path, query, fragment])))

    if 'youtube.com' not in netloc:
        logger.debug("Not a valid Youtube URL: {}".format(url))
        return url

    # Already a feed URL; 'user=' is the form generated below for user links
    if path == '/feeds/videos.xml' and re.search(r'^(user|user_id|channel_id|playlist_id)=', query):
        return url

    feed_query = None
    segments = path.split('/')
    if path.startswith('/user/') and segments[2]:
        feed_query = 'user={user_id}'.format(user_id=segments[2])
    elif path.startswith('/channel/') and segments[2]:
        feed_query = 'channel_id={channel_id}'.format(channel_id=segments[2])
    else:
        # Parse the query properly so that only a real 'list' parameter
        # counts, not any parameter whose name merely ends in 'list'.
        playlist_ids = urllib.parse.parse_qs(query).get('list')
        if playlist_ids:
            feed_query = urllib.parse.urlencode({'playlist_id': playlist_ids[0]})

    if feed_query is not None:
        logger.debug("Valid Youtube URL detected. Parsing...")
        new_url = urllib.parse.urlunsplit((scheme, netloc, '/feeds/videos.xml', feed_query, fragment))
        logger.debug("New Youtube URL: {}".format(new_url))
        return new_url

    # look for channel URL in page
    try:
        channel_url = get_channel_id_url(url)
    except Exception:
        # get_channel_id_url() raises when no channel can be determined;
        # keep the documented contract and hand back the given URL.
        logger.debug("Not a valid Youtube URL: {}".format(url))
        return url

    logger.debug("New Youtube URL: {}".format(channel_url))
    # The lookup yields a channel page URL, turn it into the feed URL
    return parse_youtube_url(channel_url)