2008-10-13 15:28:44 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
#
|
|
|
|
# gPodder - A media aggregator and podcast client
|
2018-01-28 19:39:53 +01:00
|
|
|
# Copyright (c) 2005-2018 The gPodder Team
|
2008-10-13 15:28:44 +02:00
|
|
|
#
|
|
|
|
# gPodder is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# gPodder is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
2009-08-24 17:02:35 +02:00
|
|
|
# gpodder.youtube - YouTube and related magic
|
2008-10-13 15:28:44 +02:00
|
|
|
# Justin Forest <justin.forest@gmail.com> 2008-10-13
|
|
|
|
#
|
2009-08-24 17:02:35 +02:00
|
|
|
|
2020-07-18 15:04:02 +02:00
|
|
|
import io
|
2016-11-26 16:05:17 +01:00
|
|
|
import json
|
2018-07-24 11:08:10 +02:00
|
|
|
import logging
|
2008-10-13 15:28:44 +02:00
|
|
|
import re
|
2018-04-16 12:10:14 +02:00
|
|
|
import urllib
|
|
|
|
import xml.etree.ElementTree
|
2022-02-27 10:52:06 +01:00
|
|
|
from functools import lru_cache
|
2018-07-24 11:08:10 +02:00
|
|
|
from html.parser import HTMLParser
|
|
|
|
from urllib.parse import parse_qs
|
|
|
|
|
2019-08-27 21:56:13 +02:00
|
|
|
import gpodder
|
2019-08-17 16:25:00 +02:00
|
|
|
from gpodder import registry, util
|
2018-07-24 11:08:10 +02:00
|
|
|
|
2018-04-16 12:10:14 +02:00
|
|
|
# Module-level logger, named after this module per gPodder convention.
logger = logging.getLogger(__name__)


# Shorthand for gPodder's gettext translation helper.
_ = gpodder.gettext
|
|
|
|
|
|
|
|
|
2020-04-24 14:26:00 +02:00
|
|
|
# http://en.wikipedia.org/wiki/YouTube#Quality_and_formats
# https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py#L447

# adaptive audio formats
# 140 MP4 128k
# 251 WebM 160k
# 250 WebM 70k
# 249 WebM 50k

# Each *_<height> list is an ordered preference list of format ids:
# plain ints are single-file itags, 'video+audio' strings pair separate
# adaptive streams (entries whose description below says 'youtube-dl').

# formats and fallbacks of same quality: WebM -> MP4 -> FLV
flv_240 = [5]
flv_270 = [6]
flv_360 = [34]
flv_480 = [35]
mp4_144 = ['160+140']
mp4_240 = ['133+140'] + flv_240
mp4_360 = [18, '134+140'] + flv_360
mp4_480 = ['135+140'] + flv_480
mp4_720 = [22, '136+140']
mp4_1080 = [37, '137+140']
mp4_1440 = ['264+140']
mp4_2160 = ['266+140']
mp4_3072 = [38]
mp4_4320 = ['138+140']
webm_144 = ['278+250'] + mp4_144
webm_240 = ['242+250'] + mp4_240
webm_360 = [43, '243+251'] + mp4_360
webm_480 = [44, '244+251'] + mp4_480
webm_720 = [45, '247+251'] + mp4_720
webm_1080 = [46, '248+251'] + mp4_1080
webm_1440 = ['271+251'] + mp4_1440
webm_2160 = ['313+251'] + mp4_2160
webm_4320 = ['272+251'] + mp4_4320

# fallbacks to lower quality
# (each list is extended in place, so the tuples in `formats` below see
# the fully chained preference order)
webm_240 += webm_144
webm_360 += flv_270 + webm_240
webm_480 += webm_360
webm_720 += webm_480
webm_1080 += webm_720
webm_1440 += webm_1080
webm_2160 += webm_1440
webm_4320 += mp4_3072 + webm_2160
mp4_240 += mp4_144
mp4_360 += flv_270 + mp4_240
mp4_480 += mp4_360
mp4_720 += mp4_480
mp4_1080 += mp4_720
mp4_1440 += mp4_1080
mp4_2160 += mp4_1440
mp4_3072 += mp4_2160
mp4_4320 += mp4_3072
flv_270 += flv_240
flv_360 += flv_270
flv_480 += flv_360

# format id, (preferred ids, path(?), description) # video bitrate, audio bitrate
formats = [
    # WebM VP8, VP9 or VP9 HFR video, Vorbis or Opus audio
    # Fallback to MP4 or FLV
    (272, (webm_4320, '272/7680x4320/99/0/0', 'WebM 4320p 8K (7680x4320) youtube-dl')),  # N/A, 160 kbps
    (313, (webm_2160, '313/3840x2160/99/0/0', 'WebM 2160p 4K (3840x2160) youtube-dl')),  # N/A, 160 kbps
    (271, (webm_1440, '271/2560x1440/99/0/0', 'WebM 1440p (2560x1440) youtube-dl')),  # N/A, 160 kbps
    (46, (webm_1080, '46/1920x1080/99/0/0', 'WebM 1080p (1920x1080) youtube-dl')),  # N/A, 192 kbps
    (45, (webm_720, '45/1280x720/99/0/0', 'WebM 720p (1280x720) youtube-dl')),  # 2.0 Mbps, 192 kbps
    (44, (webm_480, '44/854x480/99/0/0', 'WebM 480p (854x480) youtube-dl')),  # 1.0 Mbps, 128 kbps
    (43, (webm_360, '43/640x360/99/0/0', 'WebM 360p (640x360)')),  # 0.5 Mbps, 128 kbps
    (242, (webm_240, '242/426x240/99/0/0', 'WebM 240p (426x240) youtube-dl')),  # N/A, 70 kbps
    (278, (webm_144, '278/256x144/99/0/0', 'WebM 144p (256x144) youtube-dl')),  # N/A, 70 kbps

    # MP4 H.264 video, AAC audio
    # Fallback to FLV
    (138, (mp4_4320, '138/7680x4320/9/0/115', 'MP4 4320p 8K (7680x4320) youtube-dl')),  # N/A, 128 kbps
    (38, (mp4_3072, '38/4096x3072/9/0/115', 'MP4 3072p 4K (4096x3072)')),  # 5.0 - 3.5 Mbps, 192 kbps
    (266, (mp4_2160, '266/3840x2160/9/0/115', 'MP4 2160p 4K (3840x2160) youtube-dl')),  # N/A, 128 kbps
    (264, (mp4_1440, '264/2560x1440/9/0/115', 'MP4 1440p (2560x1440) youtube-dl')),  # N/A, 128 kbps
    (37, (mp4_1080, '37/1920x1080/9/0/115', 'MP4 1080p (1920x1080)')),  # 4.3 - 3.0 Mbps, 192 kbps
    (22, (mp4_720, '22/1280x720/9/0/115', 'MP4 720p (1280x720)')),  # 2.9 - 2.0 Mbps, 192 kbps
    (135, (mp4_480, '135/854x480/9/0/115', 'MP4 480p (854x480) youtube-dl')),  # N/A, 128 kbps
    (18, (mp4_360, '18/640x360/9/0/115', 'MP4 360p (640x360)')),  # 0.5 Mbps, 96 kbps
    (133, (mp4_240, '133/426x240/9/0/115', 'MP4 240p (426x240) youtube-dl')),  # N/A, 128 kbps
    (160, (mp4_144, '160/256x144/9/0/115', 'MP4 144p (256x144) youtube-dl')),  # N/A, 128 kbps

    # FLV H.264 video, AAC audio
    # Fallback to FLV 6 or 5
    (35, (flv_480, '35/854x480/9/0/115', 'FLV 480p (854x480)')),  # 1 - 0.80 Mbps, 128 kbps
    (34, (flv_360, '34/640x360/9/0/115', 'FLV 360p (640x360)')),  # 0.50 Mbps, 128 kbps

    # FLV Sorenson H.263 video, MP3 audio
    (6, (flv_270, '6/480x270/7/0/0', 'FLV 270p (480x270)')),  # 0.80 Mbps, 64 kbps
    (5, (flv_240, '5/320x240/7/0/0', 'FLV 240p (320x240)')),  # 0.25 Mbps, 64 kbps
]
# itag -> (preferred ids, path, description) lookup used by get_fmt_ids()
# and get_real_download_url().
formats_dict = dict(formats)


# streaming formats and fallbacks to lower quality
hls_144 = [91]
hls_240 = [92] + hls_144
hls_360 = [93] + hls_240
hls_480 = [94] + hls_360
hls_720 = [95] + hls_480
hls_1080 = [96] + hls_720

hls_formats = [
    (96, (hls_1080, '9/1920x1080/9/0/115', 'MP4 1080p (1920x1080)')),  # N/A, 256 kbps
    (95, (hls_720, '9/1280x720/9/0/115', 'MP4 720p (1280x720)')),  # N/A, 256 kbps
    (94, (hls_480, '9/854x480/9/0/115', 'MP4 480p (854x480)')),  # N/A, 128 kbps
    (93, (hls_360, '9/640x360/9/0/115', 'MP4 360p (640x360)')),  # N/A, 128 kbps
    (92, (hls_240, '9/426x240/9/0/115', 'MP4 240p (426x240)')),  # N/A, 48 kbps
    (91, (hls_144, '9/256x144/9/0/115', 'MP4 144p (256x144)')),  # N/A, 48 kbps
]
# itag -> (preferred ids, path, description) lookup for HLS live streams.
hls_formats_dict = dict(hls_formats)
|
|
|
|
|
2015-05-20 21:10:57 +02:00
|
|
|
# Atom feed listing a channel's videos (takes ?channel_id=/?user= queries).
CHANNEL_VIDEOS_XML = 'https://www.youtube.com/feeds/videos.xml'
# Watch-page endpoint; bpctr/has_verified parameters pre-answer some
# content-gate prompts so the player response is present in the page.
WATCH_ENDPOINT = 'https://www.youtube.com/watch?bpctr=9999999999&has_verified=1&v='
|
|
|
|
|
|
|
|
# The page may contain "};" sequences inside the initial player response.
# Use a greedy match with script end tag, and fallback to a non-greedy match without.
INITIAL_PLAYER_RESPONSE_RE1 = r'ytInitialPlayerResponse\s*=\s*({.+})\s*;\s*</script'
INITIAL_PLAYER_RESPONSE_RE2 = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'


def get_ipr(page):
    """Locate the ytInitialPlayerResponse JSON object in a watch page.

    Returns the re.Match whose group(1) holds the JSON text, or None
    when neither regex matches.
    """
    # Prefer the stricter greedy pattern anchored on the closing script tag.
    match = re.search(INITIAL_PLAYER_RESPONSE_RE1, page)
    if match is not None:
        return match
    # Fall back to the lazy pattern that stops at the first "};".
    return re.search(INITIAL_PLAYER_RESPONSE_RE2, page)
|
2015-05-20 21:10:57 +02:00
|
|
|
|
|
|
|
|
2018-04-08 23:08:05 +02:00
|
|
|
class YouTubeError(Exception):
    """Raised when a YouTube page, endpoint or format cannot be retrieved or parsed."""
    pass
|
2010-12-18 14:32:33 +01:00
|
|
|
|
2012-09-19 13:43:20 +02:00
|
|
|
|
2020-09-12 10:34:39 +02:00
|
|
|
def get_fmt_ids(youtube_config, allow_partial):
    """Build the ordered list of preferred YouTube format ids.

    Reads the user's preferred (HLS and regular) format settings from
    *youtube_config*; a preferred id of 0 means "use the configured id
    list as-is". When *allow_partial* is true, HLS streaming ids are
    appended after the regular download ids.
    """
    # Resolve the HLS portion first; it is only relevant for streaming.
    hls_ids = []
    if allow_partial:
        hls_pref = youtube_config.preferred_hls_fmt_id
        if hls_pref == 0:
            # 0 selects the explicit id list from the configuration.
            hls_ids = youtube_config.preferred_hls_fmt_ids or []
        else:
            entry = hls_formats_dict.get(hls_pref)
            if entry is not None:
                hls_ids, _path, _description = entry

    # Now the regular download formats.
    fmt_pref = youtube_config.preferred_fmt_id
    if fmt_pref == 0:
        if youtube_config.preferred_fmt_ids:
            return youtube_config.preferred_fmt_ids + hls_ids
        return hls_ids

    entry = formats_dict.get(fmt_pref)
    if entry is None:
        return hls_ids
    fmt_ids, _path, _description = entry
    return fmt_ids + hls_ids
|
2012-10-01 10:56:26 +02:00
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2019-08-17 16:25:00 +02:00
|
|
|
@registry.download_url.register
def youtube_real_download_url(config, episode, allow_partial):
    """Resolve an episode's YouTube page URL to a direct media URL.

    Also records the episode duration when the resolver reports one.
    Returns None when the URL was not resolved to anything new (the
    resolver handed back the original URL unchanged).
    """
    if config:
        fmt_ids = get_fmt_ids(config.youtube, allow_partial)
    else:
        fmt_ids = None
    real_url, duration_ms = get_real_download_url(episode.url, allow_partial, fmt_ids)
    if duration_ms is not None:
        # Duration is reported in milliseconds; total_time is in seconds.
        episode.total_time = int(int(duration_ms) / 1000)
    if real_url == episode.url:
        return None
    return real_url
|
|
|
|
|
|
|
|
|
2021-07-07 05:07:47 +02:00
|
|
|
def youtube_get_old_endpoint(vid):
    """Fetch video info for *vid* from the legacy get_video_info endpoint.

    Returns (page_text, None) — the None slot mirrors the
    (old_page, new_page) convention of youtube_get_new_endpoint().
    Raises YouTubeError on any HTTP error.
    """
    # TODO: changing 'detailpage' to 'embedded' allows age-restricted content
    url = 'https://www.youtube.com/get_video_info?html5=1&c=TVHTML5&cver=6.20180913&el=detailpage&video_id=' + vid
    response = util.urlopen(url)
    if response.ok:
        return response.text, None
    raise YouTubeError('Youtube "%s": %d %s' % (url, response.status_code, response.reason))
|
|
|
|
|
|
|
|
|
|
|
|
def youtube_get_new_endpoint(vid):
    """Fetch the watch page for *vid* and extract the player-response JSON.

    Returns (None, json_text) — the None slot mirrors the
    (old_page, new_page) convention of youtube_get_old_endpoint().
    Raises YouTubeError on HTTP errors or when no ytInitialPlayerResponse
    can be found even after accepting a GDPR consent page.
    """
    def fetch(page_url):
        # Download a page, raising YouTubeError on any HTTP failure.
        response = util.urlopen(page_url)
        if not response.ok:
            raise YouTubeError('Youtube "%s": %d %s' % (page_url, response.status_code, response.reason))
        return response.text

    url = WATCH_ENDPOINT + vid
    page = fetch(url)

    match = get_ipr(page)
    if match is None:
        # Possibly an EU GDPR consent interstitial: accept it and retry once.
        try:
            url = get_gdpr_consent_url(page)
        except YouTubeError as e:
            raise YouTubeError('Youtube "%s": No ytInitialPlayerResponse found and %s' % (url, str(e)))
        page = fetch(url)

        match = get_ipr(page)
        if match is None:
            raise YouTubeError('Youtube "%s": No ytInitialPlayerResponse found' % url)

    return None, match.group(1)
|
|
|
|
|
|
|
|
|
2021-08-15 07:09:07 +02:00
|
|
|
def get_total_time(episode):
    """Return the length of a YouTube episode in seconds, or 0 on failure.

    Fetches the watch page and reads videoDetails.lengthSeconds from the
    ytInitialPlayerResponse JSON. If the first fetch lands on a GDPR
    consent page, the consent is accepted and the fetch retried once.
    Any failure (network, parsing, missing keys) yields 0.
    """
    try:
        vid = get_youtube_id(episode.url)
        if vid is None:
            return 0

        url = WATCH_ENDPOINT + vid
        r = util.urlopen(url)
        if not r.ok:
            return 0

        ipr = get_ipr(r.text)
        if ipr is None:
            # Possibly a GDPR consent interstitial: accept it and retry once.
            url = get_gdpr_consent_url(r.text)
            r = util.urlopen(url)
            if not r.ok:
                return 0

            ipr = get_ipr(r.text)
            if ipr is None:
                return 0

        player_response = json.loads(ipr.group(1))
        return int(player_response['videoDetails']['lengthSeconds'])  # 0 if live
    except Exception:
        # Best-effort lookup: swallow ordinary errors but not SystemExit /
        # KeyboardInterrupt, which the previous bare "except:" also caught.
        return 0
|
|
|
|
|
|
|
|
|
2020-09-12 10:34:39 +02:00
|
|
|
def get_real_download_url(url, allow_partial, preferred_fmt_ids=None):
    """Resolve a YouTube page URL to (media_url, duration).

    Returns the direct media URL for the best available format from
    *preferred_fmt_ids* (defaulting to the MP4 720p preference chain),
    plus the duration (in milliseconds, as reported by YouTube) or None.
    If *url* is not a YouTube video link it is returned unchanged.
    Raises YouTubeError when no (preferred) format can be found, for DRM
    content, and for non-streamable live content.
    """
    if not preferred_fmt_ids:
        preferred_fmt_ids, _, _ = formats_dict[22]  # MP4 720p

    duration = None

    vid = get_youtube_id(url)
    if vid is not None:
        # Prefer the modern watch-page endpoint; fall back to the legacy
        # get_video_info endpoint when it fails.
        try:
            old_page, new_page = youtube_get_new_endpoint(vid)
        except YouTubeError as e:
            logger.info(str(e))
            old_page, new_page = youtube_get_old_endpoint(vid)

        def find_urls(old_page, new_page):
            # Generator yielding (itag, [url, duration_ms_or_None]) pairs.
            # streamingData is preferable to url_encoded_fmt_stream_map
            # streamingData.formats are the same as url_encoded_fmt_stream_map
            # streamingData.adaptiveFormats are audio-only and video-only formats

            x = parse_qs(old_page) if old_page else json.loads(new_page)
            player_response = json.loads(x['player_response'][0]) if old_page and 'player_response' in x else x
            error_message = None

            if 'reason' in x:
                # TODO: unknown if this is valid for new_page
                error_message = util.remove_html_tags(x['reason'][0])
            elif 'playabilityStatus' in player_response:
                playabilityStatus = player_response['playabilityStatus']

                if 'reason' in playabilityStatus:
                    error_message = util.remove_html_tags(playabilityStatus['reason'])
                elif 'liveStreamability' in playabilityStatus \
                        and not playabilityStatus['liveStreamability'].get('liveStreamabilityRenderer', {}).get('displayEndscreen', False):
                    # playabilityStatus.liveStreamability -- video is or was a live stream
                    # playabilityStatus.liveStreamability.liveStreamabilityRenderer.displayEndscreen -- video has ended if present

                    if allow_partial and 'streamingData' in player_response and 'hlsManifestUrl' in player_response['streamingData']:
                        # Ongoing live stream: enumerate variant URLs from the
                        # HLS manifest instead of regular formats.
                        r = util.urlopen(player_response['streamingData']['hlsManifestUrl'])
                        if not r.ok:
                            raise YouTubeError('HLS Manifest: %d %s' % (r.status_code, r.reason))
                        manifest = r.text.splitlines()

                        # Non-comment manifest lines are the variant URLs.
                        urls = [line for line in manifest if line[0] != '#']
                        itag_re = re.compile(r'/itag/([0-9]+)/')
                        for url in urls:
                            itag = itag_re.search(url).group(1)
                            yield int(itag), [url, None]
                        return

                    error_message = 'live stream'
            elif 'streamingData' in player_response:
                if 'formats' in player_response['streamingData']:
                    for f in player_response['streamingData']['formats']:
                        if 'url' in f:  # DRM videos store url inside a signatureCipher key
                            yield int(f['itag']), [f['url'], f.get('approxDurationMs')]
                if 'adaptiveFormats' in player_response['streamingData']:
                    for f in player_response['streamingData']['adaptiveFormats']:
                        if 'url' in f:  # DRM videos store url inside a signatureCipher key
                            yield int(f['itag']), [f['url'], f.get('approxDurationMs')]
                return

            if error_message is not None:
                raise YouTubeError(('Cannot stream video: %s' if allow_partial else 'Cannot download video: %s') % error_message)

            # Last resort for the legacy endpoint: the raw
            # url_encoded_fmt_stream_map query parameter.
            if old_page:
                r4 = re.search(r'url_encoded_fmt_stream_map=([^&]+)', old_page)
                if r4 is not None:
                    fmt_url_map = urllib.parse.unquote(r4.group(1))
                    for fmt_url_encoded in fmt_url_map.split(','):
                        video_info = parse_qs(fmt_url_encoded)
                        yield int(video_info['itag'][0]), [video_info['url'][0], None]

        fmt_id_url_map = sorted(find_urls(old_page, new_page), reverse=True)

        if not fmt_id_url_map:
            # Distinguish DRM-protected content (cipher keys present but no
            # plain URLs) from a genuinely empty format list.
            drm = re.search(r'(%22(cipher|signatureCipher)%22%3A|"signatureCipher":)', old_page or new_page)
            if drm is not None:
                raise YouTubeError('Unsupported DRM content')
            raise YouTubeError('No formats found')

        formats_available = set(fmt_id for fmt_id, url in fmt_id_url_map)
        fmt_id_url_map = dict(fmt_id_url_map)

        # Walk the preference list and pick the first available format.
        for id in preferred_fmt_ids:
            if not re.search(r'^[0-9]+$', str(id)):
                # skip non-integer formats 'best', '136+140' or twitch '720p'
                continue
            id = int(id)
            if id in formats_available:
                format = formats_dict.get(id) or hls_formats_dict.get(id)
                if format is not None:
                    _, _, description = format
                else:
                    description = 'Unknown'

                logger.info('Found YouTube format: %s (fmt_id=%d)',
                            description, id)
                url, duration = fmt_id_url_map[id]
                break
        else:
            raise YouTubeError('No preferred formats found')

    return url, duration
|
2008-10-13 15:28:44 +02:00
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2022-02-27 10:52:06 +01:00
|
|
|
@lru_cache(1)
def get_youtube_id(url):
    """Extract the video id from a YouTube video/player URL.

    Tries the watch-page, player and .swf URL shapes in turn; falls back
    to the channel-feed patterns (returning the channel name) and finally
    None when nothing matches.
    """
    video_patterns = (
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/watch\?v=([^&]*)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/v/(.*)[?]',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/v/(.*)\.swf',
    )
    for pattern in video_patterns:
        match = re.compile(pattern, re.IGNORECASE).match(url)
        if match is not None:
            return match.group(1)

    return for_each_feed_pattern(lambda url, channel: channel, url, None)
|
2009-01-26 12:37:21 +01:00
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2009-09-09 19:53:26 +02:00
|
|
|
def is_video_link(url):
    """Return True when *url* can be resolved to a YouTube video id."""
    return (get_youtube_id(url) is not None)
|
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2012-01-09 19:01:45 +01:00
|
|
|
def is_youtube_guid(guid):
    """Return True when *guid* is a YouTube GData video GUID."""
    prefix = 'tag:youtube.com,2008:video:'
    return guid[:len(prefix)] == prefix
|
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2014-09-04 11:21:00 +02:00
|
|
|
def for_each_feed_pattern(func, url, fallback_result):
    """
    Try to find the username for all possible YouTube feed/webpage URLs

    Will call func(url, channel) for each match, and if func() returns
    a result other than None, returns this. If no match is found or
    func() returns None, return fallback_result.
    """
    channel_patterns = (
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/user/([a-z0-9]+)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/profile?user=([a-z0-9]+)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/rss/user/([a-z0-9]+)/videos\.rss',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/channel/([-_a-z0-9]+)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/feeds/videos.xml\?user=([a-z0-9]+)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/feeds/videos.xml\?channel_id=([-_a-z0-9]+)',
        r'http[s]?://gdata.youtube.com/feeds/users/([^/]+)/uploads',
        r'http[s]?://gdata.youtube.com/feeds/base/users/([^/]+)/uploads',
    )

    for pattern in channel_patterns:
        match = re.match(pattern, url, re.IGNORECASE)
        if match is None:
            continue
        result = func(url, match.group(1))
        if result is not None:
            return result

    return fallback_result
|
2008-10-13 15:28:44 +02:00
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2014-09-04 11:21:00 +02:00
|
|
|
def get_real_channel_url(url):
    """Resolve a YouTube channel/user page URL to its GData uploads feed URL.

    Returns *url* unchanged when it matches no known channel pattern.
    """
    def build_feed_url(page_url, channel):
        feed_url = 'https://gdata.youtube.com/feeds/users/{0}/uploads'.format(channel)
        logger.debug('YouTube link resolved: %s => %s', page_url, feed_url)
        return feed_url

    return for_each_feed_pattern(build_feed_url, url, url)
|
2008-10-13 17:07:01 +02:00
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2022-02-27 10:52:06 +01:00
|
|
|
@lru_cache(1)
def get_channel_id_url(url, feed_data=None):
    """Return the canonical https://www.youtube.com/channel/<id> URL.

    *feed_data* may be an already-fetched response object for *url* to
    avoid a second download. Raises a generic Exception when the channel
    id cannot be determined (e.g. non-YouTube URL, network error, or a
    page without channel metadata).
    """
    if 'youtube.com' in url:
        try:
            if feed_data is not None:
                response = feed_data
            else:
                response = util.urlopen(url)
                if not response.ok:
                    raise YouTubeError('Youtube "%s": %d %s' % (url, response.status_code, response.reason))

            # The page may contain corrupt HTML/XML, so try a plain regex
            # search for the meta tag first to avoid a parser exception.
            match = re.search(r'<meta itemprop="channelId" content="([^"]+)">', response.text)
            if match:
                channel_id = match.group(1)
            else:
                # Fall back to parsing the document as an Atom feed.
                tree = xml.etree.ElementTree.parse(io.BytesIO(response.content))
                channel_id = tree.find("{http://www.youtube.com/xml/schemas/2015}channelId").text
            return 'https://www.youtube.com/channel/{}'.format(channel_id)
        except Exception:
            logger.warning('Could not retrieve youtube channel id.', exc_info=True)

    raise Exception('Could not retrieve youtube channel id.')
|
|
|
|
|
2019-08-20 05:53:34 +02:00
|
|
|
|
2022-02-27 10:52:06 +01:00
|
|
|
def get_cover(url, feed_data=None):
    """Return the cover art URL for a YouTube channel, or None on failure.

    Resolves *url* to its channel page (optionally reusing *feed_data*,
    an already-fetched response) and scrapes the thumbnail/avatar image
    URL from the page HTML.
    """
    if 'youtube.com' in url:

        class YouTubeHTMLCoverParser(HTMLParser):
            """This custom html parser searches for the youtube channel thumbnail/avatar"""
            def __init__(self):
                super().__init__()
                # Candidate image URLs, best first.
                self.url = []

            def handle_starttag(self, tag, attributes):
                attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}

                # Look for 900x900px image first.
                if tag == 'link' \
                        and 'rel' in attribute_dict \
                        and attribute_dict['rel'] == 'image_src':
                    self.url.append(attribute_dict['href'])

                # Fallback to image that may only be 100x100px.
                elif tag == 'img' \
                        and 'class' in attribute_dict \
                        and attribute_dict['class'] == "channel-header-profile-image":
                    self.url.append(attribute_dict['src'])

        try:
            channel_url = get_channel_id_url(url, feed_data)
            r = util.urlopen(channel_url)
            if not r.ok:
                raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
            html_data = util.response_text(r)
            parser = YouTubeHTMLCoverParser()
            parser.feed(html_data)
            if parser.url:
                logger.debug('Youtube cover art for {} is: {}'.format(url, parser.url))
                # First candidate is the highest-resolution match found.
                return parser.url[0]

        except Exception:
            # Best effort: cover art is optional, so log and return None.
            logger.warning('Could not retrieve cover art', exc_info=True)
|
2015-05-20 21:10:57 +02:00
|
|
|
|
2021-07-07 03:05:23 +02:00
|
|
|
|
2021-06-18 15:40:26 +02:00
|
|
|
def get_gdpr_consent_url(html_data):
    """
    Creates the URL for automatically accepting GDPR consents

    EU GDPR redirects to a form that needs to be posted to be redirected to a get request
    with the form data as input to the youtube video URL. This extracts that form data from
    the GDPR form and builds up the URL the posted form results.

    Raises YouTubeError when the consent form cannot be parsed or no
    consent form is present in *html_data*.
    """
    class ConsentHTML(HTMLParser):
        # Stateful parser: collects hidden input fields while inside the
        # consent <form> and appends them as query parameters to self.url.
        def __init__(self):
            super().__init__()
            self.url = ''
            # True while we are between the consent form's open/close tags.
            self.consentForm = False

        def handle_starttag(self, tag, attributes):
            attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}
            if tag == 'form' and attribute_dict['action'] == 'https://consent.youtube.com/s':
                self.consentForm = True
                self.url = 'https://consent.google.com/s?'
            # Get GDPR form elements
            if self.consentForm and tag == 'input' and attribute_dict['type'] == 'hidden':
                self.url += '&' + attribute_dict['name'] + '=' + urllib.parse.quote_plus(attribute_dict['value'])

        def handle_endtag(self, tag):
            if tag == 'form':
                self.consentForm = False

    try:
        parser = ConsentHTML()
        parser.feed(html_data)
    except Exception:
        raise YouTubeError('Could not retrieve GDPR accepted consent URL')

    if parser.url:
        logger.debug('YouTube GDPR accept consent URL is: %s', parser.url)
        return parser.url
    else:
        logger.debug('YouTube GDPR accepted consent URL could not be resolved.')
        raise YouTubeError('No acceptable GDPR consent URL')
|
2021-07-07 03:05:23 +02:00
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2022-02-27 10:52:06 +01:00
|
|
|
def get_channel_desc(url, feed_data=None):
    """Return the description text of a YouTube channel.

    Resolves *url* to its channel page (optionally reusing *feed_data*,
    an already-fetched response) and scrapes the description meta tag.
    Returns a translated placeholder when the page has no description,
    and None when the page cannot be retrieved at all.
    """
    if 'youtube.com' in url:

        class YouTubeHTMLDesc(HTMLParser):
            """This custom html parser searches for the YouTube channel description."""
            def __init__(self):
                super().__init__()
                self.description = ''

            def handle_starttag(self, tag, attributes):
                attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}

                # Get YouTube channel description.
                if tag == 'meta' \
                        and 'name' in attribute_dict \
                        and attribute_dict['name'] == "description":
                    self.description = attribute_dict['content']

        try:
            channel_url = get_channel_id_url(url, feed_data)
            r = util.urlopen(channel_url)
            if not r.ok:
                raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
            html_data = util.response_text(r)
            parser = YouTubeHTMLDesc()
            parser.feed(html_data)
            if parser.description:
                logger.debug('YouTube description for %s is: %s', url, parser.description)
                return parser.description
            else:
                logger.debug('YouTube description for %s is not provided.', url)
                return _('No description available')

        except Exception:
            # Best effort: description is optional, so log and return None.
            logger.warning('Could not retrieve YouTube channel description for %s.' % url, exc_info=True)
|
2019-08-20 05:53:34 +02:00
|
|
|
|
2018-02-11 00:22:00 +01:00
|
|
|
|
2018-04-08 23:08:05 +02:00
|
|
|
def parse_youtube_url(url):
    """
    Youtube Channel Links are parsed into youtube feed links
    >>> parse_youtube_url("https://www.youtube.com/channel/CHANNEL_ID")
    'https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID'

    Youtube User Links are parsed into youtube feed links
    >>> parse_youtube_url("https://www.youtube.com/user/USERNAME")
    'https://www.youtube.com/feeds/videos.xml?user=USERNAME'

    Youtube Playlist Links are parsed into youtube feed links
    >>> parse_youtube_url("https://www.youtube.com/playlist?list=PLAYLIST_ID")
    'https://www.youtube.com/feeds/videos.xml?playlist_id=PLAYLIST_ID'

    >>> parse_youtube_url(None) is None
    True

    @param url: the path to the channel, user or playlist
    @return: the feed url if successful or the given url if not
    """
    if url is None:
        return url
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    logger.debug("Analyzing URL: {}".format(" ".join([scheme, netloc, path, query, fragment])))

    if 'youtube.com' in netloc:
        # Already a feed URL? Return it unchanged.
        if path == '/feeds/videos.xml' and re.search(r'^(user|channel|playlist)_id=.*', query):
            return url

        if '/user/' in path or '/channel/' in path or 'list=' in query:
            logger.debug("Valid Youtube URL detected. Parsing...")

            if path.startswith('/user/'):
                user_id = path.split('/')[2]
                query = 'user={user_id}'.format(user_id=user_id)

            if path.startswith('/channel/'):
                channel_id = path.split('/')[2]
                query = 'channel_id={channel_id}'.format(channel_id=channel_id)

            if 'list=' in query:
                # NOTE(review): the [5:] slice assumes the matched query
                # component starts with exactly 'list=' — a parameter merely
                # containing 'list=' (e.g. 'playlist=') would be mis-sliced.
                playlist_query = [query_value for query_value in query.split("&") if 'list=' in query_value][0]
                playlist_id = playlist_query[5:]
                query = 'playlist_id={playlist_id}'.format(playlist_id=playlist_id)

            path = '/feeds/videos.xml'

            new_url = urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))
            logger.debug("New Youtube URL: {}".format(new_url))
            return new_url

        # look for channel URL in page
        new_url = get_channel_id_url(url)
        if new_url:
            logger.debug("New Youtube URL: {}".format(new_url))
            return new_url

    logger.debug("Not a valid Youtube URL: {}".format(url))
    return url
|