Commit bcd33f9118 by KurtBestor, 2021-04-05 23:38:47 +09:00 (parent a69b757610)
41 changed files with 6013 additions and 34 deletions

Binary image file changed: 475 KiB → 974 KiB (preview not shown).

@@ -14,7 +14,18 @@ import math
 import ree as re
 import utils
 from collections import OrderedDict
-_VALID_URL = 'https?://(?:www\\.|bangumi\\.|)bilibili\\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\\d+)/play#)(?P<id>\\d+)'
+_VALID_URL = r'''(?x)
+    https?://
+    (?:(?:www|bangumi)\.)?
+    bilibili\.(?:tv|com)/
+    (?:
+        (?:
+            video/[aA][vV]|
+            anime/(?P<anime_id>\d+)/play\#
+        )(?P<id_bv>\d+)|
+        video/[bB][vV](?P<id>[^/?#&]+)
+    )
+    '''
 _APP_KEY = 'iVGUTjsxvpLeuDCf'
 _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
 RESOLS = OrderedDict()

@@ -0,0 +1,219 @@
# uncompyle6 version 3.5.0
# Python bytecode 2.7 (62211)
# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
# Embedded file name: daumtoon_downloader.pyo
# Compiled at: 2019-10-03 10:11:29
import downloader
from utils import Soup, Session, LazyUrl, Downloader, try_n, get_imgs_already, clean_title, get_print
import json, os
from timee import time, sleep
import ree as re
from translator import tr_
import page_selector
class Page(object):
def __init__(self, id, url, title, serviceType):
self.id = id
self.url = url
self.title = title
self.serviceType = serviceType
class Image(object):
def __init__(self, url, page, p):
self._url = url
self.url = LazyUrl(page.url, self.get, self)
ext = os.path.splitext(url.split('?')[0])[1]
if ext.lower()[1:] not in ('jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp'):
ext = '.jpg'
self.filename = (u'{}/{:04}{}').format(clean_title(page.title), p, ext)
def get(self, _):
return self._url
def get_id(url):
if '/league/' in url:
header = 'league_'
else:
header = ''
body = re.find('/viewer/([0-9a-zA-Z_-]+)', url) or re.find('/view/([0-9a-zA-Z_-]+)', url)
return header, body
def get_info(url, session):
referer = url
header, id = get_id(referer)
if 'league_' in id:
type_ = 'leaguetoon'
else:
type_ = 'webtoon'
info = {}
ids = set()
pages = []
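# Leaguetoon episode lists are paged (page_no=1..10) while regular webtoons return
# everything in the first response, hence the early break for type_ == 'webtoon'.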
for p in range(1, 1+10):
if p == 1:
url = 'http://webtoon.daum.net/data/pc/{}/view/{}?timeStamp={}'.format(type_, id, int(time()))
else:
if type_ == 'webtoon':
break
url = 'http://webtoon.daum.net/data/pc/{}/view/{}?page_no={}&timeStamp={}'.format(type_, id, p, int(time()))
print(url)
info_raw = downloader.read_html(url, referer=referer, session=session)
_info = json.loads(info_raw)
webtoon = _info['data'].get('webtoon') or _info['data'].get('leaguetoon')
if webtoon is None:
raise Exception('No webtoon')
if p == 1:
info['title'] = webtoon['title']
artists = []
for artist in webtoon['cartoon']['artists']:
artist = artist['penName']
if artist in artists:
continue
artists.append(artist)
if len(artists) > 1:
artists = [
artists[1], artists[0]] + artists[2:]
info['artists'] = artists
eps = webtoon.get('webtoonEpisodes') or webtoon.get('leaguetoonEpisodes')
if not eps:
if p > 1:
eps = []
else:
raise Exception('No eps')
c = 0
for ep in eps:
id_ = ep.get('articleId') or ep.get('id')
title = ep['title']
serviceType = 'free' if type_ =='leaguetoon' else ep['serviceType']
if type_ == 'leaguetoon':
url = 'http://webtoon.daum.net/league/viewer/{}'.format(id_)
else:
url = 'http://webtoon.daum.net/webtoon/viewer/{}'.format(id_)
if id_ in ids:
continue
c += 1
ids.add(id_)
page = Page(id_, url, title, serviceType)
pages.append(page)
if c == 0:
print('c == 0; break')
break
info['pages'] = sorted(pages, key=lambda x: x.id)
return info
@Downloader.register
class Downloader_daumtoon(Downloader):
type = 'daumtoon'
URLS = ['webtoon.daum.net']
MAX_CORE = 16
MAX_SPEED = 4.0
display_name = 'Daum Webtoon'
def init(self):
if '/viewer/' in self.url:
return self.Invalid(tr_('목록 주소를 입력해주세요: {}').format(self.url))
if '/view/' not in self.url and not self.url.lower().startswith('http'):
self.url = ('http://webtoon.daum.net/webtoon/view/{}').format(self.url)
self.session = None
self._info = get_info(self.url, self.session)
@property
def name(self):
title = self._info['title']
artists = self._info['artists']
artist = artists[0] if artists else 'N/A'
title = self.format_title('N/A', ''.join(get_id(self.url)), title, artist, 'N/A', 'N/A', 'Korean', prefix='daumtoon_')
return clean_title(title)
def read(self):
self.title = tr_(u'\uc77d\ub294 \uc911... {}').format(self.name)
imgs = get_imgs_all(self._info, self.name, self.session, cw=self.cw)
for img in imgs:
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = self.name
self.session = None
return
def get_imgs(page, session, cw):
print_ = get_print(cw)
html = downloader.read_html(page.url, session=session)
header, id = get_id(page.url)
t = int(time())
soup = Soup(html)
if 'league_' in id:
type_ = 'leaguetoon'
else:
type_ = 'webtoon'
url_data = 'http://webtoon.daum.net/data/pc/{}/viewer/{}?timeStamp={}'.format(type_, id, t)
data_raw = downloader.read_html(url_data, session=session, referer=page.url)
data = json.loads(data_raw)
m_type = data['data']['webtoonEpisode']['multiType']
print_('m_type: {}'.format(m_type))
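# "chatting"-type episodes have no plain image list; switch to the mobile viewer
# endpoint and collect the images embedded in each chat bubble instead.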
if m_type == 'chatting':
page.url = page.url.replace('daum.net/', 'daum.net/m/')
url_data = 'http://webtoon.daum.net/data/mobile/{}/viewer?id={}&{}'.format(type_, id, t)
data_raw = downloader.read_html(url_data, session=session, referer=page.url)
data = json.loads(data_raw)
imgs = []
for chat in data['data']['webtoonEpisodeChattings']:
img = chat.get('image')
if not img:
continue
img = Image(img['url'], page, len(imgs))
imgs.append(img)
else:
url_data = 'http://webtoon.daum.net/data/pc/{}/viewer_images/{}?timeStamp={}'.format(type_, id, t)
data_raw = downloader.read_html(url_data, session=session, referer=page.url)
data = json.loads(data_raw)
imgs = []
for img in data['data']:
img = Image(img['url'], page, len(imgs))
imgs.append(img)
return imgs
def get_imgs_all(info, title, session, cw=None):
pages = info['pages']
pages = page_selector.filter(pages, cw)
imgs = []
for p, page in enumerate(pages):
if page.serviceType != 'free':
continue
imgs_already = get_imgs_already('daumtoon', title, page, cw)
if imgs_already:
imgs += imgs_already
continue
imgs += get_imgs(page, session, cw)
if cw is not None:
cw.setTitle(tr_(u'\uc77d\ub294 \uc911... {} / {} ({}/{})').format(title, page.title, p + 1, len(pages)))
if not cw.alive:
break
return imgs
@page_selector.register('daumtoon')
@try_n(4)
def f(url):
info = get_info(url, None)
return info['pages']

@@ -0,0 +1,101 @@
import downloader
from utils import Soup, try_n, LazyUrl, Downloader, lock, get_print, clean_title
from timee import sleep
import base64
import json
import constants
import ree as re
KEY = b'gefdzfdef'
@Downloader.register
class Downloader_epio(Downloader):
type = 'epio'
URLS = ['epio.app']
def read(self):
info = get_info(self.url, cw=self.cw)
imgs = info['imgs']
for img in imgs:
self.urls.append(img.url)
self.title = clean_title(info['title'])
class Image(object):
def __init__(self, url, referer, p):
self._url = url
self.url = LazyUrl(referer, self.get, self)
ext = '.jpg'#
self.filename = u'{:04}{}'.format(p, ext)
def get(self, referer):
return self._url
def get_info(url, cw=None):
info = _get_info(url, cw)
imgs = []
html = info['content']
soup = Soup(html)
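# The article body references images by their original URLs; downloadable copies
# live behind cdn1-images.epio.app/image/download/<base64 of the original URL>,
# so each src is base64-encoded and rewritten to that endpoint.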
for img in soup.findAll('img'):
src = img.attrs.get('src')
if not src:
continue
# 1696
if not isinstance(src, bytes):
src = src.encode('utf8')
t = base64.b64encode(src)
if isinstance(t, bytes):
t = t.decode('utf8')
src = 'https://cdn1-images.epio.app/image/download/{}'.format(t)
img = Image(src, url, len(imgs))
imgs.append(img)
info['imgs'] = imgs
return info
def get_id(url):
return re.find('article/detail/([0-9a-z]+)', url)
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.backends import default_backend
import aes
backend = default_backend()
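# The API response is an encrypted envelope: the first 16 bytes are treated as a
# salt from which aes.key_and_iv() (assumed to be an EVP_BytesToKey-style KDF with
# the hard-coded KEY) derives the AES key and IV; the remainder is the AES-CBC
# ciphertext. decrypt() zero-pads the input to a 16-byte boundary, decrypts, then
# strips the trailing pad-length byte and the zero padding it added.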
def decrypt(s, cw=None):
print_ = get_print(cw)
key, iv = aes.key_and_iv(s[:16], KEY)
print_('key: {}\niv: {}'.format(key, iv))
cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=backend)
r = -len(s) % 16
if r:
s += b'\x00' * r
dec = cipher.decryptor()
s_dec = dec.update(s[16:]) + dec.finalize()
s_dec = s_dec[:-s_dec[-1]]
if r:
s_dec = s_dec[:-r]
return s_dec
def _get_info(url, cw=None):
id = get_id(url)
url_api = 'https://girlimg.epio.app/api/articles/{}?lang=en-us'.format(id)
html = downloader.read_html(url_api, referer=url)
s = json.loads(html)['string']
s = base64.b64decode(s)
s = decrypt(s, cw)
info = json.loads(s)
return info

@@ -0,0 +1,186 @@
import downloader
import ytdl
from utils import Downloader, Session, try_n, LazyUrl, get_ext, format_filename, clean_title, get_print
from io import BytesIO
import ree as re
from m3u8_tools import playlist2stream, M3u8_stream
import utils
import ffmpeg
@Downloader.register
class Downloader_etc(Downloader):
type = 'etc'
URLS = []
single = True
MAX_PARALLEL = 8
display_name = 'Etc'
def init(self):
self.session = Session()
name = ytdl.get_extractor_name(self.url)
self.print_('extractor: {}'.format(name))
if name == 'generic':
raise NotImplementedError()
def read(self):
video = get_video(self.url, self.session, self.cw)
if video.artist:
self.artist = video.artist
self.urls.append(video.url)
self.print_('url_thumb: {}'.format(video.url_thumb))
self.setIcon(video.thumb)
if video.header.lower() not in ['yourporn', 'spankbang']:
self.enableSegment()#
if isinstance(video.url(), M3u8_stream):
self.disableSegment()
self.title = '[{}] {}'.format(video.header, video.title)
def int_or_none(s):
try:
return int(s)
except:
return None
def format_(f):
if f is None:
return 'None'
return '{} - {} - {} - {}'.format(f['format'], f['_resolution'], f['_audio'], f['url'])
@try_n(4)
def get_video(url, session, cw, ie_key=None):
print_ = get_print(cw)
options = {
'noplaylist': True,
#'extract_flat': True,
'playlistend': 1,
}
ydl = ytdl.YoutubeDL(options)
info = ydl.extract_info(url)
if not ie_key:
ie_key = ytdl.get_extractor_name(url)
info['ie_key'] = ie_key
url_new = info.get('url')
print('url: {} -> {}'.format(url, url_new))
formats = info.get('formats', [])
print(info.keys())
if not formats and (info.get('entries') or 'title' not in info):
if 'entries' in info:
entry = info['entries'][0]
url_new = entry.get('url') or entry['webpage_url']
if url_new != url:
return get_video(url_new, session, cw, ie_key=get_ie_key(info))
session.headers.update(info.get('http_headers', {}))
#session.cookies.update(ydl.cookiejar)
if not formats:
print('no formats')
if url_new:
f = {'url': url_new, 'format': ''}
formats.append(f)
fs = []
for i, f in enumerate(formats):
f['_index'] = i
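# Ranking heuristics: _resolution falls back through vbr -> "<N>p" in the format
# string -> height/width -> 1/0 for "has a video codec", and _audio through
# abr -> asr -> 1/0 for "has an audio codec", so formats stay comparable even
# when extractor metadata is sparse.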
f['_resolution'] = f.get('vbr') or int_or_none(re.find('([0-9]+)p', f['format'], re.IGNORECASE)) or f.get('height') or f.get('width') or int(f.get('vcodec', 'none') != 'none')
f['_audio'] = f.get('abr') or f.get('asr') or int(f.get('acodec', 'none') != 'none')
print_(format_(f))
fs.append(f)
if not fs:
raise Exception('No videos')
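# Pick the format with the highest _resolution (ties broken by original order).
# If it is video-only, also pick the best audio-only format so pp() can merge it
# with ffmpeg after download; failing that, fall back to the best format that
# carries audio at all.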
f = sorted(fs, key=lambda f:(f['_resolution'], f['_index']))[-1]
if f['_audio']:
f_audio = None
else:
fs_audio = sorted([f_audio for f_audio in fs if (not f_audio['_resolution'] and f_audio['_audio'])], key=lambda f:(f['_audio'], f['_index']))
if fs_audio:
f_audio = fs_audio[-1]
else:
try:
f = sorted([f for f in fs if f['_audio']], key=lambda f:(f['_resolution'], f['_index']))[-1]
except IndexError:
pass
f_audio = None
print_('video: {}'.format(format_(f)))
print_('audio: {}'.format(format_(f_audio)))
video = Video(f, f_audio, info, session, url, cw=cw)
return video
def get_ie_key(info):
ie_key = info.get('ie_key') or info['extractor']
ie_key = ie_key.split(':')[0]
if ie_key.lower().endswith('playlist'):
ie_key = ie_key[:-len('playlist')]
return ie_key
class Video(object):
def __init__(self, f, f_audio, info, session, referer, cw=None):
self.f_audio = f_audio
self.cw = cw
self.title = title = info['title']
self.id = info['id']
self.url = f['url']
self.artist = info.get('uploader')
self.header = utils.capitalize(get_ie_key(info))
self.session = session
self.referer = referer
self.url_thumb = info.get('thumbnail')
self.thumb = BytesIO()
if self.url_thumb:
downloader.download(self.url_thumb, referer=referer, buffer=self.thumb, session=session)
try:
ext = downloader.get_ext(self.url, session, referer)
except Exception as e:
print(e)
ext = get_ext(self.url)
if not ext:
print('empty ext')
if f['_resolution']:
ext = '.mp4'
else:
ext = '.mp3'
if ext.lower() == '.m3u8':
try:
url = playlist2stream(self.url, referer, session=session, n_thread=4)
except:
url = M3u8_stream(self.url, referer=referer, session=session, n_thread=4)
ext = '.mp4'
else:
url = self.url
self.url = LazyUrl(referer, lambda x: url, self, pp=self.pp)
self.filename = format_filename(title, self.id, ext, header=self.header)
def pp(self, filename):
if self.cw:
with self.cw.convert(self):
return self._pp(filename)
else:
return self._pp(filename)
def _pp(self, filename):
if self.f_audio:
f = BytesIO()
downloader.download(self.f_audio['url'], buffer=f, referer=self.referer, session=self.session)
ffmpeg.merge(filename, f, cw=self.cw)
return filename

@@ -0,0 +1,260 @@
#coding:utf8
import downloader
from utils import Session, urljoin, Soup, LazyUrl, try_n, Downloader, get_outdir, clean_title
import ree as re
import json
import os
from translator import tr_
from timee import sleep
from downloader import getsize
import errors
PATTERN_CURSOR = '".+?&cursor=([0-9]+)'
UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
class Image(object):
def __init__(self, url):
if 'fbid=' in url:
id = int(re.findall('fbid=([0-9]+)', url)[0])
elif 'photos/' in url:
id = int(url.split('photos/')[1].split('/')[1])
else:
id = int(url)
self.id = id
def f(_):
img = get_img(url)
ext = os.path.splitext(img.split('?')[0])[1]
self.filename = u'{}{}'.format(id, ext)
return img
self.url = LazyUrl(url, f, self)
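# get_img(): the full-size photo URL normally sits in a div's data-full-size-href;
# on pages that hide the viewer markup inside <code> blocks (the "# 1869" case),
# the hidden markup is re-parsed and the first target="_blank" link is used instead.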
@try_n(4)
def get_img(url):
#print('get_img', url)
html = read_html(url)
soup = Soup(html)
for div in soup.findAll('div'):
href = div.attrs.get('data-full-size-href')
if href:
img = href
break
else:
img = None
if img is None:
# 1869
for code in soup.findAll('code'):
code = code.string
hidden = Soup(code)
soup.append(hidden)
for a in soup.findAll('a'):
target = a.attrs.get('target')
if target == '_blank':
img = a.attrs['href']
break
else:
raise Exception('No img')
return img
def suitable(url):
if 'facebook.com' not in url.lower():
return False
if '/videos/' in url or 'video.php?' in url:
return False
return True
@Downloader.register
class Downloader_facebook(Downloader):
type = 'facebook'
URLS = [suitable]
_soup = None
MAX_CORE = 8
@classmethod
def fix_url(cls, url):
if 'facebook.com/' not in url:
url = 'https://facebook.com/{}'.format(url)
url = url.replace('m.facebook.', 'facebook.')
if 'www.facebook.com/' not in url:
url = url.replace('facebook.com/', 'www.facebook.com/', 1)
if '/profile.php?' not in url:
url = url.split('?')[0]
return url.split('#')[0].strip('/')
@property
def username(self):
username = get_username(self.url)
return username
@property
def soup(self):
if self._soup is None:
html = read_html(self.url)
self._soup = Soup(html)
return self._soup
@property
def name(self):
title = get_title(self.soup)
id_ = 'facebook_{}'.format(self.username)
title = u'{} ({})'.format(title, id_)
return clean_title(title)
@property
def album(self):
if 'album_id=' in self.url:
album = re.findall('album_id=([0-9]+)', self.url)[0]
else:
album = None
return album
def read(self):
self.print_(self.name)
self.title = tr_(u'읽는 중... {}').format(self.name)
imgs = get_imgs(self.username, self.name, cw=self.cw)
for img in imgs:
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = self.name
def read_html(url):
return downloader.read_html(url, user_agent=UA)
def get_title(soup):
html = str(soup)
name = re.find(r'"__isProfile":"Page","name":(".*?")', html) or re.find(r'"name":(".*?")', html)
if not name:
gc = soup.find('div', id='globalContainer')
if gc and gc.find('form', id='login_form'):
raise errors.LoginRequired()
raise Exception('no name')
title = json.loads(name)
return title
def get_imgs(username, title, cw=None):
urls = [
'https://m.facebook.com/{}/photos'.format(username),
'https://m.facebook.com/profile.php?id={}&sk=photos'.format(username), # no custom URL
]
for url in urls:
print('get_imgs url:', url)
try:
html = read_html(url)
except:
continue
soup = Soup(html)
if soup.find('a', id='signup-button'):
raise errors.LoginRequired()
photo = soup.find('div', class_='_5v64')
if photo is not None:
break
else:
raise Exception('No photo div')
cursor = photo.a.attrs['href'].split('/photos/')[1].split('/')[1]
print('first cursor:', cursor)
href = re.find(r'(/photos/pandora/\?album_token=.+?)"', html)
href = urljoin(url, href)
href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)
cursors = set([cursor])
imgs = []
dups = {}
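# Index files already present in the output directory by numeric photo id so
# previously downloaded photos can be reused instead of re-fetched.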
dir = os.path.join(get_outdir('facebook'), title)
try:
filenames = os.listdir(dir)
except:
filenames = []
for filename in filenames:
name, ext = os.path.splitext(filename)
if name.isdigit():
dups[int(name)] = os.path.join(dir, filename)
pages = set()
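# Cursor-based pagination over the mobile photos endpoint: each response is
# Facebook's "for (;;);"-prefixed JSON; follow the m_more_photos action, collect
# photo links, and advance the cursor until it disappears or repeats.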
while True:
print(href)
html = read_html(href)
data_raw = html.replace('for (;;);', '')
data = json.loads(data_raw)
actions = data['payload']['actions']
for action in actions:
if action['target'] == 'm_more_photos':
break
else:
print('No more photos')
break
html = action['html']
soup = Soup(html)
photos = soup.findAll('div' ,class_='_5v64')
for photo in photos:
for a in photo.findAll('a'):
page = a.attrs['href']
page = urljoin(href, page)
# remove duplicate pages
if page in pages:
continue
pages.add(page)
img = Image(page)
id = img.id
if id in dups and getsize(dups[id]) > 0:
print('skip', id)
imgs.append(dups[id])
else:
imgs.append(img)
s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
if cw is not None:
cw.setTitle(s)
if not cw.alive:
return []
else:
print(s)
cursor = re.find(PATTERN_CURSOR, data_raw)
#print(cursor)
if cursor is None:
print('no cursor')
break
if cursor in cursors:
print('same cursor')
break
cursors.add(cursor)
href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)
return imgs
def get_username(url):
if '/profile.php?' in url:
id = re.find(r'/profile\.php[\?&]id=([0-9]+)', url)
return id
else:
url = url.replace('facebook.com/pg/', 'facebook.com/')
return url.split('?')[0].split('facebook.com/')[1].split('/')[0]

@@ -0,0 +1,128 @@
#coding: utf-8
import downloader
import flickr_api
from timee import sleep
from utils import Downloader, LazyUrl, query_url, clean_title
import os
from translator import tr_
import ree as re
from datetime import datetime
import flickr_auth
alphabet = '123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ'
base = len(alphabet)
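# Flickr's short-URL base58 alphabet (0, l, I and O omitted); b58encode/b58decode
# convert between a numeric photo id and its flic.kr/p/ short code.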
def b58encode(div, s=''):
if div >= base:
div, mod = divmod(div, base)
return b58encode(div, alphabet[mod] + s)
return alphabet[div] + s
def b58decode(s):
return sum(alphabet.index(c) * pow(base, i) for i, c in enumerate(reversed(s)))
class Image(object):
def __init__(self, photo):
self.photo = photo
self.id = photo.id
self.filename = None
def f(_=None):
url = photo.getPhotoFile()
#url = 'https://flic.kr/p/{}'.format(b58encode(int(photo.id)))
ext = os.path.splitext(url)[1]
date = datetime.fromtimestamp(int(photo.dateuploaded))
date = u'{:02}-{:02}-{:02}'.format(date.year%100, date.month, date.day)
self.filename = u'[{}] {}{}'.format(date, self.id, ext)
return url
self.url = LazyUrl(u'flickr_{}'.format(self.id), f, self)
def find_ps(url):
user = flickr_api.Person.findByUrl(url)
id = re.search('/albums/([0-9]+)', url).groups()[0]
pss = user.getPhotosets()
for ps in pss:
if ps.id == id:
break
else:
raise Exception('Not found photoset id')
return user, ps
@Downloader.register
class Downloader_flickr(Downloader):
type = 'flickr'
URLS = ['flickr.com']
_name = None
def init(self):
if 'flickr.com' in self.url.lower():
self.url = self.url.replace('http://', 'https://')
else:
self.url = 'https://www.flickr.com/people/{}'.format(self.url)
@property
def name(self):
global pss
if self._name is None:
url = self.url
flickr_auth.get_api(url, self.cw)
if '/albums/' in url:
user, ps = find_ps(url)
self._name = u'{} (flickr_album_{}_{})'.format(ps.title, user.id, ps.id)
else:
user = flickr_api.Person.findByUrl(url)
self._name = u'{} (flickr_{})'.format(user.username, user.id)
return clean_title(self._name)
def read(self):
self.title = self.name
imgs = get_imgs(self.url, self.title, cw=self.cw)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
def get_imgs(url, title=None, cw=None):
flickr_auth.get_api(title, cw)
if not flickr_auth.isAuth:
raise Exception('No Auth')
if '/albums/' in url:
user, ps = find_ps(url)
handle = ps
else:
user = flickr_api.Person.findByUrl(url)
handle = user
photos = []
per_page = 500
for page in range(1, 200):
photos_new = handle.getPhotos(per_page=per_page, page=page)
photos += photos_new
if len(photos_new) < per_page:
break
msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(photos))
if cw:
if not cw.alive:
break
cw.setTitle(msg)
else:
print(msg)
imgs = []
for photo in photos:
img = Image(photo)
imgs.append(img)
return imgs

@@ -0,0 +1,131 @@
# uncompyle6 version 3.5.0
# Python bytecode 2.7 (62211)
# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
# Embedded file name: imgur_downloader.pyo
# Compiled at: 2019-10-07 05:58:14
import downloader
from utils import Downloader, Soup, try_n, urljoin, get_max_range, clean_title, cut_pair
import ree as re, json, os
from timee import sleep
from translator import tr_
@Downloader.register
class Downloader_imgur(Downloader):
type = 'imgur'
URLS = ['imgur.com']
MAX_CORE = 16
def init(self):
self.info = get_info(self.url)
@property
def id_(self):
return re.find('imgur.com/.+?/([0-9a-zA-Z]+)', self.url)
@property
def name(self):
title = self.info['title'] or 'N/A'
return clean_title(title, n=100)
def read(self):
imgs = get_imgs(self.url, self.info, self.cw)
for img in imgs:
ext = os.path.splitext(img.split('?')[0])[1]
if len(imgs) > 1:
self.filenames[img] = (u'{:04}{}').format(len(self.urls), ext)
else:
self.filenames[img] = clean_title(self.name, n=-len(ext)) + ext
self.urls.append(img)
self.single = len(imgs) == 1
self.referer = self.url
self.title = u'{} (imgur_{})'.format(self.name, self.id_)
@try_n(4)
def get_info(url):
url = url.replace('/gallery/', '/a/')
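# Two cases: /r/<subreddit> listings are scraped page by page further below, while
# albums first try the legacy inline `image : {...}` JSON embedded in the HTML and
# fall back to the api.imgur.com/post/v1/albums endpoint when that is missing.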
if '/r/' in url and url.split('/r/')[1].strip('/').count('/') == 0:
title = re.find(r'/r/([^/]+)', url)
info = {}
info['title'] = title
info['type'] = 'r'
else:
try: # legacy
html = downloader.read_html(url, cookies={'over18':'1'})
s = re.find('image *: *({.+)', html)
info_raw = cut_pair(s)
except Exception as e: # new
print(e)
id_ = re.find(r'/a/([0-9a-zA-Z_]+)', url) or re.find(r'/r/[0-9a-zA-Z_]+/([0-9a-zA-Z_]+)', url, err='no id')
url_api = 'https://api.imgur.com/post/v1/albums/{}?client_id=546c25a59c58ad7&include=media%2Cadconfig%2Caccount'.format(id_)
info_raw = downloader.read_html(url_api, cookies={'over18':'1'})
info = json.loads(info_raw)
info['type'] = 'a'
return info
def get_imgs(url, info=None, cw=None):
print('get_imgs', url)
if info is None:
info = get_info(url)
imgs = []
# Range
max_pid = get_max_range(cw)
if info['type'] == 'a':
if 'album_images' in info: # legacy
imgs_ = info['album_images']['images']
elif 'media' in info: # new
imgs_ = info['media']
else: # legacy
imgs_ = [info]
for img in imgs_:
img_url = img.get('url') # new
if not img_url: # legacy
hash = img['hash']
ext = img['ext']
img_url = 'https://i.imgur.com/{}{}'.format(hash, ext)
if img_url in imgs:
continue
imgs.append(img_url)
elif info['type'] == 'r':
urls = set()
for p in range(100):
url_api = 'https://imgur.com/r/{}/new/page/{}/hit?scrolled'.format(info['title'], p)
print(url_api)
html = downloader.read_html(url_api, referer=url)
soup = Soup(html)
c = 0
for post in soup.findAll('div', class_='post'):
a = post.find('a', class_='image-list-link')
url_post = urljoin(url, a.attrs['href'])
if url_post in urls:
continue
urls.add(url_post)
c += 1
try: # for r18 images
imgs += get_imgs(url_post)
except Exception as e:
print(e)
s = (u'{} {} ({})').format(tr_(u'\uc77d\ub294 \uc911...'), info['title'], len(imgs))
if cw is not None:
if cw.alive:
cw.setTitle(s)
else:
return []
else:
print(s)
if c == 0:
print('same; break')
break
return imgs

@@ -0,0 +1,579 @@
#coding:utf8
import downloader
from timee import sleep, clock
from constants import clean_url
from utils import Downloader, LazyUrl, urljoin, get_max_range, Soup, Session, update_url_query, get_print, cut_pair, get_ext, clean_title, lazy, try_n, generate_csrf_token, check_alive
import urllib
from error_printer import print_error
import os, requests
from translator import tr_
import json
from datetime import datetime
import hashlib
import ree as re
from ratelimit import limits, sleep_and_retry
import clf2
import errors
FORMAT_PIN = r'/p/([0-9a-zA-Z-_]+)'
def get_session(url, cw=None):
#res = clf2.solve(url, cw=cw)
#return res['session']
session = Session()
sessionid = session.cookies._cookies.get('.instagram.com', {}).get('/',{}).get('sessionid')
if sessionid is None or sessionid.is_expired():
raise errors.LoginRequired()
session.headers['User-Agent'] = downloader.hdr['User-Agent']
if not session.cookies.get('csrftoken', domain='.instagram.com'):
csrf_token = generate_csrf_token()
print('csrf:', csrf_token)
session.cookies.set("csrftoken", csrf_token, domain='.instagram.com')
return session
@Downloader.register
class Downloader_insta(Downloader):
type = 'insta'
URLS = ['instagram.com']
MAX_CORE = 8
display_name = 'Instagram'
def init(self):
self.session = get_session(self.url, self.cw)
if '/p/' in self.url:
self.print_('single post')
elif '/stories/' in self.url:
self.print_('stories')
elif 'instagram.com' in self.url:
self.url = u'https://www.instagram.com/{}'.format(self.username)
@lazy
def username(self):
return get_username(self.url)
@classmethod
def fix_url(cls, url):
if 'instagram.com' not in url:
url = u'https://www.instagram.com/{}'.format(url)
return url.split('?')[0].split('#')[0].strip('/')
@classmethod
def key_id(cls, url):
return url.replace('://www.', '://')
@lazy
def name(self):
return get_name(self.url)
@property
def id_(self):
return u'{} (insta_{})'.format(clean_title(self.name), self.username)
def read(self):
cw = self.cw
title = self.id_
self.title = title
self.artist = self.name
ui_setting = self.ui_setting
if '/p/' in self.url:
self.print_('single')
iter = get_imgs_single(self.url, self.session, cw=cw)
elif '/stories/highlights/' in self.url:
iter = get_stories_single(self.url, session=self.session, cw=cw)
else:
s = ui_setting.instaStories.isChecked()
self.print_('stories: {}'.format(s))
iter = get_imgs_all(self.url, title, session=self.session, cw=cw, d=self, stories=s)
imgs = []
for img in iter:
if cw and not cw.alive:
return
self.urls.append(img.url)
self.title = title
def get_j(script):
s = script.string
if not s:
return
try:
s = s.replace('window._sharedData', '').strip()[1:-1].strip()
j = json.loads(s)
return j
except ValueError as e:
pass
def read_html(url, session, cw):
#res = clf2.solve(url, session=session, cw=cw)#
#return res['html']
return downloader.read_html(url, session=session)
def check_error(soup, cw, wait):
print_ = get_print(cw)
err = soup.find('div', class_='error-container')
if err:
err = err.text.strip()
if wait:
print_('err: {}'.format(err))
sleep(60*30, cw)
else:
raise Exception(err)
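# get_sd() extracts Instagram's embedded JSON state: window._sharedData from the
# page scripts, merged with any window.__additionalDataLoaded(...) payload, and
# raises LoginRequired when a Challenge or LoginAndSignupPage entry appears instead.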
def get_sd(url, session=None, html=None, cw=None, wait=True):
print_ = get_print(cw)
if html:
soup = Soup(html)
check_error(soup, cw, wait)
for script in soup.findAll('script'):
j = get_j(script)
if j:
break
else:
raise Exception('no _sharedData!!')
else:
for try_ in range(4):
_wait(cw)
html = read_html(url, session, cw)
soup = Soup(html)
check_error(soup, cw, wait)
for script in soup.findAll('script'):
j = get_j(script)
if j:
break
else:
continue
break
else:
raise Exception('no _sharedData')
for script in soup.findAll('script'):
s = script.string
if s and 'window.__additionalDataLoaded(' in s:
s = cut_pair(s)
j_add = json.loads(s)
try:
j['entry_data']['PostPage'][0].update(j_add)
except:
j['entry_data']['ProfilePage'][0].update(j_add) #2900
# Challenge
challenge = j['entry_data'].get('Challenge')
if challenge:
for cont in challenge[0]['extraData']['content']:
title = cont.get('title')
if title:
break
else:
title = 'Err'
raise errors.LoginRequired(title)
# LoginAndSignupPage
login = j['entry_data'].get('LoginAndSignupPage')
if login:
raise errors.LoginRequired()
return j
def get_id(url):
j = get_sd(url)
if '/p/' in url:
id = j['entry_data']['PostPage'][0]['graphql']['shortcode_media']['owner']['id']
elif '/stories/' in url:
id = j['entry_data']['StoriesPage'][0]['user']['username'] # ???
else:
id = j['entry_data']['ProfilePage'][0]['graphql']['user']['id']
return id
def get_username(url):
j = get_sd(url, wait=False)
if '/p/' in url:
id = j['entry_data']['PostPage'][0]['graphql']['shortcode_media']['owner']['username']
elif '/stories/' in url:
id = j['entry_data']['StoriesPage'][0]['user']['username']
else:
id = j['entry_data']['ProfilePage'][0]['graphql']['user']['username']
return id
def get_name(url):
j = get_sd(url)
if '/p/' in url:
name = j['entry_data']['PostPage'][0]['graphql']['shortcode_media']['owner']['full_name']
elif '/stories/' in url:
id = get_id(url)
url = 'https://www.instagram.com/{}/'.format(id)
return get_name(url)
else:
name = j['entry_data']['ProfilePage'][0]['graphql']['user']['full_name']
return name
class Image(object):
def __init__(self, url, referer, filename, id=None):
self._url = url
self.url = LazyUrl(referer, self.get, self)
self.filename = filename
self.id = id
def get(self, referer):
wait_download()
return self._url
class Image_lazy(object):
def __init__(self, url, session=None, cw=None):
self.url = url
self.session = session
self.cw = cw
self.url = LazyUrl(url, self.get, self)
@try_n(4)
def get(self, url):
cw = self.cw
if cw and not cw.alive:
raise Exception('cw is dead')
node = Node(url, session=self.session, cw=cw)
img = node.imgs[0]
ext = os.path.splitext(url)[1]
wait_download()
url_img = img.url()
self.filename = img.filename
return url_img
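# Rate limiting: _wait() is throttled to one call per 10 seconds by ratelimit's
# sleep_and_retry/limits decorators and is called before every page/GraphQL request;
# wait_download() is a currently disabled (no-op) hook for throttling downloads.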
@sleep_and_retry
@limits(1, 10)
def _wait(cw=None):
if cw and not cw.alive:
raise Exception('cw is dead while waiting')
##@sleep_and_retry
##@limits(1, 1)
def wait_download():
pass
@try_n(2)
def get_query(query_hash, variables, session, cw=None):
_wait(cw)
print_ = get_print(cw)
csrf_token = session.cookies.get('csrftoken', domain='.instagram.com')
if not csrf_token:
raise Exception('no csrftoken')
hdr = {
"X-CSRFToken" : csrf_token, #2849
"X-IG-App-ID" : "936619743392459",
"X-IG-WWW-Claim" : "0",
"X-Requested-With": "XMLHttpRequest",
}
url_ = update_url_query('https://www.instagram.com/graphql/query/', {'query_hash': query_hash, 'variables': json.dumps(variables)})
#print(len(edges), url_)
r = session.get(url_, headers=hdr)
try:
j = json.loads(r.text)
except Exception as e:
print(e)
j = {}
if not j or j.get('status') == 'fail':
msg = 'Fail: {} {}'.format(j.get('message') or 'Please wait a few minutes before you try again.', variables)
print_(msg)
sleep(60*30, cw)
raise Exception(msg)
return j
def get_imgs(url, n_max=2000, title=None, cw=None, session=None):
print_ = get_print(cw)
for try_ in range(4):
try:
html = read_html(url, session, cw)
m = re.search('"edge_owner_to_timeline_media":{"count":([0-9]+)', html)
if m is None:
raise Exception('Invalid page')
break
except Exception as e:
e_ = e
print_(print_error(e)[0])
else:
raise e_
n = int(m.groups()[0])
n = min(n, n_max)
data = get_sd(url, html=html, cw=cw)
uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
csrf_token = data['config']['csrf_token']#
session.cookies.set(name='ig_pr', value='1', path='/', domain='.instagram.com')
cursor = ''
edges = []
bad = 0
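# Page through the profile feed with the edge_owner_to_timeline_media GraphQL
# query (12 posts per request), following page_info.end_cursor; transient failures
# back off with increasing sleeps and give up after ~10 consecutive errors.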
while True:
check_alive(cw)
variables = {
'id': uploader_id,
'first': 12,
}
if cursor:
variables['after'] = cursor
#print_(variables)#
media = None
try:
j = get_query('003056d32c2554def87228bc3fd9668a', variables, session, cw)
media = j['data']['user']['edge_owner_to_timeline_media']
sleep(2)#
except Exception as e:
if bad > 10:
raise Exception('no media')
else:
print_(u'no media.. retry... ({}) {}'.format(bad+1, print_error(e)[0]))
sleep(12*bad, cw)
bad += 1
continue
bad = 0
edges_new = media.get('edges')
if not edges_new or not isinstance(edges_new, list):
print('no edges_new')
break
edges += edges_new
s = u'{} {} ({}/{})'.format(tr_(u'읽는 중...'), title, len(edges), n)
if cw is not None:
cw.setTitle(s)
if not cw.alive:
return []
else:
print(s)
if len(edges) >= n:
break
page_info = media.get('page_info')
if not page_info:
break
if not page_info.get('has_next_page'):
break
cursor = page_info.get('end_cursor')
if not cursor:
break
if len(edges) <= n/2:
raise Exception(u'Too short: {} / {}'.format(len(edges), n))
imgs = []
for edge in edges:
node = edge['node']
type = node['__typename']
id = node['shortcode']
url = u'https://www.instagram.com/p/{}/'.format(id)
## if type in ['GraphVideo', 'GraphImage']:
## single = True
## else:
## single = False
for img in Node(url, session=session, cw=cw, media=node).imgs:
imgs.append(img)
if len(imgs) >= n_max:
break
return imgs
class Node(object):
def __init__(self, url, format=u'[%y-%m-%d] id_ppage', session=None, cw=None, media=None):
print('Node', url)
print_ = get_print(cw)
self.id = re.search(FORMAT_PIN, url).groups()[0]
self.imgs = []
self.session = session
if not media:
if False: # Original
j = get_sd(url, self.session, cw=cw)
data = j['entry_data']['PostPage'][0]['graphql']
else:
variables = {
"shortcode" : self.id,
"child_comment_count" : 3,
"fetch_comment_count" : 40,
"parent_comment_count" : 24,
"has_threaded_comments": True,
}
j = get_query('a9441f24ac73000fa17fe6e6da11d59d', variables, session, cw)
data = j['data']
media = data['shortcode_media']
if 'video_url' in media:
urls = [
media['video_url']]
elif 'edge_sidecar_to_children' in media:
edges = media['edge_sidecar_to_children']['edges']
urls = []
for edge in edges:
node = edge['node']
if 'video_url' in node:
url_ = node['video_url']
else:
url_ = node['display_resources'][(-1)]['src']
urls.append(url_)
else:
urls = [media['display_resources'][(-1)]['src']]
time = media['taken_at_timestamp']
self.date = datetime.fromtimestamp(time)
self.timeStamp = self.date.strftime(format).replace(':', u'\uff1a')
for p, img in enumerate(urls):
ext = os.path.splitext(img.split('?')[0].split('#')[0])[1]
filename = ('{}{}').format(self.timeStamp, ext).replace('id', str(self.id)).replace('page', str(p))
img = Image(img, url, filename)
self.imgs.append(img)
def get_imgs_all(url, title=None, cw=None, d=None, session=None, stories=True):
max_pid = get_max_range(cw)
url = clean_url(url)
if stories:
imgs_str = get_stories(url, title, cw=cw, session=session)
else:
imgs_str = []
max_pid = max(0, max_pid - len(imgs_str))
imgs = get_imgs(url, max_pid, title=title, cw=cw, session=session)
return imgs_str + imgs[:max_pid]
def get_imgs_single(url, session=None, cw=None):
node = Node(url, session=session, cw=cw)
return node.imgs
def get_stories(url, title=None, cw=None, session=None):
print_ = get_print(cw)
html = downloader.read_html(url, session=session)
data = get_sd(url, html=html, cw=cw)
uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
csrf_token = data['config']['csrf_token']#
session.cookies.set(name='ig_pr', value='1', path='/', domain='.instagram.com')
print('uploader_id:', uploader_id)
variables = {
'user_id': uploader_id,
'include_chaining': True,
'include_reel': True,
'include_suggested_users': False,
'include_logged_out_extras': False,
'include_highlight_reels': True,
'include_live_status': True,
}
j = get_query('d4d88dc1500312af6f937f7b804c68c3', variables, session, cw)
imgs = []
ids = set()
data = j['data']
hs = data['user']['edge_highlight_reels']
edges = hs['edges']
edges.insert(0, str(uploader_id))
for i, edge in enumerate(edges):
if isinstance(edge, str):
id = edge
hid = None
url_str = url
else:
id = None
hid = edge['node']['id']
url_str = 'https://www.instagram.com/stories/highlights/{}/'.format(hid)
try:
imgs_new = get_stories_single(url_str, id=id, cw=cw, session=session)
for img in imgs_new:
if img.id in ids:
print('duplicate: {}'.format(img.id))
continue
ids.add(img.id)
imgs.append(img)
print_('stories: {}'.format(hid))
except Exception as e:
print_(u'Failed to get stories: {}'.format(hid))
print(e)
msg = u'{} {} ({}/{})'.format(tr_(u'스토리 읽는 중...'), title, i+1, len(edges))
if cw:
if not cw.alive:
return
cw.setTitle(msg)
else:
print(msg)
imgs = sort_str(imgs)
return imgs
def sort_str(imgs):
imgs = sorted(imgs, key=lambda img: int(img.id), reverse=True)
return imgs
def get_stories_single(url, id=None, cw=None, session=None):
j = get_sd(url, session=session, cw=cw)
hid = re.find('/stories/highlights/([0-9]+)', url)
reel_ids = []
highlight_reel_ids = []
if hid is None:
if id is None:
id = get_id(url) # ???
reel_ids.append(str(id))
else:
highlight_reel_ids.append(str(hid))
print(id, hid)
variables = {
"reel_ids":reel_ids,
"tag_names":[],
"location_ids":[],
"highlight_reel_ids":highlight_reel_ids,
"precomposed_overlay":False,
"show_story_viewer_list":True,
"story_viewer_fetch_count":50,
"story_viewer_cursor":"",
"stories_video_dash_manifest":False
}
print(variables)
j = get_query('f5dc1457da7a4d3f88762dae127e0238', variables, session, cw)
data = j['data']
m = data['reels_media'][0]
items = m['items']
if not items:
raise Exception('no items')
imgs = []
for item in items:
id = item['id']
rs = item.get('video_resources') or item['display_resources']
r = rs[-1]
src = r['src']
ext = get_ext(src)
filename = u'stories_{}{}'.format(id, ext)
img = Image(src, url, filename, id=id)
imgs.append(img)
imgs = sort_str(imgs)
return imgs

@@ -93,7 +93,13 @@ class Downloader_iwara(Downloader):
 def read_channel(url, type_, cw=None):
     print_ = get_print(cw)
-    username = re.find(r'/users/([^/]+)', url, err='no username')
+    html = downloader.read_html(url)
+    soup = Soup(html)
+    if soup.find('div', id='block-mainblocks-user-connect'):
+        username = re.find(r'''/messages/new\?user=(.+)['"]''', html, err='no username')
+    else:
+        username = re.find(r'/users/([^/]+)', url, err='no username')
+    print_('username: {}'.format(username))
     info = {}
     urls = []
     urls_set = set()

@@ -0,0 +1,79 @@
import downloader
from utils import Downloader, Soup, get_print, json_loads, compatstr, LazyUrl, format_filename, clean_title
import devtools
import js2py
import ree as re
from m3u8_tools import playlist2stream
from io import BytesIO
@Downloader.register
class Downloader_javfinder(Downloader):
type = 'javfinder'
URLS = ['javfinder.la']
single = True
display_name = 'JavFinder'
def read(self):
video = Video(self.url, cw=self.cw)
self.urls.append(video.url)
self.setIcon(video.thumb)
self.title = video.title
class Video(object):
def __init__(self, url, cw=None):
info = solve(url, cw=cw)
url_video = info['file']
stream = playlist2stream(url_video, n_thread=4)
self.url = LazyUrl(url, lambda x: stream, self)
self.title = info['title']
id = info['id']
self.filename = format_filename(self.title, id, '.mp4')
self.thumb = BytesIO()
downloader.download(info['url_thumb'], buffer=self.thumb)
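# solve() watches the page's network traffic (devtools.watch_network) to find the
# streamsb.net embed, then unpacks the player's eval(function(p,a,c,k,e,d)...)
# script with js2py to recover the HLS `sources` list.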
def solve(url, cw=None):
print_ = get_print(cw)
info = {}
res = devtools.watch_network(url, cw=cw)
#html = res['html']
html = downloader.read_html(url) # ???
soup = Soup(html)
info['title'] = soup.find('h1').text.strip()
info['url_thumb'] = soup.find('meta', {'property': 'og:image'})['content'].strip()
for r in res['rs']:
url_player = r.url()
if 'streamsb.net/embed-' in url_player:
break
else:
raise Exception('no player')
print_('player: {}'.format(url_player))
info['id'] = ''#
html = downloader.read_html(url_player, url)
soup = Soup(html)
for script in soup.findAll('script'):
script = script.string or ''
if 'function(p,a,c,k,e,d)' in script:
break
else:
raise Exception('no function(p,a,c,k,e,d)')
js = script.strip()[5:-1].replace('function(p,a,c,k,e,d)', 'function hack(p,a,c,k,e,d)').replace('return p}', 'return p};hack')
context = js2py.EvalJs()
t = context.eval(js)
sources = re.find(r'sources *: *(\[\{.+?\}\])', t, err='no sources')
sources = json_loads(sources)
info['file'] = sources[0]['file']
return info

@@ -0,0 +1,207 @@
import downloader
from utils import Soup, urljoin, Downloader, fix_title, Session, get_print, LazyUrl, clean_title, get_imgs_already
import ree as re
from timee import sleep
from translator import tr_
import os
from constants import try_n, clean_url
import urllib, page_selector
import bs4
PATTERN = r'jmana[0-9]*.*/(comic_list_title|book)\?book'
PATTERN_ALL = r'jmana[0-9]*.*/(comic_list_title|book|bookdetail)\?book'
PATTERN_ID = '[?&]bookdetailid=([0-9]+)'
class Image(object):
def __init__(self, url, page, p):
self.url = LazyUrl(page.url, lambda _: url, self)
ext = '.jpg'
name = (u'{:04}{}').format(p, ext)
self.filename = (u'{}/{}').format(page.title, name)
class Page(object):
def __init__(self, title, url):
self.title = clean_title(title)
self.url = url
self.id = int(re.find(PATTERN_ID, url))
@Downloader.register
class Downloader_jmana(Downloader):
type = 'jmana'
URLS = ['regex:'+PATTERN_ALL]
MAX_CORE = 8
_soup = None
def init(self):
self.url = clean_url(self.url)
self.session = Session()
if re.search(PATTERN_ID, self.url): #1799
select = self.soup.find('select', class_='bookselect')
for i, op in enumerate(select.findAll('option')[::-1]):
if 'selected' in op.attrs:
break
else:
raise Exception('no selected option')
for a in self.soup.findAll('a'):
url = urljoin(self.url, a.get('href') or '')
if re.search(PATTERN, url):
break
else:
raise Exception('list not found')
self.url = self.fix_url(url)
self._soup = None
for i, page in enumerate(get_pages(self.url, self.session, self.soup)):
if page.id == int(op['value']):
break
else:
raise Exception('can not find page')
self.cw.range_p = [i]
@classmethod
def fix_url(cls, url):
return url
@property
def soup(self):
if self._soup is None:
html = downloader.read_html(self.url, session=self.session)
soup = Soup(html)
self._soup = soup
return self._soup
@property
def name(self):
title = get_title(self.soup)
artist = get_artist(self.soup)
title = fix_title(self, title, artist)
return title
def read(self):
title = self.name
artist = get_artist(self.soup)
self.artist = artist
for img in get_imgs(self.url, title, self.session, soup=self.soup, cw=self.cw):
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = self.name
def get_title(soup):
a = soup.find('a', class_='tit')
if a:
return a.text.strip()
return re.find(r'제목 *: *(.+)', soup.text, err='no title')
def get_artist(soup):
return re.find(r'작가 *: *(.+)', soup.text, default='').strip() or 'N/A'
@try_n(4, sleep=60)
def get_imgs_page(page, referer, session, cw=None):
print_ = get_print(cw)
sleep(5, cw) #2017
html = downloader.read_html(page.url, referer, session=session)
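# The viewer page lists the positions of injected non-comic images in a JS variable
# `inserted`; collect those indices so the matching <img> tags are skipped below.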
inserted = re.find(r'''var *inserted *= *['"](.*?)['"]''', html)
print_('inserted: {}'.format(inserted))
inserted = set(int(i) for i in inserted.split(',')) if inserted else set()
soup = Soup(html)
view = soup.find(class_='pdf-wrap')
imgs = []
for i, img in enumerate(child for child in view.children if isinstance(child, bs4.element.Tag)):
src = img.get('data-src') or img.get('src') or ''
if i in inserted:
print_('remove: {}'.format(src))
continue
if not src:
continue
src = urljoin(page.url, src.strip())
if '/adimg/' in src:
print('adimg:', src)
continue
if '/notice' in src:
print('notice:', src)
continue
img = Image(src, page, len(imgs))
imgs.append(img)
return imgs
def get_pages(url, session=None, soup=None):
if soup is None:
html = downloader.read_html(url, session=session)
soup = Soup(html)
pages = []
for inner in soup.findAll('div', class_='inner'):
a = inner.find('a')
if not a:
continue
href = a.attrs.get('href', '')
if not re.search(PATTERN_ID, href):
continue
if a.find('img'):
print('skip img', a.attrs.get('href'))
continue
href = urljoin(url, href)
title_page = a.text
page = Page(title_page, href)
pages.append(page)
pages = list(reversed(pages))
return pages
@page_selector.register('jmana')
@try_n(4)
def f(url):
if re.search(PATTERN_ID, url):
raise Exception(tr_(u'목록 주소를 입력해주세요'))
session = Session()
pages = get_pages(url, session=session)
return pages
def get_imgs(url, title, session, soup=None, cw=None):
print_ = get_print(cw)
if soup is None:
html = downloader.read_html(url, session=session)
soup = Soup(html)
pages = get_pages(url, soup=soup)
print_('pages: {}'.format(len(pages)))
pages = page_selector.filter(pages, cw)
imgs = []
for i, page in enumerate(pages):
imgs_already = get_imgs_already('jmana', title, page, cw)
if imgs_already:
imgs += imgs_already
continue
imgs += get_imgs_page(page, url, session, cw)
if cw is not None:
if not cw.alive:
return
cw.setTitle((u'{} {} / {} ({} / {})').format(tr_(u'\uc77d\ub294 \uc911...'), title, page.title, i + 1, len(pages)))
if not imgs:
raise Exception('no imgs')
return imgs

@@ -0,0 +1,192 @@
import downloader
import ree as re
from utils import Session, LazyUrl, Soup, Downloader, try_n, get_print, clean_title, print_error, urljoin
from time import sleep
from translator import tr_
import page_selector
import json
UA = downloader.hdr['User-Agent']
class Page(object):
def __init__(self, id_, title):
self.id_ = id_
self.title = title
self.url = 'https://page.kakao.com/viewer?productId={}'.format(id_)
class Image(object):
def __init__(self, url, page, p):
self.url = LazyUrl('https://page.kakao.com/', lambda _: url, self)
ext = '.jpg'
self.filename = '{}/{:04}{}'.format(clean_title(page.title), p, ext)
@Downloader.register
class Downloader_kakaopage(Downloader):
type = 'kakaopage'
URLS = ['page.kakao.com/home']
MAX_CORE = 8
MAX_SPEED = 4.0
display_name = 'KakaoPage'
def init(self):
self.session = Session()
self.session.headers['User-Agent'] = UA
@classmethod
def fix_url(cls, url):
id = re.find('/home/.+?/([0-9]+)', url)
if id is not None:
url = id
if url.isdecimal():
url = 'https://page.kakao.com/home?seriesId={}'.format(url)
return url
def read(self):
info = get_info(self.url, self.session, cw=self.cw)
for img in info['imgs']:
self.urls.append(img.url)
self.artist = info['artist']
self.title = clean_title('[{}] {}'.format(info['artist'], info['title']))
def get_id(url):
id_ = re.find('seriesId=([0-9]+)', url, err='No seriesId')
return id_
def get_pages(url, session):
id_ = get_id(url)
pages = []
ids = set()
for p in range(100):
url_api = 'https://api2-page.kakao.com/api/v5/store/singles'
data = {
'seriesid': id_,
'page': str(p),
'direction': 'asc',
'page_size': '20',
'without_hidden': 'true',
}
r = session.post(url_api, data=data, headers={'Referer': url})
print(p, r)
data = r.json()
singles = data['singles']
if not singles:
print('no singles')
break
for single in singles:
title_page = single['title']
id_page = single['id']
if id_page in ids:
print('dup id')
continue
ids.add(id_page)
page = Page(id_page, title_page)
pages.append(page)
sleep(.5)
return pages
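# Each episode's images come from the inven/get_download_data/web API: the viewer
# page embeds a device id ("did") that must be POSTed back with the productId, and
# the returned secureUrl paths are resolved against page-edge-jz.kakao.com.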
@try_n(2)
def get_imgs_page(page, session):
html = downloader.read_html(page.url, session=session)
did = re.find('"did" *: *"(.+?)"', html, err='no did')
url_api = 'https://api2-page.kakao.com/api/v1/inven/get_download_data/web'
data = {
'productId': page.id_,
'device_mgr_uid': 'Windows - Chrome',
'device_model': 'Windows - Chrome',
'deviceId': did,
}
print(data)
r = session.post(url_api, data=data, headers={'Referer': page.url})
data = r.json()
if data['result_code']:
raise Exception(data['message'])
imgs = []
for file in data['downloadData']['members']['files']:
url = file['secureUrl']
url = urljoin('https://page-edge-jz.kakao.com/sdownload/resource/', url)
img = Image(url, page, len(imgs))
imgs.append(img)
return imgs
def get_info(url, session, cw=None):
print_ = get_print(cw)
pages = get_pages(url, session)
pages = page_selector.filter(pages, cw)
if not pages:
raise Exception('no pages')
info = {}
html = downloader.read_html(url, session=session)
soup = Soup(html)
__NEXT_DATA__ = soup.find('script', id='__NEXT_DATA__')
if __NEXT_DATA__:
data = json.loads(__NEXT_DATA__.string)
tid = data['props']['initialState']['common']['constant']['tid']
print_('tid: {}'.format(tid))
session.cookies['_kptid'] = tid
html = downloader.read_html(url, session=session)
soup = Soup(html)
title = soup.find('h2').text.strip()
info['title'] = title
artist = soup.find('meta', {'name': 'author'})['content']
for x in [' ,', ', ']:
while x in artist:
artist = artist.replace(x, ',')
artist = artist.replace(',', ', ')
info['artist'] = artist
imgs = []
for i, page in enumerate(pages):
if cw is not None:
if not cw.alive:
return
cw.setTitle('{} {} / {} ({} / {})'.format(tr_('읽는 중...'), title, page.title, i + 1, len(pages)))
try:
_imgs = get_imgs_page(page, session)
e_msg = None
except Exception as e:
_imgs = []
e_msg = print_error(e)[0]
print_('{} {}'.format(page.title, len(_imgs)))
if e_msg:
print_(e_msg)
imgs += _imgs
sleep(.2)
if not imgs:
raise Exception('no imgs')
info['imgs'] = imgs
return info
@page_selector.register('kakaopage')
@try_n(4)
def f(url):
if 'seriesId=' not in url:
raise Exception(tr_('목록 주소를 입력해주세요'))
pages = get_pages(url, Session())
return pages

@@ -0,0 +1,55 @@
import downloader
import ytdl
from utils import Downloader, try_n, LazyUrl, get_ext, format_filename
from io import BytesIO as IO
from m3u8_tools import M3u8_stream
@Downloader.register
class Downloader_vlive(Downloader):
type = 'kakaotv'
URLS = ['tv.kakao']
single = True
display_name = 'KakaoTV'
@classmethod
def fix_url(cls, url):
return url.split('?')[0].strip('/')
def read(self):
video = Video(self.url)
video.url()#
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
class Video(object):
_url = None
def __init__(self, url):
self.url = LazyUrl(url, self.get, self)
@try_n(2)
def get(self, url):
if self._url:
return self._url
ydl = ytdl.YoutubeDL()
info = ydl.extract_info(url)
fs = [f for f in info['formats'] if f['ext'] == 'mp4']
f = sorted(fs, key=lambda f: f['height'])[-1]
self._url = f['url']
self.thumb_url = info['thumbnails'][0]['url']
self.thumb = IO()
downloader.download(self.thumb_url, buffer=self.thumb)
self.title = info['title']
ext = get_ext(self._url)
self.filename = format_filename(self.title, info['id'], ext)
return self._url

@@ -0,0 +1,72 @@
import downloader
from utils import Soup, urljoin, Downloader, LazyUrl, Session, try_n, format_filename, clean_title
from timee import sleep
import ree as re
from io import BytesIO
import clf2
@Downloader.register
class Downloader_kissjav(Downloader):
type = 'kissjav'
URLS = ['kissjav.com']
single = True
display_name = 'KissJAV'
def read(self):
video = get_video(self.url)
self.urls.append(video.url)
self.setIcon(video.thumb)
self.session = get_session(self.url, cw=self.cw)
self.enableSegment(1024*1024//2)
self.title = video.title
def get_video(url):
html = downloader.read_html(url)
soup = Soup(html)
view = soup.find('div', id='player-container-fluid')
src_best = None
res_best = -1
for source in view.findAll('source'):
src = urljoin(url, source.attrs['src'])
res = re.find('([0-9]+)p', source.attrs['title'])
res = int(res) if res else 0
if res > res_best:
src_best = src
res_best = res
if src_best is None:
raise Exception('No source')
title = soup.find('h1').text.strip()
id = soup.find('div', id='video').attrs['data-id']
url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
#src_best = downloader.real_url(src_best)
video = Video(src_best, url_thumb, url, title, id)
return video
class Video(object):
def __init__(self, url, url_thumb, referer, title, id):
self.title = title
self.filename = format_filename(title, id, '.mp4')
self.url = LazyUrl(referer, lambda x: url, self)
self.thumb = BytesIO()
self.url_thumb = url_thumb
downloader.download(url_thumb, buffer=self.thumb)
@try_n(2)
def get_session(url, cw=None):
session = Session()
clf2.solve(url, session=session, cw=cw)
return session

@@ -0,0 +1,165 @@
#coding:utf8
import downloader
from utils import Soup, urljoin, LazyUrl, Downloader, try_n, Session, clean_title, get_print
import os
from translator import tr_
import page_selector
import clf2
import utils
import base64
from image_reader import QPixmap
class Image(object):
def __init__(self, url, page, p):
self._url = url
self.url = LazyUrl(page.url, self.get, self)#, pp=self.pp)
ext = os.path.splitext(url)[1]
if ext.lower()[1:] not in ['jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp']:
ext = '.jpg'
self.filename = u'{}/{:04}{}'.format(page.title, p, ext)
def get(self, _):
return self._url
## def pp(self, filename):
## pixmap = QPixmap(filename)
## pixmap.save(filename)
## return filename
class Page(object):
def __init__(self, title, url):
self.title = clean_title(title)
self.url = url
@Downloader.register
class Downloader_lhscan(Downloader):
type = 'lhscan'
URLS = ['lhscan.net', 'loveheaven.net', 'lovehug.net']
MAX_CORE = 16
display_name = 'LHScan'
_soup = None
def init(self):
self.url = self.url.replace('lhscan.net', 'loveheaven.net')
self.session = Session()
#clf2.solve(self.url, session=self.session, cw=self.cw)
soup = self.soup
if not soup.find('ul', class_='manga-info'):
self.Invalid(u'{}: {}'.format(tr_(u'목록 주소를 입력해주세요'), self.url))
@property
def soup(self):
if self._soup is None:
for try_ in range(8):
try:
html = downloader.read_html(self.url, session=self.session)
break
except Exception as e:
print(e)
else:
raise
self._soup = Soup(html)
return self._soup
@property
def name(self):
title = self.soup.findAll('span', {'itemprop': 'name'})[-1].text.strip()
return clean_title(title)
def read(self):
self.title = tr_(u'읽는 중... {}').format(self.name)
imgs = get_imgs(self.url, self.name, self.session, self.soup, self.cw)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
@try_n(8)
def get_imgs_page(page, session, cw=None):
print_ = get_print(cw)
print_(page.title)
html = downloader.read_html(page.url, session=session)
soup = Soup(html)
view = soup.find('div', class_='chapter-content')
if not view:
raise Exception('no chapter-content')
imgs = []
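# Image URLs may sit in one of several lazy-load attributes and are sometimes
# base64-encoded; decode when possible and drop the site's credit/watermark images.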
for img in soup.findAll('img', class_='chapter-img'):
src = img.get('data-pagespeed-lazy-src') or img.get('data-src') or img.get('data-srcset') or img.get('data-aload') or img['src']
try:
src = base64.b64decode(src).strip().decode('utf8')
except:
pass
src = urljoin(page.url, src)
if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
continue
if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
continue
if 'LoveHug_600cfd96e98ff.jpg' in src:
continue
img = Image(src.strip(), page, len(imgs))
imgs.append(img)
return imgs
def get_pages(url, session, soup=None, cw=None):
if soup is None:
html = downloader.read_html(url, session=session)
soup = Soup(html)
tab = soup.find('ul', class_='list-chapters')
pages = []
for li in tab.findAll('li'):
text = li.find('div', class_='chapter-name').text.strip()
href = li.parent['href']
href = urljoin(url, href)
page = Page(text, href)
pages.append(page)
if not pages:
raise Exception('no pages')
return pages[::-1]
@page_selector.register('lhscan')
@try_n(4)
def f(url):
session = Session()
#clf2.solve(url, session=session)
pages = get_pages(url, session)
return pages
@try_n(2)
def get_imgs(url, title, session, soup=None, cw=None):
if soup is None:
html = downloader.read_html(url, session=session)
soup = Soup(html)
pages = get_pages(url, session, soup, cw)
pages = page_selector.filter(pages, cw)
imgs = []
for i, page in enumerate(pages):
imgs += get_imgs_page(page, session, cw)
s = u'{} {} / {} ({} / {})'.format(tr_(u'읽는 중...'), title, page.title, i+1, len(pages))
if cw is not None:
if not cw.alive:
return
cw.setTitle(s)
else:
print(s)
return imgs

@@ -0,0 +1,119 @@
import downloader
from utils import Session, Downloader, get_ext, LazyUrl, get_print
import ree as re
import json
from io import BytesIO
from translator import tr_
@Downloader.register
class Downloader_likee(Downloader):
type = 'likee'
URLS = ['likee.video']
single = True
display_name = 'Likee'
def init(self):
self.session = Session()
def read(self):
info = get_info(self.url, self.session, self.cw)
self.print_('type: {}'.format(info['type']))
self.artist = info['artist']
if info['type'] != 'single':
video = self.process_playlist(info['title'], info['videos'])
else:
video = info['videos'][0]
video.url()
self.urls.append(video.url)
self.title = info['title']
thumb = BytesIO()
downloader.download(video.url_thumb, referer=self.url, buffer=thumb)
self.setIcon(thumb)
def get_info(url, session, cw=None):
print_ = get_print(cw)
info = {}
info['videos'] = []
if '/video/' in url:
info['type'] = 'single'
video = Video(url, session)
video.url()
info['videos'].append(video)
info['title'] = video.id_
info['artist'] = video.artist
return info
info['type'] = 'channel'
html = downloader.read_html(url, session=session)
data_raw = html.split('window.data = ')[1].split('};')[0]+'}'
data = json.loads(data_raw)
info['uid'] = data['userinfo']['uid']
info['username'] = data['userinfo']['yyuid']
info['artist'] = data['userinfo']['nick_name']
info['title'] = '{} (likee_{})'.format(info['artist'], info['username'])
lastPostId = ''
urls = set()
while True:
url_api = 'https://likee.video/official_website/VideoApi/getUserVideo'
r = session.post(url_api, data={'uid': info['uid'], 'count': '30', 'lastPostId': lastPostId})
data = json.loads(r.text)
videos = data['data']['videoList']
if not videos:
break
for data in videos:
url_post = 'https://likee.video/@{}/video/{}'.format(data['likeeId'], data['postId'])
if url_post in urls:
print_('duplicate: {}'.format(url_post))
continue
urls.add(url_post)
video = Video(url_post, session, data)
video.url()
info['videos'].append(video)
lastPostId = data['postId']
msg = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(info['videos']))
if cw:
if not cw.alive:
return
cw.setTitle(msg)
else:
print(msg)
return info
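# Channel crawling above pages through a user's uploads by POSTing the last seen
# postId to the getUserVideo endpoint; an empty videoList ends the loop and
# duplicate post URLs are skipped. The response is assumed to look roughly like:
#   {"data": {"videoList": [{"postId": "...", "likeeId": "...", "videoUrl": "...", ...}]}}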
class Video(object):
def __init__(self, url, session, data=None):
self.id_ = re.find('/video/([0-9]+)', url, err='no id')
self._session = session
self._data = data
self.url = LazyUrl(url, self.get, self)
def get(self, url):
if self._data:
video = self._data
else:
url_api = 'https://likee.video/official_website/VideoApi/getVideoInfo'
r = self._session.post(url_api, data={'postIds': str(self.id_)})
data = json.loads(r.text)
video = data['data']['videoList'][0]
url_video = video['videoUrl']
self.url_thumb = video['coverUrl']
self.artist = video['nickname']
ext = get_ext(url_video)
self.title = self.id_
self.filename = '{}{}'.format(self.id_, ext)
return url_video

View File

@ -0,0 +1,145 @@
#coding:utf8
import downloader
from utils import Soup, Downloader, LazyUrl, urljoin, try_n, get_outdir, clean_title
import ree as re
import os
from timee import sleep
from translator import tr_
from io import BytesIO
import json
class Image(object):
def __init__(self, item, referer):
self.item = item
self.id = str(item['id'])
self.referer = referer
self.url = LazyUrl(referer, self.get, self)
def get(self, url):
img = urljoin(url, self.item['url_to_original'])
ext = os.path.splitext(img.split('?')[0])[1]
self.filename = u'{}{}'.format(self.id, ext)
return img
class Video(object):
def __init__(self, url, title, url_thumb):
self.url = url
self.title = title
ext = os.path.splitext(url.split('?')[0])[1]
self.filename = u'{}{}'.format(clean_title(title), ext)
self.url_thumb = url_thumb
self.thumb = BytesIO()
downloader.download(self.url_thumb, buffer=self.thumb)
@Downloader.register
class Downloader_luscious(Downloader):
type = 'luscious'
URLS = ['luscious.net']
MAX_CORE = 4
@classmethod
def fix_url(cls, url):
url = url.replace('members.luscious.', 'www.luscious.')
return url
def read(self):
url = fix_url(self.url)
for try_ in range(8):
try:
html = downloader.read_html(url)
break
except Exception as e:
print(e)
self.print_('retry...')
else:
raise
soup = Soup(html)
title = clean_title(get_title(soup))
self.title = tr_(u'읽는 중... {}').format(title)
if '/videos/' in url:
video = get_video(url, soup)
imgs = [video]
self.setIcon(video.thumb)
else:
imgs = get_imgs(url, soup, self.cw)
dir = os.path.join(get_outdir(self.type), title)
names = {}
try:
for name in os.listdir(dir):
id = os.path.splitext(name)[0]
names[id] = name
except:
pass
for img in imgs:
if img.id in names:
url = os.path.join(dir, names[img.id])
else:
url = img.url
self.urls.append(url)
self.title = title#
def update(cw, title, imgs):
s = u'{} {} ({})'.format(tr_(u'읽는 중...'), title, len(imgs))
if cw is not None:
cw.setTitle(s)
else:
print(s)
def fix_url(url):
url = re.sub(r'[^./]+\.luscious', 'legacy.luscious', url)
return url
def get_imgs(url, soup=None, cw=None):
url = fix_url(url)
if soup is None:
html = downloader.read_html(url)
soup = Soup(html)
title = get_title(soup)
imgs = []
for p in range(1, 81):
imgs_new = get_imgs_p(url, p)
if not imgs_new:
break
imgs += imgs_new
update(cw, title, imgs)
return imgs
@try_n(4, sleep=30)
def get_imgs_p(url, p=1):
id = re.find('/albums/[^/]+?([0-9]+)/', url+'/')
print(url, id)
url_api = 'https://api.luscious.net/graphql/nobatch/?operationName=AlbumListOwnPictures&query=+query+AlbumListOwnPictures%28%24input%3A+PictureListInput%21%29+%7B+picture+%7B+list%28input%3A+%24input%29+%7B+info+%7B+...FacetCollectionInfo+%7D+items+%7B+...PictureStandardWithoutAlbum+%7D+%7D+%7D+%7D+fragment+FacetCollectionInfo+on+FacetCollectionInfo+%7B+page+has_next_page+has_previous_page+total_items+total_pages+items_per_page+url_complete+%7D+fragment+PictureStandardWithoutAlbum+on+Picture+%7B+__typename+id+title+created+like_status+number_of_comments+number_of_favorites+status+width+height+resolution+aspect_ratio+url_to_original+url_to_video+is_animated+position+tags+%7B+category+text+url+%7D+permissions+url+thumbnails+%7B+width+height+size+url+%7D+%7D+&variables=%7B%22input%22%3A%7B%22filters%22%3A%5B%7B%22name%22%3A%22album_id%22%2C%22value%22%3A%22{}%22%7D%5D%2C%22display%22%3A%22position%22%2C%22page%22%3A{}%7D%7D'.format(id, p)
data_raw = downloader.read_html(url_api, referer=url)
data = json.loads(data_raw)
has_next_page = data['data']['picture']['list']['info']['has_next_page']
imgs = []
for item in data['data']['picture']['list']['items']:
img = Image(item, url)
imgs.append(img)
return imgs
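# The URL above is the public Luscious GraphQL endpoint with the
# AlbumListOwnPictures operation fully URL-encoded into the query string; only the
# album_id filter and the page number are substituted per request. Each returned
# item carries 'url_to_original', which Image() later resolves to the full-size file.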
def get_video(url, soup):
url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
title = re.find('videos/([^/]+)', url)
video = soup.find('video')
url = video.source.attrs['src']
video = Video(url, title, url_thumb)
return video
def get_title(soup):
return soup.find('h2').text.strip()

View File

@ -0,0 +1,33 @@
from utils import Downloader, LazyUrl, clean_title
from m3u8_tools import playlist2stream, M3u8_stream
import os
@Downloader.register
class Downloader_m3u8(Downloader):
type = 'm3u8'
URLS = ['.m3u8']
single = True
display_name = 'M3U8'
def init(self):
if '://' not in self.url:
self.url = 'http://' + self.url
def read(self):
video = Video(self.url)
self.urls.append(video.url)
self.title = video.title
class Video(object):
def __init__(self, url):
try:
m = playlist2stream(url)
except:
m = M3u8_stream(url)
self.url = LazyUrl(url, lambda _: m, self)
self.title = os.path.splitext(os.path.basename(url))[0]
self.filename = clean_title(self.title, n=-4) + '.mp4'
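# playlist2stream() is tried first on the assumption that the URL is a master
# playlist (picking a variant and returning a downloadable stream); if that fails,
# M3u8_stream() treats the URL as a plain media playlist. Both come from
# m3u8_tools, whose internals are not shown here, so this reading is the apparent intent.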

View File

@ -0,0 +1,211 @@
#coding:utf8
import downloader
from utils import Soup, urljoin, LazyUrl, Downloader, query_url, try_n, Session, get_print, clean_title
import os
from translator import tr_
from timee import sleep
import requests
import ree as re
import clf2#
class Image(object):
def __init__(self, url, p, page):
ext = os.path.splitext(url)[1]
if ext.lower()[1:] not in ['jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp']:
ext = '.jpg'
self.filename = u'{:04}{}'.format(p, ext)
if page.title is not None:
self.filename = u'{}/{}'.format(page.title, self.filename)
def f(_):
return url
self.url = LazyUrl(page.url, f, self)
class Page(object):
def __init__(self, title, url, soup=None):
self.title = clean_title(title)
self.url = url
self.soup = soup
@Downloader.register
class Downloader_mrm(Downloader):
type = 'mrm'
URLS = ['myreadingmanga.info']
_soup = None
MAX_CORE = 16
display_name = 'MyReadingManga'
def init(self):
self.session = get_session(self.url, self.cw)
@classmethod
def fix_url(cls, url):
return re.find('https?://myreadingmanga.info/[^/]+', url, err='err')
@property
def soup(self):
if self._soup is None:
for try_ in range(8):
try:
html = read_html(self.url, session=self.session, cw=self.cw)
break
except Exception as e:
e_ = e
self.print_(e)
else:
raise e_
self._soup = Soup(html)
return self._soup
@property
def name(self):
title = get_title(self.soup)
return title
def read(self):
self.title = u'읽는 중... {}'.format(self.name)
imgs = get_imgs(self.url, self.soup, self.session, self.cw)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
def get_title(soup):
title = soup.find('h1', class_='entry-title').text.strip()
title = fix_title(title)
title = clean_title(title)
return title
def get_imgs(url, soup=None, session=None, cw=None):
if soup is None:
html = read_html(url, session=session, cw=cw)
soup = Soup(html)
title = get_title(soup)
pagination = soup.find('div', class_='pagination')
if pagination is None:
page = Page(None, url, soup)
imgs = get_imgs_page(page, session=session)
else:
pages = get_pages(url, soup, session=session)
imgs = []
for i, page in enumerate(pages):
s = u'{} {} / {} ({} / {})'.format(tr_(u'읽는 중...'), title, page.title, i+1, len(pages))
if cw:
if not cw.alive:
return
cw.setTitle(s)
else:
print(s)
imgs += get_imgs_page(page, session=session)
if not imgs:
raise Exception('no imgs')
return imgs
def get_pages(url, soup=None, session=None):
if soup is None:
html = read_html(url, session=session, cw=None)
soup = Soup(html)
pagination = soup.find('div', class_='pagination')
pages = []
hrefs = set()
for a in pagination.findAll('a'):
href = a.attrs.get('href', '')
href = urljoin(url, href)
if not href.startswith(url):
print('not match', href)
continue
while href.endswith('/'):
href = href[:-1]
if href in hrefs:
print('duplicate', href)
continue
hrefs.add(href)
text = a.text.strip()
page = Page(text, href)
pages.append(page)
if url not in hrefs:
page = Page('1', url, soup)
pages.insert(0, page)
return pages
@try_n(4)
def get_imgs_page(page, session=None):
url = page.url
soup = page.soup
if soup is None:
html = read_html(url, session=session, cw=None)
soup = Soup(html)
page.soup = soup
view = soup.find('div', class_='entry-content')
imgs = []
for img in view.findAll('img'):
img = img.attrs.get('data-lazy-src') or img.attrs.get('data-src')
if img is None:
continue
img = urljoin(url, img)
img = Image(img, len(imgs), page)
imgs.append(img)
print(page.title, len(imgs), page.url)
return imgs
def fix_title(title):
title = re.sub(r'\(?[^()]*?c\.[^() ]+\)?', '', title)
while ' ' in title:
title = title.replace(' ', ' ')
return title
def read_html(url, session, cw):
## html = downloader.read_html(url, session=session)
## soup = Soup(html)
##
## cf = soup.find('div', class_='cf-browser-verification')
## if cf is None:
## return html
r = clf2.solve(url, cw=cw, session=session)
return r['html']
@try_n(4)
def get_session(url, cw=None):
print_ = get_print(cw)
## html = downloader.read_html(url)
## soup = Soup(html)
##
## cf = soup.find('div', class_='cf-browser-verification')
## if cf is None:
## print_('no cf protection')
## return None
print_('cf protection')
r = clf2.solve(url, cw=cw)
session = r['session']
return session
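# clf2.solve() appears to drive an embedded browser to pass the Cloudflare
# challenge and hands back the resulting cookies as a requests-compatible session;
# the commented-out blocks above are the older path that only probed for the
# 'cf-browser-verification' marker before deciding whether to solve.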

View File

@ -0,0 +1,170 @@
#coding:utf-8
import downloader
import re
from utils import urljoin, Downloader, Soup, LazyUrl, clean_title
import json
from timee import sleep
import collections
PATTERNS = ['.*blog.naver.com/(?P<username>.+)/(?P<pid>[0-9]+)',
'.*blog.naver.com/.+?blogId=(?P<username>[^&]+).+?logNo=(?P<pid>[0-9]+)',
'.*?(?P<username>[0-9a-zA-Z_-]+)\.blog\.me/(?P<pid>[0-9]+)']
HDR = {
'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'ko, en-US; q=0.7, en; q=0.3',
'Connection': 'Keep-Alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
}
def get_id(url):
for pattern in PATTERNS:
m = re.match(pattern, url)
if m is None:
continue
username = m.group('username')
pid = m.group('pid')
break
else:
username, pid = None, None
return username, pid
@Downloader.register
class Downloader_naver(Downloader):
type = 'naver'
URLS = ['blog.naver.', '.blog.me']
display_name = 'Naver Blog'
def init(self):
username, pid = get_id(self.url)
if username is None:
return self.Invalid('Invalid format')
self.url = 'https://blog.naver.com/{}/{}'.format(username, pid)
self.headers = {'User-Agent': downloader.hdr['User-Agent']}
@property
def name(self):
username, pid = get_id(self.url)
return clean_title(u'{}/{}'.format(username, pid))
def read(self):
self.title = u'읽는 중... {}'.format(self.name)
imgs = get_imgs(self.url)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
class Image(object):
def __init__(self, url):
self.url = url
class Video(object):
def __init__(self, url, referer, p):
self.url = LazyUrl(referer, lambda _: url, self)
self.filename = 'video_{}.mp4'.format(p)
def read_page(url, depth=0):
print('read_page', url, depth)
if depth > 10:
raise Exception('Too deep')
html = downloader.read_html(url, header=HDR)
if len(html) < 5000:
id = re.findall('logNo=([0-9]+)', html)[0]
usernames = re.findall('blog.naver.com/([0-9a-zA-Z]+)', url)
if not usernames:
usernames = re.findall('blogId=([0-9a-zA-Z]+)', url)
username = usernames[0]
url = 'https://m.blog.naver.com/PostView.nhn?blogId={}&logNo={}&proxyReferer='.format(username, id)
print('###', username, id, url)
soup = Soup(html)
if soup.find('div', {'id': 'viewTypeSelector'}):
return url, soup
frame = soup.find('frame')
if frame is None:
print('frame is None')
return read_page(url, depth+1)
return read_page(urljoin('https://blog.naver.com', frame.attrs['src']), depth+1)
def get_imgs(url):
url = url.replace('blog.naver', 'm.blog.naver')
url_frame, soup = read_page(url)
imgs = []
urls = set()
view = soup.find('div', {'id': 'viewTypeSelector'})
print('view', view is not None)
imgs_ = view.findAll('span', class_='_img') + view.findAll('img')
for img in imgs_:
url = img.attrs.get('src', None)
if url is None:
url = img.attrs.get('thumburl', None)
if url is None:
print(u'invalid img: {}'.format(url))
continue
if 'ssl.pstatic.net' in url: #
continue
if 'blogpfthumb-phinf.pstatic.net' in url: # profile
continue
if 'dthumb-phinf.pstatic.net' in url: # link
continue
if 'storep-phinf.pstatic.net' in url: # emoticon
continue
url = url.replace('mblogthumb-phinf', 'blogfiles')
#url = re.sub('\?type=[a-zA-Z0-9]*', '?type=w1@2x', url)
#url = re.sub('\?type=[a-zA-Z0-9]*', '', url)
url = url.split('?')[0]
if url in urls:
print('### Duplicate:', url)
continue
urls.add(url)
#url = url.split('?type=')[0]
img = Image(url)
imgs.append(img)
pairs = []
for video in soup.findAll('span', class_='_naverVideo'):
vid = video.attrs['vid']
key = video.attrs['key']
pairs.append((vid, key))
for script in soup.findAll('script', class_='__se_module_data'):
data_raw = script['data-module']
data = json.loads(data_raw)['data']
vid = data.get('vid')
if not vid:
continue
key = data['inkey']
pairs.append((vid, key))
videos = []
for vid, key in pairs:
url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key)
data_raw = downloader.read_html(url_api)
data = json.loads(data_raw)
fs = data['videos']['list']
fs = sorted(fs, key=lambda f: f['size'], reverse=True)
video = Video(fs[0]['source'], url_frame, len(videos))
videos.append(video)
return imgs + videos
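# Videos embedded in a post are resolved separately from images: each player
# exposes a (vid, key) pair, either as attributes on span._naverVideo or inside
# the __se_module_data JSON, and the rmcnmv play API is then queried for the
# file list, from which the largest file is taken.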

View File

@ -0,0 +1,244 @@
# uncompyle6 version 3.5.0
# Python bytecode 2.7 (62211)
# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
# Embedded file name: navertoon_downloader.pyo
# Compiled at: 2019-10-03 10:19:35
import downloader
from utils import Soup, urljoin, Downloader, LazyUrl, get_imgs_already, clean_title, get_ext, get_print
from constants import try_n
import ree as re, os
from timee import sleep
import page_selector
from translator import tr_
import json
class Page(object):
def __init__(self, url, title, p):
self.url = url
self.title = title
self.p = p
class Image(object):
def __init__(self, url, page, p):
ext = get_ext(url)
self.filename = (u'{}/{:04}{}').format(clean_title(page.title), p, ext)
self.url = LazyUrl(page.url, lambda _: url, self)
class Info(object):
def __init__(self, id, title, artist):
self.id = id
self.title = title
self.artist = artist
@Downloader.register
class Downloader_navertoon(Downloader):
type = 'navertoon'
URLS = ['comic.naver.com']
MAX_CORE = 8
MAX_SPEED = 4.0
display_name = 'Naver Webtoon'
def init(self):
self.url = get_main(self.url)
self.__info, _ = get_pages(self.url, self.cw)
@property
def name(self):
id = self.__info.id
title = self.__info.title
artist = self.__info.artist
title = self.format_title('N/A', id, title, artist, 'N/A', 'N/A', 'Korean', prefix='navertoon_')
return clean_title(title)
def read(self):
self.title = tr_(u'읽는 중... {}').format(self.name)

imgs = get_imgs_all(self.url, self.name, cw=self.cw)
for img in imgs:
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = self.name
def get_main(url):
url_main = re.sub('[?&]page=[0-9]+', '', re.sub('[?&]no=[0-9]+', '', url)).replace('detail.nhn', 'list.nhn').replace('m.comic.naver.', 'comic.naver.')
while url_main.endswith('#'):
url_main = url_main[:-1]
return url_main
def set_no(url, p):
if '&no=' not in url:
url = url + ('&no={}').format(p)
return url
url = re.sub('&no=[0-9]+', ('&no={}').format(p), url)
return url
def get_id(url):
return int(url.lower().split('titleid=')[1].split('&')[0])
def set_page(url, p):
if '&page=' in url:
url = re.sub('&page=[0-9]+', ('&page={}').format(p), url)
else:
url += ('&page={}').format(p)
return url
@try_n(4)
def get_pages(url, cw=None):
print_ = get_print(cw)
url = get_main(url).replace('comic.naver.', 'm.comic.naver.')
id = get_id(url)
print('id:', id)
print(url)
html = downloader.read_html(url)
soup = Soup(html)
try:
info = soup.find('div', class_='area_info')
artist = info.find('span', class_='author').text.strip()
except Exception as e:
print(e)
try:
title = ('\n').join(soup.find('div', class_='title').text.strip().split('\n')[:-1]).strip()
except:
title = 'artist not found'
raise Exception(title)
print('artist:', artist)
title = soup.find('meta', {'property': 'og:title'}).attrs['content']
pages = []
nos = set()
for p in range(1, 100):
if p == 1:
url_page = url
else:
url_page = set_page(url, p)
html = downloader.read_html(url_page)
print('read page:', url_page)
soup = Soup(html)
view = soup.findAll('ul', class_='section_episode_list')[(-1)]
for lst in view.findAll('li'):
url_page = urljoin(url, lst.find('a').attrs['href'])
if 'detail.nhn' not in url_page.lower():
continue
print_('url_page: {}'.format(url_page))
text = lst.find('strong', class_='title').find('span', class_='name').text.strip()
no = int(re.findall('[?&]no=([0-9]+)', url_page)[0])
if no in nos:
print('duplicate no: {}'.format(no))
continue
nos.add(no)
text = '{:04} - {}'.format(no, text)
page = Page(url_page, text, p)
pages.append(page)
btn_next = soup.find('a', class_='btn_next')
if btn_next is None or btn_next.attrs['href'] == '#':
print('end of page')
break
info = Info(id, title, artist)
return (
info, pages)
@page_selector.register('navertoon')
@try_n(4)
def f(url):
url = get_main(url)
info, pages = get_pages(url)
return pages
@try_n(6)
def get_imgs(page, cw=None):
print_ = get_print(cw)
html = downloader.read_html(page.url)
soup = Soup(html)
type_ = re.find('''webtoonType *: *['"](.+?)['"]''', html)
print_('type: {}'.format(type_))
imgs = []
if type_ == 'DEFAULT': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=715772
view = soup.find('div', class_='toon_view_lst')
for img in view.findAll('img'):
img = img.attrs.get('data-src')
if not img:
continue
img = urljoin(page.url, img)
img = Image(img, page, len(imgs))
imgs.append(img)
elif type_ == 'CUTTOON': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=752803
view = soup.find('div', class_='swiper-wrapper')
for div in view.findAll('div', class_='swiper-slide'):
if div.parent != view:
continue
if div.find('div', class_='cut_viewer_last'):
print('cut_viewer_last')
continue
if div.find('div', class_='cut_viewer_recomm'):
print('cut_viewer_recomm')
continue
img = div.find('img')
img = img.attrs['data-src']
img = urljoin(page.url, img)
img = Image(img, page, len(imgs))
imgs.append(img)
elif type_ == 'EFFECTTOON': #2313; https://m.comic.naver.com/webtoon/detail.nhn?titleId=670144
img_base = re.find('''imageUrl *: *['"](.+?)['"]''', html) + '/'
print('img_base:', img_base)
url_api = re.find('''documentUrl *: *['"](.+?)['"]''', html)
data_raw = downloader.read_html(url_api, page.url)
data = json.loads(data_raw)
for img in data['assets']['stillcut'].values(): # ordered in python3.7+
img = urljoin(img_base, img)
img = Image(img, page, len(imgs))
imgs.append(img)
else:
_imgs = re.findall('sImageUrl *: *[\'"](.+?)[\'"]', html)
if not _imgs:
raise Exception('no imgs')
for img in _imgs:
img = urljoin(page.url, img)
img = Image(img, page, len(imgs))
imgs.append(img)
return imgs
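# get_imgs() branches on the page's webtoonType:
# - DEFAULT: plain <img data-src> list inside div.toon_view_lst
# - CUTTOON: one image per swiper slide, skipping the trailing last/recommendation cuts
# - EFFECTTOON: a JSON document (documentUrl) listing stillcuts relative to imageUrl
# - otherwise: fall back to the sImageUrl entries scraped from the inline script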
def get_imgs_all(url, title, cw=None):
print_ = get_print(cw)
info, pages = get_pages(url, cw)
pages = page_selector.filter(pages, cw)
imgs = []
for p, page in enumerate(pages):
imgs_already = get_imgs_already('navertoon', title, page, cw)
if imgs_already:
imgs += imgs_already
continue
imgs_new = get_imgs(page, cw)
print_('{}: {}'.format(page.title, len(imgs_new)))
imgs += imgs_new
if cw is not None:
cw.setTitle(tr_(u'읽는 중... {} / {} ({}/{})').format(title, page.title, p + 1, len(pages)))
if not cw.alive:
break
return imgs

View File

@ -0,0 +1,63 @@
import downloader
import ree as re
from io import BytesIO as IO
import os
from constants import try_n
from error_printer import print_error
from utils import Downloader, compatstr, LazyUrl, get_ext, format_filename, clean_title
import ytdl
@Downloader.register
class Downloader_navertv(Downloader):
type = 'navertv'
single = True
URLS = ['tv.naver.com']
display_name = 'Naver TV'
def init(self):
if not re.match('https?://.+', self.url, re.IGNORECASE):
self.url = 'https://tv.naver.com/v/{}'.format(self.url)
def read(self):
video = Video(self.url)
video.url()#
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
class Video(object):
_url = None
def __init__(self, url):
self.url = LazyUrl(url, self.get, self)
@try_n(4)
def get(self, url):
if self._url:
return self._url
ydl = ytdl.YoutubeDL()
info = ydl.extract_info(url)
fs = [f for f in info['formats'] if f['protocol'] in ['http', 'https']]
fs = sorted(fs, key=lambda f: int(f.get('width', 0)), reverse=True)
if not fs:
raise Exception('No MP4 videos')
f = fs[0]
self._url = f['url']
self.thumb_url = info['thumbnails'][0]['url']
self.thumb = IO()
downloader.download(self.thumb_url, buffer=self.thumb)
self.title = info['title']
id = info['id']
ext = get_ext(self._url)
self.filename = format_filename(self.title, id, ext)
return self._url
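# ytdl (presumably a bundled youtube-dl fork) supplies the format list; only
# direct http/https formats are kept and the widest one is chosen, so DASH/HLS
# entries are ignored here.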

View File

@ -0,0 +1,97 @@
#coding:utf8
import downloader
import nndownload
from io import BytesIO
import ree as re
from utils import Downloader, get_print, compatstr, format_filename, clean_title, try_n
from nico_login import login, logout
def get_id(url):
if '/watch/' in url:
id = re.findall('/watch/([a-zA-Z0-9]+)', url)[0]
else:
id = url
return id
class Video(object):
def __init__(self, session, info):
self.session = session
self.info = info
self.url = info['url']
self.title = info['title']
self.ext = info['ext']
self.id = info['id']
self.fileName = format_filename(self.title, self.id, self.ext)
self.url_thumb = info['thumbnail_url']
print('thumb:', self.url_thumb)
self.thumb = BytesIO()
downloader.download(self.url_thumb, buffer=self.thumb)
def __repr__(self):
return u'Video({})'.format(self.id)
@Downloader.register
class Downloader_nico(Downloader):
type = 'nico'
single = True
URLS = ['nicovideo.jp']
display_name = 'Niconico'
def init(self):
if not re.match('https?://.+', self.url, re.IGNORECASE):
self.url = 'https://www.nicovideo.jp/watch/{}'.format(self.url)
@property
def id_(self):
return get_id(self.url)
def read(self):
ui_setting = self.ui_setting
if ui_setting.nicoBox.isChecked():
username = compatstr(ui_setting.nico_id.text())
password = compatstr(ui_setting.nico_pw.text())
else:
username = ''
password = ''
try:
session = login(username, password)
except Exception as e:
logout()
return self.Invalid(u'Failed to login: {}'.format(self.url), fail=True)
self.session = session
try:
video = get_video(session, self.id_, cw=self.cw)
except Exception as e:
logout()
raise
self.urls.append(video.url)
self.filenames[video.url] = video.fileName
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
@try_n(2)
def get_video(session, id, cw=None):
print_ = get_print(cw)
try:
info = nndownload.request_video(session, id)
except:
raise Exception('Err')
video = Video(session, info)
return video
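# nndownload.request_video() is assumed to return a dict with at least 'url',
# 'title', 'ext', 'id' and 'thumbnail_url'; Video() consumes exactly those keys,
# and login()/logout() manage the account session around it.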

View File

@ -0,0 +1,164 @@
#coding: utf-8
import downloader
from utils import Downloader, urljoin, get_max_range, query_url, Soup, Session, LazyUrl, get_print, clean_title, try_n, get_ext
from translator import tr_
from constants import clean_url
import ree as re
from errors import LoginRequired
def get_id(url):
return re.find('id=([0-9]+)', url)
def get_name(soup):
return soup.find('p', class_='user_icon').find('a', class_='name').text.strip()
def isLogin(soup):
if soup.find('ul', id="sub-menu"):
return True
return False
@Downloader.register
class Downloader_nijie(Downloader):
type = 'nijie'
URLS = ['nijie.info']
MAX_CORE = 4
display_name = 'ニジエ'
def init(self):
if 'members.php' not in self.url and 'members_illust.php' not in self.url:
raise NotImplementedError()
id = get_id(self.url)
html = downloader.read_html('https://nijie.info/members.php?id={}'.format(id))
self.soup = Soup(html)
if not isLogin(self.soup):
raise LoginRequired()
@classmethod
def fix_url(cls, url):
if 'nijie.info' not in url.lower():
url = 'https://nijie.info/members.php?id={}'.format(url)
return url.replace('http://', 'https://')
@property
def name(self):
name = u'{} (nijie_{})'.format(get_name(self.soup), get_id(self.url))
return clean_title(name)
def read(self):
self.title = self.name
imgs = get_imgs(self.url, self.name, cw=self.cw)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
class Image(object):
def __init__(self, id, url, p, lazy=True, img=None):
self.id = id
self.p = p
if lazy:
self.url = LazyUrl(url, self.get_single, self)
else:
self.url = LazyUrl(url, lambda _:img, self)
ext = get_ext(img)
self.filename = '{}_p{}{}'.format(id, p, ext)
def get_single(self, url): # single
img = get_imgs_post(self.id, url)[0].url()
ext = get_ext(img)
self.filename = '{}_p{}{}'.format(self.id, self.p, ext)
return img
@try_n(8, sleep=10)
def get_imgs_post(id, url):
#print('get_imgs_post', id, url)
html = downloader.read_html(url)
soup = Soup(html)
view = soup.find('div', id='gallery')
imgs = []
for img in view.findAll(class_='mozamoza'):
url_img = urljoin(url, img['src'])
url_img = re.sub('__rs_l[0-9]+x[0-9]+/', '', url_img)
img = Image(id, url, len(imgs), False, url_img)
imgs.append(img)
return imgs
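# The 'mozamoza' <img> nodes carry resized URLs such as
#   .../__rs_l120x120/nijie_picture/12345_0.png   (hypothetical example)
# and stripping the '__rs_lWxH/' segment yields the original-resolution file.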
def setPage(url, page):
# Always use HTTPS
url = url.replace('http://', 'https://')
# Change the page
if 'p=' in url:
url = re.sub('p=[0-9]*', 'p={}'.format(page), url)
else:
url += '&p={}'.format(page)
return url
def get_imgs(url, title=None, cw=None):
print_ = get_print(cw)
url = clean_url(url)
id = get_id(url)
url = u'https://nijie.info/members_illust.php?id={}'.format(id)
# Range
max_pid = get_max_range(cw)
imgs = []
url_imgs = set()
for p in range(1, 1+100):
url = setPage(url, p)
print_(url)
html = downloader.read_html(url)
soup = Soup(html)
posts = soup.findAll('div', class_='nijie')
if not posts:
print('no posts')
break
c = 0
for post in posts:
url_img = urljoin(url, post.a.attrs['href'])
if url_img in url_imgs:
print('duplicate:', url_img)
continue
url_imgs.add(url_img)
id = int(re.find('[?&]id=([0-9]+)', url_img))
multi = post.find('div', class_='thumbnail-icon')
if multi:
imgs_ = get_imgs_post(id, url_img)#
else:
imgs_ = [Image(id, url_img, 0)]
imgs += imgs_
c += 1
if len(imgs) >= max_pid:
break
msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
if cw:
if not cw.alive:
return
cw.setTitle(msg)
else:
print(msg)
if len(imgs) >= max_pid or c == 0:
break
return imgs

View File

@ -0,0 +1,109 @@
import downloader
from utils import Session, Soup, LazyUrl, get_print, Downloader, get_ext, try_n, format_filename, clean_title
import ree as re
import json
from io import BytesIO
class EmbedUrlError(Exception): pass
@Downloader.register
class Downloader_pandoratv(Downloader):
type = 'pandoratv'
URLS = ['pandora.tv']
single = True
display_name = 'Pandora TV'
@classmethod
def fix_url(cls, url):
return url.split('#')[0]
def read(self):
video = Video(self.url, format, cw=self.cw)
try:
video.url()#
except EmbedUrlError as e:
return self.Invalid(e.args[0])
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
def extract(name, html, cw=None):
print_ = get_print(cw)
value = re.find(r'''{} *= *['"](.*?)['"]'''.format(name), html)
if value is None:
value = json.loads(re.find(r'''{} *= *(\[.*?\])'''.format(name), html))
print_('{}: {}'.format(name, value))
if value is None:
raise Exception('No {}'.format(name))
return value
class Video(object):
_url_video = None
def __init__(self, url, format='title', cw=None):
self.url = LazyUrl(url, self.get, self)
self.format = format
self.cw = cw
@try_n(2)
def get(self, url):
if self._url_video:
return self._url_video
cw = self.cw
print_ = get_print(cw)
html = downloader.read_html(url)
soup = Soup(html)
embedUrl = extract('embedUrl', html, cw)
if embedUrl:
raise EmbedUrlError('[pandoratv] EmbedUrl: {}'.format(embedUrl))
uid = extract('strLocalChUserId', html, cw)
pid = extract('nLocalPrgId', html, cw)
fid = extract('strFid', html, cw)
resolType = extract('strResolType', html, cw)
resolArr = extract('strResolArr', html, cw)
vodSvr = extract('nVodSvr', html, cw)
resols = extract('nInfo', html, cw)
runtime = extract('runtime', html, cw)
url_api = 'http://www.pandora.tv/external/getExternalApi/getVodUrl/'
data = {
'userId': uid,
'prgId': pid,
'fid': fid,
'resolType': resolType,
'resolArr': ','.join(map(str, resolArr)),
'vodSvr': vodSvr,
'resol': max(resols),
'runtime': runtime,
'tvbox': 'false',
'defResol': 'true',
'embed': 'false',
}
session = Session()
r = session.post(url_api, headers={'Referer': url}, data=data)
data = json.loads(r.text)
self._url_video = data['src']
self.title = soup.find('meta', {'property': 'og:description'})['content']
ext = get_ext(self._url_video)
self.filename = format_filename(self.title, pid, ext)
self.url_thumb = soup.find('meta', {'property': 'og:image'})['content']
self.thumb = BytesIO()
downloader.download(self.url_thumb, buffer=self.thumb)
return self._url_video
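# The player page inlines all parameters (strLocalChUserId, nLocalPrgId, strFid,
# the resolution arrays, ...) as JS variables; extract() pulls them out with a
# regex, and a POST to getVodUrl with the highest value in nInfo as 'resol'
# returns the direct video URL in data['src'].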

View File

@ -0,0 +1,216 @@
# uncompyle6 version 3.5.0
# Python bytecode 2.7 (62211)
# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
# Embedded file name: pinter_downloader.pyo
# Compiled at: 2019-10-21 07:44:55
import downloader
from utils import Session, Downloader, LazyUrl, clean_url, try_n, Soup, clean_title
import json, os, ree as re
from timee import sleep
from translator import tr_
import urllib
import constants
from ratelimit import limits, sleep_and_retry
BASE_URL = 'https://www.pinterest.com'
def get_info(username, board, api):
if '/' in board:
section = (u'/').join(board.split('/')[1:])
board = board.split('/')[0]
info = api.board(username, board)
for s in api.board_sections(info['id']):
print(s['slug'].lower(), section)
if s['slug'].lower() == section.lower():
break
else:
raise Exception('Invalid section')
title = s['title']
info.update(s)
info['name'] = (u'{}/{}').format(info['name'], title)
print('section_id:', info['id'])
else:
info = api.board(username, board)
#info = board_info(username, board)
return info
def board_info(username, board):
url = u'https://www.pinterest.com/{}/{}/'.format(username, board)
html = downloader.read_html(url)
soup = Soup(html)
data = soup.find('script', id='initial-state').text
data = json.loads(data)['resourceResponses']
info = data[0]['response']['data']
return info
@Downloader.register
class Downloader_pinter(Downloader):
type = 'pinter'
URLS = ['pinterest.']
type_pinter = 'board'
display_name = 'Pinterest'
@try_n(4)
def init(self):
if 'pinterest.' not in self.url:
self.url = u'https://www.pinterest.com/{}'.format(self.url)
self.api = PinterestAPI()
username, board = get_username_board(self.url)
if '/' in board:
self.type_pinter = 'section'
self.print_(('type: {}').format(self.type_pinter))
self.info = get_info(username, board, self.api)
@property
def name(self):
username = self.info['owner']['username']
name = self.info['name']
return clean_title((u'{}/{}').format(username, name))
def read(self):
self.title = self.name
id = self.info['id']
imgs = get_imgs(id, self.api, cw=self.cw, title=self.name, type=self.type_pinter)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
class PinterestAPI:
HEADERS = {'Accept': 'application/json, text/javascript, */*, q=0.01',
'Accept-Language': 'en-US,en;q=0.5',
'X-Pinterest-AppState': 'active',
'X-APP-VERSION': 'cb1c7f9',
'X-Requested-With': 'XMLHttpRequest',
'Origin': BASE_URL + '/'}
def __init__(self):
self.session = Session()
self.session.headers.update(self.HEADERS)
def pin(self, pin_id):
options = {'id': pin_id, 'field_set_key': 'detailed'}
return self._call('Pin', options)['resource_response']['data']
def pin_related(self, pin_id):
options = {'pin': pin_id, 'add_vase': True, 'pins_only': True}
return self._pagination('RelatedPinFeed', options)
def board(self, user, board):
options = {'slug': board, 'username': user, 'field_set_key': 'detailed'}
return self._call('Board', options)['resource_response']['data']
def board_pins(self, board_id):
options = {'board_id': board_id}
return self._pagination('BoardFeed', options)
def board_related(self, board_id):
options = {'board_id': board_id, 'add_vase': True}
return self._pagination('BoardRelatedPixieFeed', options)
def board_sections(self, board_id):
options = {'board_id': board_id}
return self._pagination('BoardSections', options)
def board_section_pins(self, section_id):
options = {'section_id': section_id}
return self._pagination('BoardSectionPins', options)
@try_n(4)
@sleep_and_retry
@limits(1, 4) # 1 call per 4 seconds (Pinterest allows roughly 1000 calls per hour)
def _call(self, resource, options):
url = ('{}/resource/{}Resource/get/').format(BASE_URL, resource)
params = {'data': json.dumps({'options': options}), 'source_url': ''}
print('_call: {}, {}'.format(url, params))
r = self.session.get(url, params=params)
print(r)
s = r.text
status_code = r.status_code
try:
data = json.loads(s)
except ValueError:
data = {}
else:
if status_code < 400 and not r.history:
return data
if status_code == 404 or r.history:
raise Exception('Not Found')
raise Exception('API request failed: {}'.format(status_code))
def _pagination(self, resource, options):
while True:
data = self._call(resource, options)
for x in data['resource_response']['data']:
yield x
try:
bookmarks = data['resource']['options']['bookmarks']
if not bookmarks or bookmarks[0] == '-end-' or bookmarks[0].startswith('Y2JOb25lO'):
return
options['bookmarks'] = bookmarks
except KeyError:
return
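# Pinterest resources are paged with opaque 'bookmarks' tokens: each response
# carries the token for the next page, and '-end-' (or a token starting with
# 'Y2JOb25lO', which base64-decodes to a value beginning with 'cbNone') marks
# the last page, ending the generator.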
class Image(object):
def __init__(self, img):
self.id = img['id']
print(self.id)
self.url0 = img['images']['orig']['url']
def f(_):
return self.url0
self.url = LazyUrl(('{}/pin/{}/').format(BASE_URL, self.id), f, self)
ext = os.path.splitext(self.url0.split('?')[0].split('#')[0])[1]
self.filename = ('{}{}').format(self.id, ext)
def get_imgs(id, api, cw=None, title=None, type='board'):
imgs = []
ids = set()
print('get_imgs: type={}'.format(type))
if type == 'board':
gen = api.board_pins(id)
elif type == 'section':
gen = api.board_section_pins(id)
else:
raise Exception((u'Type "{}" is not supported').format(type))
for img in gen:
if 'images' not in img:
print('skip img:', img['id'])
continue
img = Image(img)
if img.id in ids:
print('duplicate:', img.id)
continue
ids.add(img.id)
print(img.url)
print(img.filename)
print()
imgs.append(img)
if cw is not None:
if not cw.alive:
return []
cw.setTitle((u'{} {} ({})').format(tr_(u'읽는 중...'), title, len(imgs)))
return imgs
def get_username_board(url):
url = clean_url(url)
m = re.search('pinterest.[a-zA-Z.]+?/([^/]+)/([^#\\?]+)', url)
username, board = m.groups()
board = urllib.parse.unquote(board).strip()
while board.endswith('/'):
board = board[:-1].strip()
return (username, board)

View File

@ -14,6 +14,10 @@ except ImportError:
import constants
from datetime import datetime
import requests
+ from timee import sleep
+ from collections import deque
+ from locker import lock
+ import threading
FORCE_LOGIN = True
LIMIT = 48
for header in ['pixiv_illust', 'pixiv_bmk', 'pixiv_search', 'pixiv_following', 'pixiv_following_r18']:
@ -27,6 +31,7 @@ class Downloader_pixiv(Downloader):
type = 'pixiv'
MAX_CORE = 16
keep_date = True
+ STEP = 8, 32

@classmethod
def fix_url(cls, url):
@ -107,10 +112,10 @@
def profile(self, id_):
return self.call('user/{}/profile/all?lang=en'.format(id_))

- def bookmarks(self, id_, offset=0, limit=None):
+ def bookmarks(self, id_, offset=0, limit=None, rest='show'):
if limit is None:
limit = LIMIT
- return self.call('user/{}/illusts/bookmarks?tag=&offset={}&limit={}&rest=show&lang=en'.format(id_, offset, limit))
+ return self.call('user/{}/illusts/bookmarks?tag=&offset={}&limit={}&rest={}&lang=en'.format(id_, offset, limit, rest))

def search(self, q, order='date_d', mode='all', p=1, s_mode='s_tag', type_='all'):
return self.call('search/artworks/{0}?word={0}&order={1}&mode={2}&p={3}&s_mode={4}&type={5}&lang=en'.format(quote(q), order, mode, p, s_mode, type_))
@ -254,13 +259,17 @@ def get_info(url, cw=None, depth=0):
id_ = api.user_id(url)
if id_ is None: #
id_ = my_id()
+ if id_ == my_id():
+ rest = 'all'
+ else:
+ rest = 'show'
process_user(id_, info, api)
info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'], info['artist_id'])
ids = []
ids_set = set()
offset = 0
while len(ids) < max_pid:
- data = api.bookmarks(id_, offset)
+ data = api.bookmarks(id_, offset, rest=rest)
c = 0
for id in [work['id'] for work in data['works']]:
if id in ids_set:
@ -359,15 +368,54 @@ def process_user(id_, info, api):
def process_ids(ids, info, imgs, cw, depth=0):
print_ = get_print(cw)
max_pid = get_max_range(cw)
- for i, id_illust in enumerate(ids):
- try:
- info_illust = get_info('https://www.pixiv.net/en/artworks/{}'.format(id_illust), cw, depth=depth+1)
- except Exception as e:
- if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました' or type(e) == errors.LoginRequired): # logout during extraction
- raise e
- print_('process_ids error ({}):\n{}'.format(depth, print_error(e)[0]))
- continue
- imgs += info_illust['imgs']
+ class Thread(threading.Thread):
+ alive = True
+ rem = 0
+ def __init__(self, queue):
+ super().__init__(daemon=True)
+ self.queue = queue
+ @classmethod
+ @lock
+ def add_rem(cls, x):
+ cls.rem += x
+ def run(self):
+ while self.alive:
+ try:
+ id_, res, i = self.queue.popleft()
+ except Exception as e:
+ sleep(.1)
+ continue
+ try:
+ info_illust = get_info('https://www.pixiv.net/en/artworks/{}'.format(id_), cw, depth=depth+1)
+ res[i] = info_illust['imgs']
+ except Exception as e:
+ if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました' or type(e) == errors.LoginRequired): # logout during extraction
+ res[i] = e
+ print_('process_ids error ({}):\n{}'.format(depth, print_error(e)[0]))
+ finally:
+ Thread.add_rem(-1)
+ queue = deque()
+ n, step = Downloader_pixiv.STEP
+ print_('{} / {}'.format(n, step))
+ ts = []
+ for i in range(n):
+ t = Thread(queue)
+ t.start()
+ ts.append(t)
+ for i in range(0, len(ids), step):
+ res = [[]]*step
+ for j, id_illust in enumerate(ids[i:i+step]):
+ queue.append((id_illust, res, j))
+ Thread.add_rem(1)
+ while Thread.rem:
+ sleep(.001, cw)
+ for imgs_ in res:
+ if isinstance(imgs_, Exception):
+ raise imgs_
+ imgs += imgs_
s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
if cw:
cw.setTitle(s)
@ -377,3 +425,5 @@ def process_ids(ids, info, imgs, cw, depth=0):
break
if depth == 0:
check_alive(cw)
+ for t in ts:
+ t.alive = False
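# Note on the rewritten process_ids(): work is fanned out to a small pool of
# daemon threads (STEP = 8 workers, 32 ids per batch); each batch writes its
# results into a preallocated list, any stored exception is re-raised on the
# main thread, and the workers are stopped by clearing t.alive at the end.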

View File

@ -0,0 +1,530 @@
#coding:utf8
'''
Pornhub Downloader
'''
from __future__ import division, print_function, unicode_literals
from io import BytesIO
import os
import js2py
import downloader
import ree as re
from utils import (Downloader, Soup, try_n, LazyUrl, urljoin, get_print,
Session, get_max_range, filter_range, get_ext,
lock, format_filename, clean_title, get_resolution)
import clf2
import utils
from m3u8_tools import playlist2stream, M3u8_stream
class File(object):
'''
File
'''
def __init__(self, id_, title, url, url_thumb):
self.id_ = id_
self.title = clean_title('{}'.format(title))
self.url = url
ext = get_ext(self.url)
if ext.lower() == '.m3u8':
try:
self.url = playlist2stream(self.url, n_thread=4)
except:
self.url = M3u8_stream(self.url, n_thread=4)
self.url_thumb = url_thumb
self.thumb = BytesIO()
downloader.download(self.url_thumb, buffer=self.thumb)
if ext.lower() == '.m3u8':
ext = '.mp4'
self.filename = format_filename(self.title, self.id_, ext)
print('filename:', self.filename)
class Video(object):
'''
Video
'''
_url = None
filename = None
thumb = None
def __init__(self, url, cw, session):
self.url = LazyUrl(url, self.get, self)
self.cw = cw
self.session = session
def get(self, url):
'''
get
'''
cw = self.cw
session = self.session
print_ = get_print(cw)
if self._url:
return self._url
id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
re.find(r'/embed/(\w+)', url, re.IGNORECASE)
print('id: {}'.format(id_))
if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))
html = downloader.read_html(url, session=session)
soup = Soup(html)
soup = fix_soup(soup, url, session, cw)
html = str(soup)
# removed
if soup.find('div', class_='removed'):
raise Exception('removed')
gif = soup.find('div', {'id': 'gifImageSection'})
if gif:
print_('GIF')
id_ = url.split('/gif/')[1]
id_ = re.findall('[0-9a-zA-Z]+', id_)[0]
jss = list(gif.children)
for js in jss:
if 'data-mp4' in getattr(js, 'attrs', {}):
break
else:
raise Exception('gif mp4 url not found')
title = js['data-gif-title']
url = js['data-mp4']
url_thumb = re.find(r'https?://.+?.phncdn.com/pics/gifs/.+?\.jpg', html, err='no thumb')
file = File('gif_{}'.format(id_), title, url, url_thumb)
else:
if id_ is None:
raise Exception('no id')
print_('Video')
j = decode(html, cw)
# 1968
#title = j['video_title']
title = soup.find('h1', class_='title').text.strip()
url_thumb = j['image_url']
videos = []
for video in j['mediaDefinitions']:
url_ = video.get('videoUrl').strip()
ext = get_ext(url_)
if ext.lower() not in ['.mp4', '.m3u8']:
print('not mp4: {}'.format(ext))
continue
quality = video.get('quality', 0)
if isinstance(quality, list):
quality = quality[0]
video['quality'] = int(quality)
print_('[{}p] {}'.format(quality, url_))
videos.append(video)
if not videos:
raise Exception('No videos')
videos = sorted(videos, key=lambda video: video['quality'])
res = get_resolution()
videos_good = [video for video in videos if video['quality'] <= res]
if videos_good:
video = videos_good[-1]
else:
video = videos[0]
print_('\n[{}p] {}'.format(video['quality'], video['videoUrl']))
file = File(id_, title, video['videoUrl'].strip(), url_thumb)
self._url = file.url
self.title = file.title
self.filename = file.filename
self.thumb = file.thumb
return self._url
def is_login(session, cw=None, n=2):
'''
is_login
'''
print_ = get_print(cw)
print_('is_login {}'.format(n))
if n <= 0:
return False
url = 'https://www.pornhubpremium.com'
soup = downloader.read_soup(url, session=session)
soup = fix_soup(soup, url, session, cw)
html = str(soup)
if soup.find('ul', id='profileMenuDropdown'):
return True
return is_login(session, cw, n-1)
@Downloader.register
class Downloader_pornhub(Downloader):
'''
Downloader
'''
type = 'pornhub'
single = True
strip_header = False
URLS = ['pornhub.com', 'pornhubpremium.com']
def init(self):
self.session = Session() # 1791
if 'pornhub_gif_' in self.url:
self.url = 'https://www.pornhub.com/gif/{}'.format(
self.url.replace('pornhub_gif_', ''))
elif 'pornhub_album_' in self.url:
self.url = 'https://www.pornhub.com/album/{}'.format(
self.url.replace('pornhub_album_', ''))
elif 'pornhub_' in self.url:
self.url = 'https://www.pornhub.com/view_video.php?viewkey={}'\
.format(self.url.replace('pornhub_', ''))
if 'pornhubpremium.com' in self.url.lower() and\
not is_login(self.session, self.cw):
return self.Invalid('[Pornhub] Login cookies required')
@classmethod
def key_id(cls, url):
for domain in cls.URLS:
if domain in url:
id_ = domain + url.split(domain)[1]
break
else:
raise Exception('no id')
return id_.split('#')[0]
def read(self):
cw = self.cw
session = self.session
videos = []
tab = ''.join(self.url.replace('pornhubpremium.com', 'pornhub.com', 1).split('?')[0].split('#')[0].split('pornhub.com/')[-1].split('/')[2:3])
if '/album/' in self.url:
self.print_('Album')
info = read_album(self.url, session=session)
self.single = False
for photo in info['photos']:
self.urls.append(photo.url)
self.title = clean_title(info['title'])
elif '/photo/' in self.url:
self.print_('Photo')
info = read_photo(self.url, session=session)
for photo in info['photos']:
self.urls.append(photo.url)
self.title = info['title']
elif tab not in ['', 'videos']:
raise NotImplementedError(tab)
elif 'viewkey=' not in self.url.lower() and\
'/embed/' not in self.url.lower() and\
'/gif/' not in self.url.lower():
self.print_('videos')
info = get_videos(self.url, cw)
hrefs = info['hrefs']
self.print_('videos: {}'.format(len(hrefs)))
if not hrefs:
raise Exception('no hrefs')
videos = [Video(href, cw, session) for href in hrefs]
video = self.process_playlist(info['title'], videos)
self.setIcon(video.thumb)
self.enableSegment()
else:
video = Video(self.url, cw, session)
video.url()
self.urls.append(video.url)
self.setIcon(video.thumb)
self.title = video.title
self.enableSegment()
def fix_soup(soup, url, session=None, cw=None):
'''
fix_soup
'''
print_ = get_print(cw)
if soup.find('div', class_='logo'):
return soup
print_('invalid soup: {}'.format(url))
res = clf2.solve(url, session=session, cw=cw)
return Soup(res['html'])
class Photo(object):
'''
Photo
'''
def __init__(self, id_, url, referer):
self.id_ = id_
self.url = LazyUrl(referer, lambda x: url, self)
ext = os.path.splitext(url.split('?')[0])[1]
self.filename = '{}{}'.format(id_, ext)
@try_n(8)
def read_album(url, session=None):
'''
read_album
'''
soup = downloader.read_soup(url, session=session)
id_album = re.find('/album/([0-9]+)', url, err='no album id')
url_json = 'https://www.pornhub.com/album/show_album_json?album={}'.format(id_album)
data = downloader.read_json(url_json, url, session=session)
block = soup.find('div', class_='photoAlbumListBlock')
href = block.a.attrs['href']
id_ = re.find('/photo/([0-9]+)', href, err='no photo id')
ids = [id_]
while True:
item = data[id_]
id_ = item['next']
if id_ in ids:
break
ids.append(id_)
photos = []
for id_ in ids:
item = data[id_]
img = item['img_large']
referer = 'https://www.pornhub.com/photo/{}'.format(id_)
photo = Photo(id_, img, referer)
photos.append(photo)
info = {}
title = clean_title(soup.find('h1', class_='photoAlbumTitleV2').text)
info['title'] = format_filename(title, 'album_{}'.format(id_album))
info['photos'] = photos
return info
@try_n(8)
def read_photo(url, session=None):
'''
read_photo
'''
id_ = re.find('/photo/([0-9]+)', url, err='no photo id')
soup = downloader.read_soup(url, session=session)
div = soup.find('div', id='thumbSlider')
href = urljoin(url, div.find('a').attrs['href'])
info = read_album(href)
photos = []
for photo in info['photos']:
if str(photo.id_) == id_:
photos.append(photo)
info['photos'] = photos
info['title'] = '{} - {}'.format(info['title'], photos[0].filename)
return info
@try_n(4)
def get_videos(url, cw=None):
'''
get_videos
'''
print_ = get_print(cw)
if '/users/' in url:
mode = 'users'
username = url.split('/users/')[1].split('/')[0]
elif '/pornstar/' in url:
mode = 'pornstar'
username = url.split('/pornstar/')[1].split('/')[0]
elif '/model/' in url:
mode = 'model'
username = url.split('/model/')[1].split('/')[0]
elif '/channels/' in url:
mode = 'channels'
username = url.split('/channels/')[1].split('/')[0]
elif '/playlist/' in url:
mode = 'playlist'
username = url.split('/playlist/')[1].split('/')[0]
else:
raise Exception('Not supported url')
username = username.split('?')[0].split('#')[0]
session = Session()
if mode in ['pornstar']:
url_main = 'https://www.pornhub.com/{}/{}'.format(mode, username)
html = downloader.read_html(url_main, session=session)
soup = Soup(html)
soup = fix_soup(soup, url_main, session, cw)
for a in soup.findAll('a'):
if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
free = True
break
else:
free = False
print_('free: {}'.format(free))
# Range
max_pid = get_max_range(cw, 500)
max_pid = min(max_pid, 2000)#
html = downloader.read_html(url, session=session)
soup = fix_soup(Soup(html), url, session, cw)
info = {}
# get title
h1 = soup.find('h1')
if h1:
header = 'Playlist'
title = h1.find(id='watchPlaylist')
else:
title = None
if not title:
header = 'Channel'
profile = soup.find('div', class_='profileUserName')
wrapper = soup.find('div', class_='titleWrapper')
bio = soup.find('div', class_='withBio')
title = soup.find('h1', {'itemprop':'name'})
if not title and profile:
title = profile.a
if not title and wrapper:
title = wrapper.h1
if not title and bio:
title = bio.h1
if not title:
raise Exception('No title')
#print(title)
info['title'] = '[{}] {}'.format(header, title.text.strip())
token = re.find('''token *= *['"](.*?)['"]''', html)
print_('token: {}'.format(token))
# get links
hrefs = []
fail = 0
for p in range(1, 1+100):
try:
if mode in ['users', 'model']:
if mode == 'users':
url_api = 'https://www.pornhub.com/users/{}/videos/public/'\
'ajax?o=mr&page={}'.format(username, p)
elif mode == 'model':
url_api = 'https://www.pornhub.com/model/{}/videos/upload/'\
'ajax?o=mr&page={}'.format(username, p)
r = session.post(url_api)
soup = Soup(r.text)
if soup.find('h1'):
print('break: h1')
break
elif mode in ['pornstar']:
if free:
url_api = 'https://www.pornhub.com/{}/{}/videos/upload'\
'?page={}'.format(mode, username, p)
soup = downloader.read_soup(url_api, session=session)
soup = fix_soup(soup, url_api, session, cw)
soup = soup.find('div', class_='videoUList')
else:
url_api = 'https://www.pornhub.com/{}/{}?page={}'.format(mode, username, p)
soup = downloader.read_soup(url_api, session=session)
soup = fix_soup(soup, url_api, session, cw)
soup = soup.find('ul', class_='pornstarsVideos')
elif mode in ['channels']:
url_api = 'https://www.pornhub.com/{}/{}/videos?page={}'.format(mode, username, p)
soup = downloader.read_soup(url_api, session=session)
soup = fix_soup(soup, url_api, session, cw)
try:
soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
except:
break
elif mode in ['playlist']:
#url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(username, len(hrefs))
if token is None:
raise Exception('no token')
url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&token={}&page={}'.format(username, token, p)
soup = downloader.read_soup(url_api, session=session)
else:
raise NotImplementedError(mode)
fail = 0
except Exception as e:
print_(e)
fail += 1
if fail < 2:
continue
else:
break
finally:
print_('{} ({})'.format(url_api, len(hrefs)))
if cw and not cw.alive:
return
lis = soup.findAll('li', class_='videoblock')
if not lis:
print_('break: no lis')
break
if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
print_('Page Not Found')
break
c = 0
for li in lis:
a = li.find('a')
href = a.attrs['href']
href = urljoin(url, href)
if href in hrefs:
continue
c += 1
if href.startswith('javascript:'): # Remove Pornhub Premium
print(href)
continue
hrefs.append(href)
if c == 0:
print('c==0')
break
print(c) # 1320
if len(hrefs) >= max_pid:
break
if cw:
hrefs = filter_range(hrefs, cw.range)
info['hrefs'] = hrefs
return info
@lock
def decode(html, cw=None):
'''
decode
'''
print_ = get_print(cw)
print_('decode')
soup = Soup(html)
for script in soup.findAll('script'):
script = script.text or script.string or ''
script = script.strip()
if 'videoUrl' in script:
break
else:
raise Exception('No script')
flashvars = script.split()[1]
script = 'playerObjList={};' + script
context = js2py.EvalJs()
context.execute(script)
return context.eval(flashvars).to_dict()
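# decode() locates the inline player script that defines the flashvars_XXXX
# object, prepends a dummy playerObjList so the script can run standalone,
# evaluates it with js2py, and reads the resulting object back as a dict
# (consumed above for 'mediaDefinitions' and 'image_url').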

View File

@ -0,0 +1,133 @@
import downloader
import ree as re
import os
from utils import Downloader, urljoin, query_url, Soup, get_max_range, get_print, clean_title
from translator import tr_
try:
from urllib import quote # python2
except:
from urllib.parse import quote # python3
import sys
from timee import sleep
from constants import clean_url
LIMIT = 100
def get_tags(url):
url = clean_url(url)
qs = query_url(url)
if 'page=favorites' in url:
id = qs.get('id', ['N/A'])[0]
id = u'fav_{}'.format(id)
else:
tags = qs.get('tags', [])
tags.sort()
id = u' '.join(tags)
if not id:
id = u'N/A'
return id
@Downloader.register
class Downloader_rule34_xxx(Downloader):
type = 'rule34_xxx'
URLS = ['rule34.xxx']
MAX_CORE = 8
display_name = 'Rule34.xxx'
_name = None
def init(self):
if 'rule34.xxx' in self.url.lower():
self.url = self.url.replace('http://', 'https://')
else:
url = self.url
url = url.replace(' ', '+')
while '++' in url:
url = url.replace('++', '+')
url = quote(url)
url = url.replace('%2B', '+')
self.url = u'https://rule34.xxx/index.php?page=post&s=list&tags={}'.format(url)
@property
def name(self):
if self._name is None:
tags = get_tags(self.url)
self._name = tags
return clean_title(self._name)
def read(self):
self.title = self.name
imgs = get_imgs(self.url, self.name, cw=self.cw)
for img in imgs:
self.urls.append(img.url)
self.filenames[img.url] = img.filename
self.title = self.name
class Image(object):
def __init__(self, id_, url):
self.url = url
ext = os.path.splitext(url)[1]
self.filename = u'{}{}'.format(id_, ext)
def setPage(url, page):
# Always use HTTPS
url = url.replace('http://', 'https://')
# Change the page
if 'pid=' in url:
url = re.sub('pid=[0-9]*', 'pid={}'.format(page), url)
else:
url += '&pid={}'.format(page)
return url
def get_imgs(url, title=None, cw=None):
url = clean_url(url)
if 's=view' in url and 'page=favorites' not in url:
raise NotImplementedError('Not Implemented')
if 'page=dapi' not in url.lower():
tags = get_tags(url)
tags = quote(tags, safe='/')
tags = tags.replace('%20', '+')
url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT)
print_ = get_print(cw)
# Range
max_pid = get_max_range(cw)
imgs = []
ids = set()
for p in range(500): #1017
url = setPage(url, p)
print_(url)
html = downloader.read_html(url)
soup = Soup(html)
posts = soup.findAll('post')
if not posts:
break
for post in posts:
id_ = post.attrs['id']
if id_ in ids:
print('duplicate:', id_)
continue
ids.add(id_)
url_img = post.attrs['file_url']
img = Image(id_, url_img)
imgs.append(img)
if len(imgs) >= max_pid:
break
if cw is not None:
if not cw.alive:
break
cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
return imgs
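# Listings are read through the public DAPI endpoint
# (index.php?page=dapi&s=post&q=index) rather than the HTML pages: each request
# returns up to LIMIT (100) <post> elements with a file_url attribute, and the
# 'pid' parameter selects the page.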

View File

@ -0,0 +1,180 @@
#coding: utf8
import downloader
import json
from io import BytesIO
from utils import Downloader, LazyUrl, get_print, try_n, lock, clean_title
from error_printer import print_error
import os
from timee import sleep
import ffmpeg
import ytdl
from m3u8_tools import M3u8_stream
CLIENT_ID = None
@lock
def get_cid(force=False):
global CLIENT_ID
if CLIENT_ID is None or force:
print('update cid...')
d = ytdl.YoutubeDL()
e = ytdl.extractor.soundcloud.SoundcloudIE(d)
e._update_client_id()
CLIENT_ID = e._CLIENT_ID
return CLIENT_ID
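# SoundCloud's API requires a client_id that rotates periodically; rather than
# hard-coding one, this borrows ytdl's SoundcloudIE to obtain a fresh id and
# caches it in the module-level CLIENT_ID (refreshed when force=True).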
class Audio(object):
_url = None
def __init__(self, info, album_art, cw=None):
self.info = info
self.album_art = album_art
self.cw = cw
self.url = LazyUrl(info['webpage_url'], self.get, self, pp=self.pp)
def get(self, url):
print_ = get_print(self.cw)
if self._url:
return self._url
info = self.info
## ydl = ytdl.YoutubeDL()
## info = ydl.extract_info(url)
formats = info['formats']
print(formats)
formats = sorted(formats, key=lambda x: int(x.get('abr', 0)), reverse=True)
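# Prefer a direct HTTP(S) format (highest abr first); fall back to an HLS stream via M3u8_stream if none is found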
url_audio = None
for format in formats:
protocol = format['protocol']
print_(u'【{}】 format【{}】 abr【{}】'.format(protocol, format['format'], format.get('abr', 0)))
if not url_audio and protocol in ['http', 'https']:
url_audio = format['url']
if not url_audio:
url_audio = M3u8_stream(formats[0]['url'])
self.album_art = False#
self.username = info['uploader']
self.title = u'{} - {}'.format(self.username, info['title'])
self.filename = u'{}{}'.format(clean_title(self.title, allow_dot=True, n=-4), '.mp3')
thumb = None
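# Pick a thumbnail between 100 and 500 px wide (scanning the list in reverse) to use as album art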
for t in info['thumbnails'][::-1]:
width = t.get('width', 1080)
if not 100 <= width <= 500:
continue
url_thumb = t['url']
thumb = BytesIO()
try:
downloader.download(url_thumb, buffer=thumb)
break
except Exception as e:
print(e)
thumb = None
self.thumb = thumb
self._url = url_audio
return self._url
def pp(self, filename):
cw = self.cw
with cw.convert(self):
return self._pp(filename)
def _pp(self, filename):
if self.thumb and self.album_art:
self.thumb.seek(0)#
ffmpeg.add_cover(filename, self.thumb, {'artist':self.username, 'title':self.info['title']}, cw=self.cw)
@Downloader.register
class Downloader_soundcloud(Downloader):
type = 'soundcloud'
single = True
URLS = ['soundcloud.com']
#lock = True
audio = None
display_name = 'SoundCloud'
def init(self):
if 'soundcloud.com' in self.url.lower():
self.url = self.url.replace('http://', 'https://')
else:
self.url = 'https://soundcloud.com/{}'.format(self.url)
def read(self):
album_art = self.ui_setting.albumArt.isChecked()
info = get_audios(self.url, self.cw, album_art)
audios = info['audios']
if not audios:
raise Exception('no audios')
# first audio must be valid
while audios:
audio = audios[0]
try:
audio.url()
break
except Exception as e:
e_ = e
print(e)
audios.remove(audio)
else:
raise e_
if len(audios) > 1:
audio = self.process_playlist(info['title'], audios)
else:
self.urls.append(audio.url)
self.title = audio.title
self.artist = audio.username
self.setIcon(audio.thumb)
@try_n(2)
def get_audios(url, cw, album_art):
print_ = get_print(cw)
url = url.rstrip('/')
if url.count('/') == 3:
url += '/tracks'
info = {
#'extract_flat': True,
}
ydl = ytdl.YoutubeDL()
info = ydl.extract_info(url)
if 'entries' in info:
entries = info['entries']
title = info['title']
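# Infer the playlist kind from the '(...)' suffix in the extracted title; default to 'Playlist'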
for _type in ['All', 'Tracks', 'Albums', 'Sets', 'Reposts', 'Likes', 'Spotlight']:
x = '({})'.format(_type)
if x in title:
title = title.replace(x, '')
kind = _type
break
else:
kind = 'Playlist'
print_(u'kind: {}'.format(kind))
info['title'] = u'[{}] {}'.format(kind.capitalize(), title)
else:
entries = [info]
audios = []
for e in entries:
if '/sets/' in e['webpage_url']:
continue
audio = Audio(e, album_art, cw=cw)
audios.append(audio)
info['audios'] = audios
return info

View File

@@ -0,0 +1,250 @@
from __future__ import division, print_function, unicode_literals
import downloader
import ree as re
from utils import urljoin, Soup, LazyUrl, Downloader, try_n, compatstr, get_print, clean_title, Session, get_max_range
import os
import json
import ast
from io import BytesIO
import random
import clf2
from translator import tr_
from timee import sleep
from error_printer import print_error
import devtools
HDR = {'User-Agent': downloader.hdr['User-Agent']}
PATTERN_VID = '/(v|video)/(?P<id>[0-9]+)'
def is_captcha(soup):
return soup.find('div', class_="verify-wrap") is not None
@Downloader.register
class Downloader_tiktok(Downloader):
type = 'tiktok'
single = True
URLS = ['tiktok.com']
display_name = 'TikTok'
def init(self):
cw = self.cw
self.session = Session()
res = clf2.solve(self.url, self.session, cw)
soup = Soup(res['html'])
if is_captcha(soup):
def f(html):
return not is_captcha(Soup(html))
clf2.solve(self.url, self.session, cw, show=True, f=f)
@classmethod
def fix_url(cls, url):
url = url.split('?')[0].split('#')[0].strip('/')
if 'tiktok.com' not in url.lower():
url = 'https://www.tiktok.com/@{}'.format(url)
return url
def read(self):
format = compatstr(self.ui_setting.youtubeFormat.currentText()).lower().strip()
if re.search(PATTERN_VID, self.url) is None:
info = read_channel(self.url, self.session, self.cw)
items = info['items']
videos = [Video('https://www.tiktok.com/@{}/video/{}'.format(info['uid'], item['id']), self.session, format) for item in items]
title = '{} (tiktok_{})'.format(info['nickname'], info['uid'])
video = self.process_playlist(title, videos)
else:
video = Video(self.url, self.session, format)
video.url()
self.urls.append(video.url)
self.title = clean_title(video.title)
self.setIcon(video.thumb)
class Video(object):
_url = None
def __init__(self, url, session, format='title (id)'):
self.url = LazyUrl(url, self.get, self)
self.session = session
self.format = format
@try_n(2)
def get(self, url):
if self._url:
return self._url
m = re.search(PATTERN_VID, url)
id = m.group('id')
ext = '.mp4'
self.title = id#
self.filename = '{}{}'.format(clean_title(self.title, n=-len(ext)), ext)
html = downloader.read_html(url, session=self.session)
soup = Soup(html)
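# The page embeds its state as JSON in the __NEXT_DATA__ script tag; the download URL and cover come from itemInfo.itemStruct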
data = soup.find(id='__NEXT_DATA__')
props = data.contents[0]
data_encode = json.dumps(props)
ast_le = ast.literal_eval(data_encode)
data = json.loads(ast_le)
#info = data['props']['pageProps']['videoData']['itemInfos']
info = data['props']['pageProps']['itemInfo']['itemStruct']
self._url = info['video']['downloadAddr']
self.url_thumb = info['video']['cover']
self.thumb = BytesIO()
downloader.download(self.url_thumb, referer=url, buffer=self.thumb)
return self._url
def read_channel(url, session, cw=None):
print_ = get_print(cw)
info = {}
info['items'] = []
ids = set()
info['items'] = []
sd = {
'count_empty': 0,
'shown': False,
}
max_pid = get_max_range(cw)
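# f() runs on each page snapshot from clf2: it scrolls the profile, collects video ids,
# and returns True to stop once max_pid items are collected or several scrolls add nothing new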
def f(html, browser=None):
soup = Soup(html)
if is_captcha(soup):
print('captcha')
browser.show()
sd['shown'] = True
elif sd['shown']:
browser.hide()
sd['shown'] = False
try:
info['uid'] = soup.find('h2', class_='share-title').text.strip()
info['nickname'] = soup.find('h1', class_='share-sub-title').text.strip()
except Exception as e:
print_(print_error(e)[0])
c = 0
ids_now = set()
for div in soup.findAll('div', class_='video-feed-item'):
a = div.find('a')
if a is None:
continue
href = a['href']
if not href:
continue
m = re.search(PATTERN_VID, href)
if m is None:
continue
id_video = int(m.group('id'))
ids_now.add(id_video)
if id_video in ids:
continue
ids.add(id_video)
info['items'].append({'id': id_video})
c += 1
print_('items: {}'.format(len(info['items'])))
if len(info['items']) >= max_pid:
info['items'] = info['items'][:max_pid]
return True
browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
sleep(15, cw)
if c or (ids_now and min(ids_now) > min(ids)):
sd['count_empty'] = 0
else:
print_('empty')
sd['count_empty'] += 1
msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items']))
if cw:
if not cw.alive:
raise Exception('cw dead')
cw.setTitle(msg)
else:
print(msg)
return sd['count_empty'] > 4
res = clf2.solve(url, session, cw, f=f, timeout=1800, show=True)
if not info['items']:
raise Exception('no items')
return info
@try_n(2)
def read_channel_legacy(url, session, cw=None):
print_ = get_print(cw)
html = downloader.read_html(url, session=session, headers=HDR)
uid = re.find('//user/profile/([0-9]+)', html, err='no uid')
secUid = re.find('"secUid" *: *"([^"]+?)"', html, err='no secUid')
verifyFp = ''.join(random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for i in range(16))
maxCursor = 0
info = {}
info['items'] = []
ids = set()
for i in range(100):
url_api = 'https://t.tiktok.com/api/item_list/?count=30&id={uid}&type=1&secUid={secUid}&maxCursor={maxCursor}&minCursor=0&sourceType=8&appId=1180&region=US&language=en&verifyFp={verifyFp}'.format(uid=uid, secUid=secUid, verifyFp=verifyFp, maxCursor=maxCursor)
js = 'window.byted_acrawler.sign({url:"{}"});'.replace('{}', url_api)
print(js)
for try_ in range(4):
try:
sign = devtools.eval_js(url, js, session)['output']
break
except Exception as e:
print(e)
e_ = e
else:
raise e_
url_api += '&_signature=' + sign
print_(url_api)
data_raw = downloader.read_html(url_api, url, session=session, headers=HDR)
data = json.loads(data_raw)
items = []
for item in data.get('items', []):
id_video = item['id']
if id_video in ids:
print('duplicate:', id_video)
continue
ids.add(id_video)
items.append(item)
if not items:
print('no items')
break
info['items'] += items
if i == 0:
info['uid'] = items[0]['author']['uniqueId']
info['nickname'] = items[0]['author']['nickname']
msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info['nickname'], info['uid'], len(info['items']))
if cw:
if not cw.alive:
break
cw.setTitle(msg)
else:
print(msg)
if not data['hasMore']:
break
maxCursor = data['maxCursor']
if not info['items']:
raise Exception('no items')
return info

View File

@@ -0,0 +1,100 @@
#coding:utf8
import downloader
from utils import Soup, urljoin, Downloader, cut_pair, LazyUrl, clean_title
from timee import sleep
from translator import tr_
from io import BytesIO
import ree as re
import os
@Downloader.register
class Downloader_tokyomotion(Downloader):
type = 'tokyomotion'
URLS = ['tokyomotion.net']
single = True
_type = None
display_name = 'TOKYO Motion'
def init(self):
html = downloader.read_html(self.url)
self.soup = Soup(html)
if '/album/' in self.url:
self._type = 'album'
else:
self._type = 'video'
@property
def name(self):
title = get_title(self.soup)
return clean_title(title)
def read(self):
if self._type == 'video':
video = get_video(self.url, self.soup)
self.urls.append(video.url)
self.setIcon(video.thumb)
elif self._type == 'album':
imgs = get_imgs(self.url)
for img in imgs:
self.urls.append(img.url)
self.single = False
else:
raise NotImplementedError('Unknown type: {}'.format(self._type))
self.title = self.name
class Video(object):
def __init__(self, url, url_thumb, referer, filename):
self.url = LazyUrl(referer, lambda x: url, self)
self.url_thumb = url_thumb
self.thumb = BytesIO()
downloader.download(url_thumb, referer=referer, buffer=self.thumb)
self.filename = filename
def get_title(soup):
video = soup.find('video', id='vjsplayer')
if video:
title = soup.find('h3').text.strip()
else:
title = soup.find('title').text.split(' Album - ')[0].strip()
return title
def get_video(url, soup=None):
if soup is None:
html = downloader.read_html(url)
soup = Soup(html)
video = soup.find('video', id='vjsplayer').find('source').attrs['src']
url_thumb = soup.find('video', id='vjsplayer').attrs['poster']
title = get_title(soup)
filename = u'{}.mp4'.format(clean_title(title))
video = Video(video, url_thumb, url, filename)
return video
class Image(object):
def __init__(self, url, referer):
self.url = LazyUrl(referer, lambda x: url, self)
self.filename = os.path.basename(url.split('?')[0])
def get_imgs(url):
id = re.find('album/.*?([0-9]+)', url)
print('id:', id)
url = 'https://www.tokyomotion.net/album/slideshow/{}'.format(id)
html = downloader.read_html(url)
soup = Soup(html)
imgs = []
for a in soup.findAll('a', {'data-lightbox': 'slideshow-{}'.format(id)}):
img = a.find('img').attrs['src']
img = img.replace('/tmb/', '/')
img = Image(img, url)
imgs.append(img)
return imgs

View File

@@ -1,4 +1,4 @@
-from utils import Downloader, speed_text, clean_title
+from utils import Downloader, clean_title
import constants, os, downloader
from size import Size
try:
@@ -54,9 +54,10 @@ class Downloader_torrent(Downloader):
if not files:
raise Exception('No files')
cw.single = self.single = len(files) == 1
-for file in files:
-filename = os.path.join(self.dir, file)
-cw.imgs.append(filename)
+if not cw.imgs:
+for file in files:
+filename = os.path.join(self.dir, file)
+cw.imgs.append(filename)
def start_(self):
cw = self.cw
@@ -81,8 +82,11 @@ class Downloader_torrent(Downloader):
if cw.alive:
cw.setSpeed('')
if cw.pause_lock and cw.pbar.value() < cw.pbar.maximum():
-cw.pause_data = {'type': self.type, 'url': self.url,
-'filesize': self._filesize_prev}
+cw.pause_data = {
+'type': self.type,
+'url': self.url,
+'filesize': self._filesize_prev,
+}
cw.paused = True
cw.pause_lock = False
self.update_tools_buttons()
@@ -110,8 +114,8 @@ class Downloader_torrent(Downloader):
cw.dones.add(file)
file = constants.compact(file).replace('\\', '/')
files = file.split('/')
-file = (u' / ').join(files[1:])
-msg = (u'Completed: {}').format(file)
+file = ' / '.join(files[1:])
+msg = 'Completed: {}'.format(file)
self.print_(msg)
if i == 0:
for try_ in range(4):
@@ -126,20 +130,20 @@ class Downloader_torrent(Downloader):
downloader.total_download_size += d_size
cw.pbar.setValue(s.progress * MAX_PBAR)
if s.state_str == 'queued':
-title_ = (u'Waiting... {}').format(title)
+title_ = 'Waiting... {}'.format(title)
elif s.state_str == 'checking files':
-title_ = (u'Checking files... {}').format(title)
+title_ = 'Checking files... {}'.format(title)
self._filesize_prev = filesize
elif s.state_str == 'downloading':
-title_ = (u'{} (p: {}, s: {})').format(title, s.num_peers, s.num_seeds)
+title_ = '{} (p: {}, s: {})'.format(title, s.num_peers, s.num_seeds)
cw.setFileSize(filesize)
text = self.size.speed_text()
cw.setSpeed(text)
elif s.state_str == 'seeding':
-title_ = (u'{}').format(title)
+title_ = '{}'.format(title)
cw.setFileSize(filesize)
else:
-title_ = (u'{}... {}').format(s.state_str.capitalize(), title)
+title_ = '{}... {}'.format(s.state_str.capitalize(), title)
cw.setTitle(title_, update_filter=False)
else:
return 'abort'

View File

@@ -0,0 +1,204 @@
#coding:utf8
import downloader
from translator import tr_
from utils import Soup, Session, query_url, get_max_range, Downloader, clean_title, update_url_query, get_print, get_ext, LazyUrl
import ree as re
import errors
from ratelimit import limits, sleep_and_retry
from error_printer import print_error
class Image(object):
def __init__(self, url, id, p=0, cw=None):
self._url = url
self.id_ = id
self.p = p
self.cw = cw
self.url = LazyUrl(url, self.get, self)
def get(self, _):
print_ = get_print(self.cw)
url = self._url
ext = get_ext(url)
if ext.lower() == '.gif':
print_('get_ext: {}, {}'.format(self.id_, url))
try:
ext = downloader.get_ext(url)
except Exception as e: #3235
print_('Err: {}, {}\n'.format(self.id_, url)+print_error(e)[0])
self.filename = '{}_p{}{}'.format(self.id_, self.p, ext)
return url
@Downloader.register
class Downloader_tumblr(Downloader):
type = 'tumblr'
URLS = ['tumblr.com']
def init(self):
if u'tumblr.com/post/' in self.url:
return self.Invalid(tr_(u'개별 다운로드는 지원하지 않습니다: {}').format(self.url))
self.session = Session()
@classmethod
def fix_url(cls, url):
id = get_id(url)
return 'https://{}.tumblr.com'.format(id)
def read(self):
username = get_id(self.url)
name = get_name(username, self.session)
for img in get_imgs(username, self.session, cw=self.cw):
self.urls.append(img.url)
self.title = clean_title('{} (tumblr_{})'.format(name, username))
class TumblrAPI(object):
_url_base = 'https://www.tumblr.com/api'
_hdr = {
'referer': 'https://www.tumblr.com',
'authorization': 'Bearer aIcXSOoTtqrzR8L8YEIOmBeW94c3FmbSNSWAUbxsny9KKx5VFh',
}
_qs = {
'fields[blogs]': 'name,avatar,title,url,is_adult,?is_member,description_npf,uuid,can_be_followed,?followed,?advertiser_name,is_paywall_on,theme,subscription_plan,?primary,share_likes,share_following,can_subscribe,subscribed,ask,?can_submit,?is_blocked_from_primary,?tweet,?admin,can_message,?analytics_url,?top_tags,paywall_access',
'npf': 'true',
'reblog_info': 'false',
'include_pinned_posts': 'false',
#'page_number': None,
}
def __init__(self, session, cw=None):
self.session = session
self.cw = cw
def print_(self, s):
get_print(self.cw)(s)
@sleep_and_retry
@limits(1, 1)
def call(self, path, qs, default_qs=True):
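# Rate-limited (one request per second) call to the Tumblr web API; the default query string is merged in unless default_qs is False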
if default_qs:
qs_new = qs
qs = self._qs.copy()
qs.update(qs_new)
url = self._url_base + path
url = update_url_query(url, qs)
r = self.session.get(url, headers=self._hdr)
data = r.json()
errs = data.get('errors', [])
if errs:
code = int(errs[0]['code'])
if code == 0:
raise Exception('Not found')
elif code == 4012:
raise errors.LoginRequired(errs[0]['detail'])
r.raise_for_status()
return data['response']
def name(self, username):
path = '/v2/blog/{}/posts'.format(username)
data = self.call(path, {})
return data['blog']['title'] or data['blog']['name']
def posts(self, username):
path = '/v2/blog/{}/posts'.format(username)
qs = {}
ids = set()
default_qs = True
while True:
if self.cw and not self.cw.alive:
break
data = self.call(path, qs, default_qs=default_qs)
for post in data['posts']:
id_ = post['id']
if id_ in ids:
self.print_('duplicate: {}'.format(id_))
continue
ids.add(id_)
yield Post(post, self.cw)
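# Pagination follows the next-page href returned by the API; it is used as-is, so default_qs is disabled for subsequent requests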
try:
links = data.get('links') or data['_links']
path_next = links['next']['href']
except:
path_next = None
if path_next:
path = path_next
default_qs = False
else:
break
class Post(object):
def __init__(self, data, cw=None):
id_ = data['id']
self.imgs = []
cs = data['content']
for trail in data['trail']:
cs += trail['content']
for c in cs:
if c['type'] in ['image', 'video']:
media = c.get('media')
if not media: #2859
continue
if isinstance(media, list):
media = media[0]
img = media['url']
self.imgs.append(Image(img, id_, len(self.imgs), cw))
elif c['type'] in ['text', 'link', 'audio']:
continue
else:
raise NotImplementedError(id_, c)
def get_name(username, session):
return TumblrAPI(session).name(username)
def get_imgs(username, session, cw=None):
print_ = get_print(cw)
artist = get_name(username, session)
imgs = []
error_count = 0
max_pid = get_max_range(cw)
api = TumblrAPI(session, cw)
for post in api.posts(username):
imgs += post.imgs
s = '{} {} (tumblr_{}) - {}'.format(tr_(u'읽는 중...'), artist, username, len(imgs))
if cw:
if not cw.alive:
return
cw.setTitle(s)
else:
print(s)
if len(imgs) > max_pid:
break
return imgs[:max_pid]
def get_id(url):
if '/dashboard/blog/' in url:
url = re.find('/dashboard/blog/([0-9a-zA-Z_-]+)', url)
if '/login_required/' in url:
url = url.split('/login_required/')[1].split('?')[0].split('/')[0]
if 'tumblr.com/blog/view/' in url:
url = url.split('tumblr.com/blog/view/')[1]
if 'tumblr.com' in url:
if 'www.tumblr.com' in url:
qs = query_url(url)
url = qs.get('url', [url])[0]
url = url.split('.tumblr.com')[0].split('/')[-1]
if url == 'www':
raise Exception('no id')
return url

View File

@@ -275,6 +275,7 @@ class TwitterAPI(object):
return
params["cursor"] = cursor
if params.get("cursor") is None: # nothing
+print_('no cursor')
break
@@ -328,7 +329,8 @@ def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage',
names[id_].append(name)
else:
names[id_] = [name]
-max_id = max(ids) if ids else 0
+ids_sure = sorted(ids)[:-100]
+max_id = max(ids_sure) if ids_sure else 0 #3201
# 2303
imgs_old = []
@@ -341,23 +343,23 @@ def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage',
imgs_new = []
enough = False
+c_old = 0
for tweet in TwitterAPI(session, cw).timeline_media(username):
id_ = int(tweet['id_str'])
if id_ < max_id:
print_('enough')
enough = True
break
-imgs_ = get_imgs_from_tweet(tweet, session, types, format, cw)
if id_ in ids:
print_('duplicate: {}'.format(id_))
+c_old += 1
continue
ids.add(id_)
-imgs_new += imgs_
+imgs_new += get_imgs_from_tweet(tweet, session, types, format, cw)
-if len(imgs_old) + len(imgs_new) >= n:
+if len(imgs_new) + c_old >= n: #3201
break
msg = '{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs_new))
@@ -368,7 +370,7 @@ def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage',
else:
print(msg)
-if not enough and not imgs_new:
+if not enough and not imgs_new and c_old == 0:
raise Exception('no imgs')
imgs = sorted(imgs_old + imgs_new, key=lambda img: img.id, reverse=True)

View File

@@ -0,0 +1,103 @@
#coding:utf8
from __future__ import division, print_function, unicode_literals
import downloader
from utils import Soup, get_ext, LazyUrl, Downloader, try_n, clean_title, get_print
import ree as re
from translator import tr_
from timee import sleep
import errors
def setPage(url, p):
url = url.split('?')[0]
if p > 1:
url += '?page={}'.format(p)
return url
def getPage(url):
p = re.find('page=([0-9]+)', url)
return int(p or 1)
class Image(object):
def __init__(self, url, referer, p):
self.url = LazyUrl(referer, lambda x: url, self)
ext = get_ext(url)
self.filename = '{:04}{}'.format(p, ext)
@Downloader.register
class Downloader_v2ph(Downloader):
type = 'v2ph'
URLS = ['v2ph.com/album/']
MAX_CORE = 4
display_name = 'V2PH'
@classmethod
def fix_url(cls, url):
return url.split('?')[0]
def read(self):
info = get_info(self.url)
for img in get_imgs(self.url, info['title'], self.cw):
self.urls.append(img.url)
self.title = clean_title(info['title'])
@try_n(2)
def get_info(url):
html = downloader.read_html(url)
soup = Soup(html)
info = {}
info['title'] = soup.find('h1').text.strip()
return info
def get_imgs(url, title, cw=None):
print_ = get_print(cw)
imgs = []
for p in range(1, 1001):
url = setPage(url, p)
print_(url)
for try_ in range(4):
try:
html = downloader.read_html(url, user_agent=downloader.hdr['User-Agent'])
#sleep(1)
break
except Exception as e:
print(e)
else:
raise
soup = Soup(html)
view = soup.find('div', class_='photos-list')
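# A missing photo list means login is required on the first page; on later pages it marks the end of what a guest can see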
if view is None:
if p == 1:
raise errors.LoginRequired()
else:
break # Guest user
for img in view.findAll('img'):
img = img.attrs['data-src']
img = Image(img, url, len(imgs))
imgs.append(img)
pgn = soup.find('ul', class_='pagination')
ps = [getPage(a.attrs['href']) for a in pgn.findAll('a')]
if p >= max(ps):
print('max p')
break
msg = '{} {} ({} / {})'.format(tr_('읽는 중...'), title, p, max(ps))
if cw:
cw.setTitle(msg)
else:
print(msg)
return imgs

View File

@@ -0,0 +1,58 @@
import downloader
import ree as re
from io import BytesIO as IO
from error_printer import print_error
from utils import Downloader, LazyUrl, get_ext, format_filename, try_n
import ytdl
@Downloader.register
class Downloader_vimeo(Downloader):
type = 'vimeo'
URLS = ['vimeo.com']
single = True
def init(self):
if 'vimeo.com' not in self.url.lower():
self.url = u'https://vimeo.com/{}'.format(self.url)
def read(self):
video = Video(self.url)
video.url()#
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
class Video(object):
_url = None
def __init__(self, url):
self.url = LazyUrl(url, self.get, self)
@try_n(4)
def get(self, url):
if self._url:
return self._url
ydl = ytdl.YoutubeDL()
info = ydl.extract_info(url)
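# Keep only direct HTTP(S) formats and pick the widest one; streaming-only formats are ignored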
fs = [f for f in info['formats'] if f['protocol'] in ['http', 'https']]
fs = sorted(fs, key=lambda f: int(f.get('width', 0)), reverse=True)
if not fs:
raise Exception('No MP4 videos')
f = fs[0]
self._url = f['url']
self.thumb_url = info['thumbnails'][0]['url']
self.thumb = IO()
downloader.download(self.thumb_url, buffer=self.thumb)
self.title = info['title']
ext = get_ext(self._url)
self.filename = format_filename(self.title, info['id'], ext)
return self._url

View File

@@ -0,0 +1,76 @@
import downloader
import ytdl
from utils import Downloader, try_n, LazyUrl, get_ext, format_filename, clean_title
from io import BytesIO
import ree as re
from m3u8_tools import M3u8_stream
import os
@Downloader.register
class Downloader_vlive(Downloader):
type = 'vlive'
URLS = ['vlive.tv']
single = True
display_name = 'V LIVE'
def init(self):
if 'channels.vlive.tv' in self.url:
raise NotImplementedError('channel')
def read(self):
video = get_video(self.url)
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = clean_title(video.title)
@try_n(4)
def get_video(url):
options = {
'noplaylist': True,
}
ydl = ytdl.YoutubeDL(options)
info = ydl.extract_info(url)
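# Rank mp4 formats by vbr, falling back to the NNNp tag in the format name, and take the highest quality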
fs = []
for f in info['formats']:
if f['ext'] != 'mp4':
continue
f['quality'] = f.get('vbr') or re.find('([0-9]+)p', f['format'], re.IGNORECASE)
print(f['format'], f['quality'])
fs.append(f)
if not fs:
raise Exception('No videos')
f = sorted(fs, key=lambda f:f['quality'])[-1]
video = Video(f, info)
return video
class Video(object):
def __init__(self, f, info):
self.title = title = info['title']
self.id = info['id']
self.url = f['url']
self.thumb = BytesIO()
downloader.download(info['thumbnail'], buffer=self.thumb)
ext = get_ext(self.url)
if ext.lower() == '.m3u8':
raise NotImplementedError('stream')#
url = M3u8_stream(self.url, n_thread=4)
else:
url = self.url
self.url = LazyUrl(self.url, lambda x: url, self)
self.filename = format_filename(title, self.id, ext)

View File

@@ -0,0 +1,147 @@
import downloader
from utils import Soup, LazyUrl, clean_title, get_ext, get_imgs_already, urljoin, try_n, Downloader
import os
import page_selector
from translator import tr_
import ree as re
@Downloader.register
class Downloader_webtoon(Downloader):
type = 'webtoon'
URLS = ['webtoon.com', 'webtoons.com']
MAX_CORE = 8
MAX_SPEED = 4.0
display_name = 'WEBTOON'
def init(self):
self.url = get_main(self.url)
self.soup = downloader.read_soup(self.url)
@classmethod
def fix_url(cls, url):
return url.replace('webtoon.com', 'webtoons.com')
def read(self):
title = clean_title(self.soup.find('h1').text.strip())
self.title = tr_(u'읽는 중... {}').format(title)
imgs = get_imgs_all(self.url, title, cw=self.cw)
for img in imgs:
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = title
class Page(object):
def __init__(self, url, title):
self.url = url
self.title = title
class Image(object):
def __init__(self, url, page, p):
ext = get_ext(url) or downloader.get_ext(url, referer=page.url)
self.filename = '{}/{:04}{}'.format(clean_title(page.title), p, ext)
self.url = LazyUrl(page.url, lambda _: url, self)
@try_n(2)
def get_imgs(page):
html = downloader.read_html(page.url)
if 'window.__motiontoonViewerState__' in html:
raise NotImplementedError('motiontoon')
soup = Soup(html)
view = soup.find('div', class_='viewer_img')
imgs = []
for img in view.findAll('img'):
src = img.get('data-url') or img['src']
img = Image(urljoin(page.url, src), page, len(imgs))
imgs.append(img)
return imgs
def get_main(url):
if 'episode_no=' in url:
soup = downloader.read_soup(url)
url = urljoin(url, soup.find('div', class_='subj_info').find('a')['href'])
return url
def set_page(url, p):
if '&page=' not in url:
url = url + '&page={}'.format(p)
else:
url = re.sub('&page=[0-9]+', '&page={}'.format(p), url)
if p == 1:
url = url.replace('&page=1', '')
return url
def get_pages(url):
pages = []
urls = set()
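# Walk the paginated episode list until a page adds no new episodes; pages are returned oldest first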
for p in range(1, 101):
url_page = set_page(url, p)
print(url_page)
for try_ in range(4):
try:
soup = downloader.read_soup(url_page)
view = soup.find('ul', id='_listUl')
if view is None:
raise Exception('no view')
break
except Exception as e:
e_ = e
print(e)
else:
raise e_
pages_new = []
for li in view.findAll('li', recursive=False):
href = urljoin(url, li.find('a')['href'])
title = li.find('span', class_='subj').text.strip()
if href in urls:
continue
urls.add(href)
no = int(li['data-episode-no'])
title = '{:04} - {}'.format(no, title)
page = Page(href, title)
pages_new.append(page)
if not pages_new:
break
pages += pages_new
return pages[::-1]
@page_selector.register('webtoon')
@try_n(4)
def f(url):
url = get_main(url)
return get_pages(url)
def get_imgs_all(url, title, cw=None):
pages = get_pages(url)
pages = page_selector.filter(pages, cw)
imgs = []
for p, page in enumerate(pages):
imgs_already = get_imgs_already('webtoon', title, page, cw)
if imgs_already:
imgs += imgs_already
continue
imgs += get_imgs(page)
msg = tr_(u'읽는 중... {} / {} ({}/{})').format(title, page.title, p + 1, len(pages))
if cw is not None:
cw.setTitle(msg)
if not cw.alive:
break
else:
print(msg)
return imgs

View File

@@ -0,0 +1,180 @@
#coding:utf8
import downloader
import ree as re
from timee import sleep, clock, time
from constants import clean_url
from utils import Downloader, urljoin, try_n, Session, get_print, clean_title, Soup, fix_protocol
import os
from translator import tr_
import json
from datetime import datetime
import constants
import clf2
import errors
@Downloader.register
class Downloader_weibo(Downloader):
type = 'weibo'
URLS = ['weibo.com', 'weibo.cn']
def init(self):
self.session = Session()
@classmethod
def fix_url(cls, url):
url = url.replace('weibo.cn', 'weibo.com').split('?')[0]
if 'weibo.com/p/' in url:
id = re.findall('weibo.com/p/([^/]+)', url)[0]
url = 'https://weibo.com/p/{}'.format(id)
elif 'weibo.com/u/' in url:
id = re.findall('weibo.com/u/([^/]+)', url)[0]
url = 'https://weibo.com/u/{}'.format(id)
elif 'weibo.com/' in url:
id = re.findall('weibo.com/([^/]+)', url)[0]
url = 'https://weibo.com/{}'.format(id)
else:
id = url
url = 'https://weibo.com/u/{}'.format(id)
url = fix_protocol(url)
return url
def read(self):
checkLogin(self.session)
uid, oid, name = get_id(self.url, self.cw)
title = clean_title('{} (weibo_{})'.format(name, uid))
for img in get_imgs(uid, oid, title, self.session, cw=self.cw, d=self, parent=self.mainWindow):
self.urls.append(img.url)
self.filenames[img.url] = img.filename
self.title = title
def checkLogin(session):
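# A missing or expired SUBP cookie for .weibo.com means the session is not logged in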
c = session.cookies._cookies.get('.weibo.com', {}).get('/',{}).get('SUBP')
if not c or c.is_expired():
raise errors.LoginRequired()
class Album(object):
def __init__(self, id, type):
self.id = id
self.type = type
class Image(object):
def __init__(self, url, filename=None, timestamp=0):
self.url = url
if filename is None:
filename = os.path.basename(url)
self.filename = filename
self.timestamp = timestamp
def _get_page_id(html):
m = re.search("CONFIG\\['page_id'\\]='([0-9]+?)'", html)
return m
def get_id(url, cw=None):
for try_ in range(2):
try:
res = clf2.solve(url, cw=cw, f=_get_page_id)
html = res['html']
soup = Soup(html)
if soup.find('div', class_='gn_login'):
raise errors.LoginRequired()
m = _get_page_id(html)
if not m:
raise Exception('no page_id')
oid = m.groups()[0]
uids = re.findall('uid=([0-9]+)', html)
uid = max(set(uids), key=uids.count)
name = re.findall("CONFIG\\['onick'\\]='(.+?)'", html)[0]
break
except errors.LoginRequired as e:
raise
except Exception as e:
e_ = e
print(e)
else:
raise e_
return uid, oid, name
def get_imgs(uid, oid, title, session, cw=None, d=None, parent=None):
print_ = get_print(cw)
print_('uid: {}, oid:{}'.format(uid, oid))
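# Enumerate every album via albums/get_all, then page through each album's photos with photos/get_all (30 per page)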
@try_n(4)
def get_album_imgs(album, page):
url = 'https://photo.weibo.com/photos/get_all?uid={}&album_id={}&count=30&page={}&type={}&__rnd={}'.format(uid, album.id, page, album.type, int(time()*1000))
referer = 'https://photo.weibo.com/{}/talbum/index'.format(uid)
html = downloader.read_html(url, referer, session=session, timeout=30)
j = json.loads(html)
data = j['data']
imgs = []
for photo in data['photo_list']:
host = photo['pic_host']
name = photo['pic_name']
id = photo['photo_id']
timestamp = photo['timestamp']
date = datetime.fromtimestamp(timestamp)
t = '{:02}-{:02}-{:02}'.format(date.year % 100, date.month, date.day)
url = '{}/large/{}'.format(host, name)
ext = os.path.splitext(name)[1]
filename = '[{}] {}{}'.format(t, id, ext)
img = Image(url, filename, timestamp)
imgs.append(img)
return imgs
def get_albums(page):
url = 'https://photo.weibo.com/albums/get_all?uid={}&page={}&count=20&__rnd={}'.format(uid, page, int(time()*1000))
referer = 'https://photo.weibo.com/{}/albums?rd=1'.format(uid)
html = downloader.read_html(url, referer, session=session)
j = json.loads(html)
data = j['data']
albums = []
for album in data['album_list']:
id = album['album_id']
type = album['type']
album = Album(id, type)
albums.append(album)
return albums
albums = []
for p in range(1, 101):
albums_new = get_albums(p)
albums += albums_new
print_('p:{}, albums:{}'.format(p, len(albums)))
if not albums_new:
break
imgs = []
for album in albums:
print('Album:', album.id, album.type)
for p in range(1, 101):
imgs_new = get_album_imgs(album, p)
imgs += imgs_new
s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
if cw:
if not cw.alive:
return []
cw.setTitle(s)
else:
print(s)
if not imgs_new:
break
sleep(1)
imgs = sorted(imgs, key=lambda img: img.timestamp, reverse=True)
return imgs