This commit is contained in:
KurtBestor 2021-04-05 23:38:47 +09:00
parent a69b757610
commit bcd33f9118
41 changed files with 6013 additions and 34 deletions

Binary file not shown.
(Image size: 475 KiB before → 974 KiB after.)

View File

@@ -14,7 +14,18 @@ import math
import ree as re
import utils
from collections import OrderedDict
_VALID_URL = 'https?://(?:www\\.|bangumi\\.|)bilibili\\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\\d+)/play#)(?P<id>\\d+)'
_VALID_URL = r'''(?x)
https?://
(?:(?:www|bangumi)\.)?
bilibili\.(?:tv|com)/
(?:
(?:
video/[aA][vV]|
anime/(?P<anime_id>\d+)/play\#
)(?P<id_bv>\d+)|
video/[bB][vV](?P<id>[^/?#&]+)
)
'''
_APP_KEY = 'iVGUTjsxvpLeuDCf'
_BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
RESOLS = OrderedDict()
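
For context, the reworked _VALID_URL accepts both the legacy numeric av form and the newer BV form. A minimal standalone sketch of the match, using the stdlib re module in place of the project's ree wrapper, with two hypothetical video URLs:

import re

_VALID_URL = r'''(?x)
    https?://
    (?:(?:www|bangumi)\.)?
    bilibili\.(?:tv|com)/
    (?:
        (?:
            video/[aA][vV]|
            anime/(?P<anime_id>\d+)/play\#
        )(?P<id_bv>\d+)|
        video/[bB][vV](?P<id>[^/?#&]+)
    )
'''

# Hypothetical URLs; both variants should match and fill different named groups.
for url in ('https://www.bilibili.com/video/av170001',
            'https://www.bilibili.com/video/BV1xx411c7mD'):
    m = re.match(_VALID_URL, url)
    print(url, '->', m.groupdict() if m else None)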

View File

@@ -0,0 +1,219 @@
# uncompyle6 version 3.5.0
# Python bytecode 2.7 (62211)
# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
# Embedded file name: daumtoon_downloader.pyo
# Compiled at: 2019-10-03 10:11:29
import downloader
from utils import Soup, Session, LazyUrl, Downloader, try_n, get_imgs_already, clean_title, get_print
import json, os
from timee import time, sleep
import ree as re
from translator import tr_
import page_selector
class Page(object):
def __init__(self, id, url, title, serviceType):
self.id = id
self.url = url
self.title = title
self.serviceType = serviceType
class Image(object):
def __init__(self, url, page, p):
self._url = url
self.url = LazyUrl(page.url, self.get, self)
ext = os.path.splitext(url.split('?')[0])[1]
if ext.lower()[1:] not in ('jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp'):
ext = '.jpg'
self.filename = (u'{}/{:04}{}').format(clean_title(page.title), p, ext)
def get(self, _):
return self._url
def get_id(url):
if '/league/' in url:
header = 'league_'
else:
header = ''
body = re.find('/viewer/([0-9a-zA-Z_-]+)', url) or re.find('/view/([0-9a-zA-Z_-]+)', url)
return header, body
def get_info(url, session):
referer = url
header, id = get_id(referer)
if 'league_' in id:
type_ = 'leaguetoon'
else:
type_ = 'webtoon'
info = {}
ids = set()
pages = []
for p in range(1, 1+10):
if p == 1:
url = 'http://webtoon.daum.net/data/pc/{}/view/{}?timeStamp={}'.format(type_, id, int(time()))
else:
if type_ == 'webtoon':
break
url = 'http://webtoon.daum.net/data/pc/{}/view/{}?page_no={}&timeStamp={}'.format(type_, id, p, int(time()))
print(url)
info_raw = downloader.read_html(url, referer=referer, session=session)
_info = json.loads(info_raw)
webtoon = _info['data'].get('webtoon') or _info['data'].get('leaguetoon')
if webtoon is None:
raise Exception('No webtoon')
if p == 1:
info['title'] = webtoon['title']
artists = []
for artist in webtoon['cartoon']['artists']:
artist = artist['penName']
if artist in artists:
continue
artists.append(artist)
if len(artists) > 1:
artists = [
artists[1], artists[0]] + artists[2:]
info['artists'] = artists
eps = webtoon.get('webtoonEpisodes') or webtoon.get('leaguetoonEpisodes')
if not eps:
if p > 1:
eps = []
else:
raise Exception('No eps')
c = 0
for ep in eps:
id_ = ep.get('articleId') or ep.get('id')
title = ep['title']
serviceType = 'free' if type_ == 'leaguetoon' else ep['serviceType']
if type_ == 'leaguetoon':
url = 'http://webtoon.daum.net/league/viewer/{}'.format(id_)
else:
url = 'http://webtoon.daum.net/webtoon/viewer/{}'.format(id_)
if id_ in ids:
continue
c += 1
ids.add(id_)
page = Page(id_, url, title, serviceType)
pages.append(page)
if c == 0:
print('c == 0; break')
break
info['pages'] = sorted(pages, key=lambda x: x.id)
return info
@Downloader.register
class Downloader_daumtoon(Downloader):
type = 'daumtoon'
URLS = ['webtoon.daum.net']
MAX_CORE = 16
MAX_SPEED = 4.0
display_name = 'Daum Webtoon'
def init(self):
if '/viewer/' in self.url:
return self.Invalid(tr_('목록 주소를 입력해주세요: {}').format(self.url))
if '/view/' not in self.url and not self.url.lower().startswith('http'):
self.url = ('http://webtoon.daum.net/webtoon/view/{}').format(self.url)
self.session = None
self._info = get_info(self.url, self.session)
@property
def name(self):
title = self._info['title']
artists = self._info['artists']
artist = artists[0] if artists else 'N/A'
title = self.format_title('N/A', ''.join(get_id(self.url)), title, artist, 'N/A', 'N/A', 'Korean', prefix='daumtoon_')
return clean_title(title)
def read(self):
self.title = tr_(u'읽는 중... {}').format(self.name)
imgs = get_imgs_all(self._info, self.name, self.session, cw=self.cw)
for img in imgs:
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = self.name
self.session = None
return
def get_imgs(page, session, cw):
print_ = get_print(cw)
html = downloader.read_html(page.url, session=session)
header, id = get_id(page.url)
t = int(time())
soup = Soup(html)
if 'league_' in id:
type_ = 'leaguetoon'
else:
type_ = 'webtoon'
url_data = 'http://webtoon.daum.net/data/pc/{}/viewer/{}?timeStamp={}'.format(type_, id, t)
data_raw = downloader.read_html(url_data, session=session, referer=page.url)
data = json.loads(data_raw)
m_type = data['data']['webtoonEpisode']['multiType']
print_('m_type: {}'.format(m_type))
if m_type == 'chatting':
page.url = page.url.replace('daum.net/', 'daum.net/m/')
url_data = 'http://webtoon.daum.net/data/mobile/{}/viewer?id={}&{}'.format(type_, id, t)
data_raw = downloader.read_html(url_data, session=session, referer=page.url)
data = json.loads(data_raw)
imgs = []
for chat in data['data']['webtoonEpisodeChattings']:
img = chat.get('image')
if not img:
continue
img = Image(img['url'], page, len(imgs))
imgs.append(img)
else:
url_data = 'http://webtoon.daum.net/data/pc/{}/viewer_images/{}?timeStamp={}'.format(type_, id, t)
data_raw = downloader.read_html(url_data, session=session, referer=page.url)
data = json.loads(data_raw)
imgs = []
for img in data['data']:
img = Image(img['url'], page, len(imgs))
imgs.append(img)
return imgs
def get_imgs_all(info, title, session, cw=None):
pages = info['pages']
pages = page_selector.filter(pages, cw)
imgs = []
for p, page in enumerate(pages):
if page.serviceType != 'free':
continue
imgs_already = get_imgs_already('daumtoon', title, page, cw)
if imgs_already:
imgs += imgs_already
continue
imgs += get_imgs(page, session, cw)
if cw is not None:
cw.setTitle(tr_(u'읽는 중... {} / {} ({}/{})').format(title, page.title, p + 1, len(pages)))
if not cw.alive:
break
return imgs
@page_selector.register('daumtoon')
@try_n(4)
def f(url):
info = get_info(url, None)
return info['pages']
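
A small standalone sketch of the id extraction in get_id() above, using the stdlib re module rather than the project's ree wrapper; the URLs and ids are hypothetical:

import re

def get_id(url):
    header = 'league_' if '/league/' in url else ''
    m = re.search(r'/viewer/([0-9a-zA-Z_-]+)', url) or re.search(r'/view/([0-9a-zA-Z_-]+)', url)
    return header, (m.group(1) if m else None)

print(get_id('http://webtoon.daum.net/webtoon/view/dummy_toon'))   # ('', 'dummy_toon')
print(get_id('http://webtoon.daum.net/league/viewer/dummy_toon'))  # ('league_', 'dummy_toon')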

View File

@@ -0,0 +1,101 @@
import downloader
from utils import Soup, try_n, LazyUrl, Downloader, lock, get_print, clean_title
from timee import sleep
import base64
import json
import constants
import ree as re
KEY = b'gefdzfdef'
@Downloader.register
class Downloader_epio(Downloader):
type = 'epio'
URLS = ['epio.app']
def read(self):
info = get_info(self.url, cw=self.cw)
imgs = info['imgs']
for img in imgs:
self.urls.append(img.url)
self.title = clean_title(info['title'])
class Image(object):
def __init__(self, url, referer, p):
self._url = url
self.url = LazyUrl(referer, self.get, self)
ext = '.jpg'#
self.filename = u'{:04}{}'.format(p, ext)
def get(self, referer):
return self._url
def get_info(url, cw=None):
info = _get_info(url, cw)
imgs = []
html = info['content']
soup = Soup(html)
for img in soup.findAll('img'):
src = img.attrs.get('src')
if not src:
continue
# 1696
if not isinstance(src, bytes):
src = src.encode('utf8')
t = base64.b64encode(src)
if isinstance(t, bytes):
t = t.decode('utf8')
src = 'https://cdn1-images.epio.app/image/download/{}'.format(t)
img = Image(src, url, len(imgs))
imgs.append(img)
info['imgs'] = imgs
return info
def get_id(url):
return re.find('article/detail/([0-9a-z]+)', url)
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.backends import default_backend
import aes
backend = default_backend()
def decrypt(s, cw=None):
print_ = get_print(cw)
key, iv = aes.key_and_iv(s[:16], KEY)
print_('key: {}\niv: {}'.format(key, iv))
cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=backend)
r = -len(s) % 16
if r:
s += b'\x00' * r
dec = cipher.decryptor()
s_dec = dec.update(s[16:]) + dec.finalize()
s_dec = s_dec[:-s_dec[-1]]
if r:
s_dec = s_dec[:-r]
return s_dec
def _get_info(url, cw=None):
id = get_id(url)
url_api = 'https://girlimg.epio.app/api/articles/{}?lang=en-us'.format(id)
html = downloader.read_html(url_api, referer=url)
s = json.loads(html)['string']
s = base64.b64decode(s)
s = decrypt(s, cw)
info = json.loads(s)
return info
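
The decrypt() helper above derives a key and IV from the 16-byte salt prefix (via the project-internal aes.key_and_iv) and then AES-CBC decrypts the rest. A minimal round-trip sketch of just the CBC step with the cryptography package, using hypothetical key and IV values:

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

backend = default_backend()

def aes_cbc(data, key, iv, encrypt):
    # Encrypt or decrypt one buffer with AES-CBC; no padding handling here.
    cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=backend)
    ctx = cipher.encryptor() if encrypt else cipher.decryptor()
    return ctx.update(data) + ctx.finalize()

key, iv = b'k' * 16, b'i' * 16          # hypothetical; the plugin gets real values from aes.key_and_iv()
plaintext = b'sixteen byte msg'          # already a multiple of the 16-byte block size
assert aes_cbc(aes_cbc(plaintext, key, iv, True), key, iv, False) == plaintext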

View File

@@ -0,0 +1,186 @@
import downloader
import ytdl
from utils import Downloader, Session, try_n, LazyUrl, get_ext, format_filename, clean_title, get_print
from io import BytesIO
import ree as re
from m3u8_tools import playlist2stream, M3u8_stream
import utils
import ffmpeg
@Downloader.register
class Downloader_etc(Downloader):
type = 'etc'
URLS = []
single = True
MAX_PARALLEL = 8
display_name = 'Etc'
def init(self):
self.session = Session()
name = ytdl.get_extractor_name(self.url)
self.print_('extractor: {}'.format(name))
if name == 'generic':
raise NotImplementedError()
def read(self):
video = get_video(self.url, self.session, self.cw)
if video.artist:
self.artist = video.artist
self.urls.append(video.url)
self.print_('url_thumb: {}'.format(video.url_thumb))
self.setIcon(video.thumb)
if video.header.lower() not in ['yourporn', 'spankbang']:
self.enableSegment()#
if isinstance(video.url(), M3u8_stream):
self.disableSegment()
self.title = '[{}] {}'.format(video.header, video.title)
def int_or_none(s):
try:
return int(s)
except:
return None
def format_(f):
if f is None:
return 'None'
return '{} - {} - {} - {}'.format(f['format'], f['_resolution'], f['_audio'], f['url'])
@try_n(4)
def get_video(url, session, cw, ie_key=None):
print_ = get_print(cw)
options = {
'noplaylist': True,
#'extract_flat': True,
'playlistend': 1,
}
ydl = ytdl.YoutubeDL(options)
info = ydl.extract_info(url)
if not ie_key:
ie_key = ytdl.get_extractor_name(url)
info['ie_key'] = ie_key
url_new = info.get('url')
print('url: {} -> {}'.format(url, url_new))
formats = info.get('formats', [])
print(info.keys())
if not formats and (info.get('entries') or 'title' not in info):
if 'entries' in info:
entry = info['entries'][0]
url_new = entry.get('url') or entry['webpage_url']
if url_new != url:
return get_video(url_new, session, cw, ie_key=get_ie_key(info))
session.headers.update(info.get('http_headers', {}))
#session.cookies.update(ydl.cookiejar)
if not formats:
print('no formats')
if url_new:
f = {'url': url_new, 'format': ''}
formats.append(f)
fs = []
for i, f in enumerate(formats):
f['_index'] = i
f['_resolution'] = f.get('vbr') or int_or_none(re.find('([0-9]+)p', f['format'], re.IGNORECASE)) or f.get('height') or f.get('width') or int(f.get('vcodec', 'none') != 'none')
f['_audio'] = f.get('abr') or f.get('asr') or int(f.get('acodec', 'none') != 'none')
print_(format_(f))
fs.append(f)
if not fs:
raise Exception('No videos')
f = sorted(fs, key=lambda f:(f['_resolution'], f['_index']))[-1]
if f['_audio']:
f_audio = None
else:
fs_audio = sorted([f_audio for f_audio in fs if (not f_audio['_resolution'] and f_audio['_audio'])], key=lambda f:(f['_audio'], f['_index']))
if fs_audio:
f_audio = fs_audio[-1]
else:
try:
f = sorted([f for f in fs if f['_audio']], key=lambda f:(f['_resolution'], f['_index']))[-1]
except IndexError:
pass
f_audio = None
print_('video: {}'.format(format_(f)))
print_('audio: {}'.format(format_(f_audio)))
video = Video(f, f_audio, info, session, url, cw=cw)
return video
def get_ie_key(info):
ie_key = info.get('ie_key') or info['extractor']
ie_key = ie_key.split(':')[0]
if ie_key.lower().endswith('playlist'):
ie_key = ie_key[:-len('playlist')]
return ie_key
class Video(object):
def __init__(self, f, f_audio, info, session, referer, cw=None):
self.f_audio = f_audio
self.cw = cw
self.title = title = info['title']
self.id = info['id']
self.url = f['url']
self.artist = info.get('uploader')
self.header = utils.capitalize(get_ie_key(info))
self.session = session
self.referer = referer
self.url_thumb = info.get('thumbnail')
self.thumb = BytesIO()
if self.url_thumb:
downloader.download(self.url_thumb, referer=referer, buffer=self.thumb, session=session)
try:
ext = downloader.get_ext(self.url, session, referer)
except Exception as e:
print(e)
ext = get_ext(self.url)
if not ext:
print('empty ext')
if f['_resolution']:
ext = '.mp4'
else:
ext = '.mp3'
if ext.lower() == '.m3u8':
try:
url = playlist2stream(self.url, referer, session=session, n_thread=4)
except:
url = M3u8_stream(self.url, referer=referer, session=session, n_thread=4)
ext = '.mp4'
else:
url = self.url
self.url = LazyUrl(referer, lambda x: url, self, pp=self.pp)
self.filename = format_filename(title, self.id, ext, header=self.header)
def pp(self, filename):
if self.cw:
with self.cw.convert(self):
return self._pp(filename)
else:
return self._pp(filename)
def _pp(self, filename):
if self.f_audio:
f = BytesIO()
downloader.download(self.f_audio['url'], buffer=f, referer=self.referer, session=self.session)
ffmpeg.merge(filename, f, cw=self.cw)
return filename
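
A standalone sketch of the resolution/audio scoring that get_video() above uses to pick the best format, fed with two hypothetical ytdl-style format dicts (the regex fallback on the format string is omitted here):

formats = [
    {'format': '360p', 'height': 360, 'vcodec': 'avc1', 'abr': 96},
    {'format': '720p', 'height': 720, 'vcodec': 'avc1', 'abr': 128},
]
for i, f in enumerate(formats):
    f['_index'] = i
    # Prefer an explicit bitrate/height/width; fall back to "has a video codec at all".
    f['_resolution'] = f.get('vbr') or f.get('height') or f.get('width') or int(f.get('vcodec', 'none') != 'none')
    f['_audio'] = f.get('abr') or f.get('asr') or int(f.get('acodec', 'none') != 'none')
best = sorted(formats, key=lambda f: (f['_resolution'], f['_index']))[-1]
print(best['format'])  # 720p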

View File

@@ -0,0 +1,260 @@
#coding:utf8
import downloader
from utils import Session, urljoin, Soup, LazyUrl, try_n, Downloader, get_outdir, clean_title
import ree as re
import json
import os
from translator import tr_
from timee import sleep
from downloader import getsize
import errors
PATTERN_CURSOR = '".+?&cursor=([0-9]+)'
UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
class Image(object):
def __init__(self, url):
if 'fbid=' in url:
id = int(re.findall('fbid=([0-9]+)', url)[0])
elif 'photos/' in url:
id = int(url.split('photos/')[1].split('/')[1])
else:
id = int(url)
self.id = id
def f(_):
img = get_img(url)
ext = os.path.splitext(img.split('?')[0])[1]
self.filename = u'{}{}'.format(id, ext)
return img
self.url = LazyUrl(url, f, self)
@try_n(4)
def get_img(url):
#print('get_img', url)
html = read_html(url)
soup = Soup(html)
for div in soup.findAll('div'):
href = div.attrs.get('data-full-size-href')
if href:
img = href
break
else:
img = None
if img is None:
# 1869
for code in soup.findAll('code'):
code = code.string
hidden = Soup(code)
soup.append(hidden)
for a in soup.findAll('a'):
target = a.attrs.get('target')
if target == '_blank':
img = a.attrs['href']
break
else:
raise Exception('No img')
return img
def suitable(url):
if 'facebook.com' not in url.lower():
return False
if '/videos/' in url or 'video.php?' in url:
return False
return True
@Downloader.register
class Downloader_facebook(Downloader):
type = 'facebook'
URLS = [suitable]
_soup = None
MAX_CORE = 8
@classmethod
def fix_url(cls, url):
if 'facebook.com/' not in url:
url = 'https://facebook.com/{}'.format(url)
url = url.replace('m.facebook.', 'facebook.')
if 'www.facebook.com/' not in url:
url = url.replace('facebook.com/', 'www.facebook.com/', 1)
if '/profile.php?' not in url:
url = url.split('?')[0]
return url.split('#')[0].strip('/')
@property
def username(self):
username = get_username(self.url)
return username
@property
def soup(self):
if self._soup is None:
html = read_html(self.url)
self._soup = Soup(html)
return self._soup
@property
def name(self):
title = get_title(self.soup)
id_ = 'facebook_{}'.format(self.username)
title = u'{} ({})'.format(title, id_)
return clean_title(title)
@property
def album(self):
if 'album_id=' in self.url:
album = re.findall('album_id=([0-9]+)', self.url)[0]
else:
album = None
return album
def read(self):
self.print_(self.name)
self.title = tr_(u'읽는 중... {}').format(self.name)
imgs = get_imgs(self.username, self.name, cw=self.cw)
for img in imgs:
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = self.name
def read_html(url):
return downloader.read_html(url, user_agent=UA)
def get_title(soup):
html = str(soup)
name = re.find(r'"__isProfile":"Page","name":(".*?")', html) or re.find(r'"name":(".*?")', html)
if not name:
gc = soup.find('div', id='globalContainer')
if gc and gc.find('form', id='login_form'):
raise errors.LoginRequired()
raise Exception('no name')
title = json.loads(name)
return title
def get_imgs(username, title, cw=None):
urls = [
'https://m.facebook.com/{}/photos'.format(username),
'https://m.facebook.com/profile.php?id={}&sk=photos'.format(username), # no custom URL
]
for url in urls:
print('get_imgs url:', url)
try:
html = read_html(url)
except:
continue
soup = Soup(html)
if soup.find('a', id='signup-button'):
raise errors.LoginRequired()
photo = soup.find('div', class_='_5v64')
if photo is not None:
break
else:
raise Exception('No photo div')
cursor = photo.a.attrs['href'].split('/photos/')[1].split('/')[1]
print('first cursor:', cursor)
href = re.find(r'(/photos/pandora/\?album_token=.+?)"', html)
href = urljoin(url, href)
href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)
cursors = set([cursor])
imgs = []
dups = {}
dir = os.path.join(get_outdir('facebook'), title)
try:
filenames = os.listdir(dir)
except:
filenames = []
for filename in filenames:
name, ext = os.path.splitext(filename)
if name.isdigit():
dups[int(name)] = os.path.join(dir, filename)
pages = set()
while True:
print(href)
html = read_html(href)
data_raw = html.replace('for (;;);', '')
data = json.loads(data_raw)
actions = data['payload']['actions']
for action in actions:
if action['target'] == 'm_more_photos':
break
else:
print('No more photos')
break
html = action['html']
soup = Soup(html)
photos = soup.findAll('div', class_='_5v64')
for photo in photos:
for a in photo.findAll('a'):
page = a.attrs['href']
page = urljoin(href, page)
# remove duplicate pages
if page in pages:
continue
pages.add(page)
img = Image(page)
id = img.id
if id in dups and getsize(dups[id]) > 0:
print('skip', id)
imgs.append(dups[id])
else:
imgs.append(img)
s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
if cw is not None:
cw.setTitle(s)
if not cw.alive:
return []
else:
print(s)
cursor = re.find(PATTERN_CURSOR, data_raw)
#print(cursor)
if cursor is None:
print('no cursor')
break
if cursor in cursors:
print('same cursor')
break
cursors.add(cursor)
href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)
return imgs
def get_username(url):
if '/profile.php?' in url:
id = re.find(r'/profile\.php[\?&]id=([0-9]+)', url)
return id
else:
url = url.replace('facebook.com/pg/', 'facebook.com/')
return url.split('?')[0].split('facebook.com/')[1].split('/')[0]
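
A standalone sketch of get_username() above with the stdlib re module (the project's ree.find returns the first capture group directly); the URLs are hypothetical:

import re

def get_username(url):
    if '/profile.php?' in url:
        m = re.search(r'/profile\.php[\?&]id=([0-9]+)', url)
        return m.group(1) if m else None
    url = url.replace('facebook.com/pg/', 'facebook.com/')
    return url.split('?')[0].split('facebook.com/')[1].split('/')[0]

print(get_username('https://www.facebook.com/profile.php?id=1234567890'))  # 1234567890
print(get_username('https://www.facebook.com/somepage/photos'))            # somepage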

View File

@@ -0,0 +1,128 @@
#coding: utf-8
import downloader
import flickr_api
from timee import sleep
from utils import Downloader, LazyUrl, query_url, clean_title
import os
from translator import tr_
import ree as re
from datetime import datetime
import flickr_auth
alphabet = '123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ'
base = len(alphabet)
def b58encode(div, s=''):
if div >= base:
div, mod = divmod(div, base)
return b58encode(div, alphabet[mod] + s)
return alphabet[div] + s
def b58decode(s):
return sum(alphabet.index(c) * pow(base, i) for i, c in enumerate(reversed(s)))
class Image(object):
def __init__(self, photo):
self.photo = photo
self.id = photo.id
self.filename = None
def f(_=None):
url = photo.getPhotoFile()
#url = 'https://flic.kr/p/{}'.format(b58encode(int(photo.id)))
ext = os.path.splitext(url)[1]
date = datetime.fromtimestamp(int(photo.dateuploaded))
date = u'{:02}-{:02}-{:02}'.format(date.year%100, date.month, date.day)
self.filename = u'[{}] {}{}'.format(date, self.id, ext)
return url
self.url = LazyUrl(u'flickr_{}'.format(self.id), f, self)
def find_ps(url):
user = flickr_api.Person.findByUrl(url)
id = re.search('/albums/([0-9]+)', url).groups()[0]
pss = user.getPhotosets()
for ps in pss:
if ps.id == id:
break
else:
raise Exception('Not found photoset id')
return user, ps
@Downloader.register
class Downloader_flickr(Downloader):
type = 'flickr'
URLS = ['flickr.com']
_name = None
def init(self):
if 'flickr.com' in self.url.lower():
self.url = self.url.replace('http://', 'https://')
else:
self.url = 'https://www.flickr.com/people/{}'.format(self.url)
@property
def name(self):
global pss
if self._name is None:
url = self.url
flickr_auth.get_api(url, self.cw)
if '/albums/' in url:
user, ps = find_ps(url)
self._name = u'{} (flickr_album_{}_{})'.format(ps.title, user.id, ps.id)
else:
user = flickr_api.Person.findByUrl(url)
self._name = u'{} (flickr_{})'.format(user.username, user.id)
return clean_title(self._name)
def read(self):
self.title = self.name
imgs = get_imgs(self.url, self.title, cw=self.cw)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
def get_imgs(url, title=None, cw=None):
flickr_auth.get_api(title, cw)
if not flickr_auth.isAuth:
raise Exception('No Auth')
if '/albums/' in url:
user, ps = find_ps(url)
handle = ps
else:
user = flickr_api.Person.findByUrl(url)
handle = user
photos = []
per_page = 500
for page in range(1, 200):
photos_new = handle.getPhotos(per_page=per_page, page=page)
photos += photos_new
if len(photos_new) < per_page:
break
msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(photos))
if cw:
if not cw.alive:
break
cw.setTitle(msg)
else:
print(msg)
imgs = []
for photo in photos:
img = Image(photo)
imgs.append(img)
return imgs
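
The b58encode/b58decode helpers above implement Flickr's short-URL alphabet (base 58, omitting 0, l, I and O). A self-contained round-trip check with a hypothetical photo id:

alphabet = '123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ'
base = len(alphabet)

def b58encode(div, s=''):
    if div >= base:
        div, mod = divmod(div, base)
        return b58encode(div, alphabet[mod] + s)
    return alphabet[div] + s

def b58decode(s):
    return sum(alphabet.index(c) * pow(base, i) for i, c in enumerate(reversed(s)))

photo_id = 49981679577  # hypothetical
short = b58encode(photo_id)
assert b58decode(short) == photo_id
print('https://flic.kr/p/{}'.format(short))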

View File

@@ -0,0 +1,131 @@
# uncompyle6 version 3.5.0
# Python bytecode 2.7 (62211)
# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
# Embedded file name: imgur_downloader.pyo
# Compiled at: 2019-10-07 05:58:14
import downloader
from utils import Downloader, Soup, try_n, urljoin, get_max_range, clean_title, cut_pair
import ree as re, json, os
from timee import sleep
from translator import tr_
@Downloader.register
class Downloader_imgur(Downloader):
type = 'imgur'
URLS = ['imgur.com']
MAX_CORE = 16
def init(self):
self.info = get_info(self.url)
@property
def id_(self):
return re.find('imgur.com/.+?/([0-9a-zA-Z]+)', self.url)
@property
def name(self):
title = self.info['title'] or 'N/A'
return clean_title(title, n=100)
def read(self):
imgs = get_imgs(self.url, self.info, self.cw)
for img in imgs:
ext = os.path.splitext(img.split('?')[0])[1]
if len(imgs) > 1:
self.filenames[img] = (u'{:04}{}').format(len(self.urls), ext)
else:
self.filenames[img] = clean_title(self.name, n=-len(ext)) + ext
self.urls.append(img)
self.single = len(imgs) == 1
self.referer = self.url
self.title = u'{} (imgur_{})'.format(self.name, self.id_)
@try_n(4)
def get_info(url):
url = url.replace('/gallery/', '/a/')
if '/r/' in url and url.split('/r/')[1].strip('/').count('/') == 0:
title = re.find(r'/r/([^/]+)', url)
info = {}
info['title'] = title
info['type'] = 'r'
else:
try: # legacy
html = downloader.read_html(url, cookies={'over18':'1'})
s = re.find('image *: *({.+)', html)
info_raw = cut_pair(s)
except Exception as e: # new
print(e)
id_ = re.find(r'/a/([0-9a-zA-Z_]+)', url) or re.find(r'/r/[0-9a-zA-Z_]+/([0-9a-zA-Z_]+)', url, err='no id')
url_api = 'https://api.imgur.com/post/v1/albums/{}?client_id=546c25a59c58ad7&include=media%2Cadconfig%2Caccount'.format(id_)
info_raw = downloader.read_html(url_api, cookies={'over18':'1'})
info = json.loads(info_raw)
info['type'] = 'a'
return info
def get_imgs(url, info=None, cw=None):
print('get_imgs', url)
if info is None:
info = get_info(url)
imgs = []
# Range
max_pid = get_max_range(cw)
if info['type'] == 'a':
if 'album_images' in info: # legacy
imgs_ = info['album_images']['images']
elif 'media' in info: # new
imgs_ = info['media']
else: # legacy
imgs_ = [info]
for img in imgs_:
img_url = img.get('url') # new
if not img_url: # legacy
hash = img['hash']
ext = img['ext']
img_url = 'https://i.imgur.com/{}{}'.format(hash, ext)
if img_url in imgs:
continue
imgs.append(img_url)
elif info['type'] == 'r':
urls = set()
for p in range(100):
url_api = 'https://imgur.com/r/{}/new/page/{}/hit?scrolled'.format(info['title'], p)
print(url_api)
html = downloader.read_html(url_api, referer=url)
soup = Soup(html)
c = 0
for post in soup.findAll('div', class_='post'):
a = post.find('a', class_='image-list-link')
url_post = urljoin(url, a.attrs['href'])
if url_post in urls:
continue
urls.add(url_post)
c += 1
try: # for r18 images
imgs += get_imgs(url_post)
except Exception as e:
print(e)
s = (u'{} {} ({})').format(tr_(u'읽는 중...'), info['title'], len(imgs))
if cw is not None:
if cw.alive:
cw.setTitle(s)
else:
return []
else:
print(s)
if c == 0:
print('same; break')
break
return imgs
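
get_imgs() above handles two payload shapes: the legacy inline JSON with hash/ext pairs, and the newer album API that already returns full media URLs. A tiny sketch of the legacy-to-URL conversion with a hypothetical entry:

img = {'hash': 'abc1234', 'ext': '.jpg'}  # hypothetical legacy album entry
img_url = 'https://i.imgur.com/{}{}'.format(img['hash'], img['ext'])
print(img_url)  # https://i.imgur.com/abc1234.jpg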

View File

@@ -0,0 +1,579 @@
#coding:utf8
import downloader
from timee import sleep, clock
from constants import clean_url
from utils import Downloader, LazyUrl, urljoin, get_max_range, Soup, Session, update_url_query, get_print, cut_pair, get_ext, clean_title, lazy, try_n, generate_csrf_token, check_alive
import urllib
from error_printer import print_error
import os, requests
from translator import tr_
import json
from datetime import datetime
import hashlib
import ree as re
from ratelimit import limits, sleep_and_retry
import clf2
import errors
FORMAT_PIN = r'/p/([0-9a-zA-Z-_]+)'
def get_session(url, cw=None):
#res = clf2.solve(url, cw=cw)
#return res['session']
session = Session()
sessionid = session.cookies._cookies.get('.instagram.com', {}).get('/',{}).get('sessionid')
if sessionid is None or sessionid.is_expired():
raise errors.LoginRequired()
session.headers['User-Agent'] = downloader.hdr['User-Agent']
if not session.cookies.get('csrftoken', domain='.instagram.com'):
csrf_token = generate_csrf_token()
print('csrf:', csrf_token)
session.cookies.set("csrftoken", csrf_token, domain='.instagram.com')
return session
@Downloader.register
class Downloader_insta(Downloader):
type = 'insta'
URLS = ['instagram.com']
MAX_CORE = 8
display_name = 'Instagram'
def init(self):
self.session = get_session(self.url, self.cw)
if '/p/' in self.url:
self.print_('single post')
elif '/stories/' in self.url:
self.print_('stories')
elif 'instagram.com' in self.url:
self.url = u'https://www.instagram.com/{}'.format(self.username)
@lazy
def username(self):
return get_username(self.url)
@classmethod
def fix_url(cls, url):
if 'instagram.com' not in url:
url = u'https://www.instagram.com/{}'.format(url)
return url.split('?')[0].split('#')[0].strip('/')
@classmethod
def key_id(cls, url):
return url.replace('://www.', '://')
@lazy
def name(self):
return get_name(self.url)
@property
def id_(self):
return u'{} (insta_{})'.format(clean_title(self.name), self.username)
def read(self):
cw = self.cw
title = self.id_
self.title = title
self.artist = self.name
ui_setting = self.ui_setting
if '/p/' in self.url:
self.print_('single')
iter = get_imgs_single(self.url, self.session, cw=cw)
elif '/stories/highlights/' in self.url:
iter = get_stories_single(self.url, session=self.session, cw=cw)
else:
s = ui_setting.instaStories.isChecked()
self.print_('stories: {}'.format(s))
iter = get_imgs_all(self.url, title, session=self.session, cw=cw, d=self, stories=s)
imgs = []
for img in iter:
if cw and not cw.alive:
return
self.urls.append(img.url)
self.title = title
def get_j(script):
s = script.string
if not s:
return
try:
s = s.replace('window._sharedData', '').strip()[1:-1].strip()
j = json.loads(s)
return j
except ValueError as e:
pass
def read_html(url, session, cw):
#res = clf2.solve(url, session=session, cw=cw)#
#return res['html']
return downloader.read_html(url, session=session)
def check_error(soup, cw, wait):
print_ = get_print(cw)
err = soup.find('div', class_='error-container')
if err:
err = err.text.strip()
if wait:
print_('err: {}'.format(err))
sleep(60*30, cw)
else:
raise Exception(err)
def get_sd(url, session=None, html=None, cw=None, wait=True):
print_ = get_print(cw)
if html:
soup = Soup(html)
check_error(soup, cw, wait)
for script in soup.findAll('script'):
j = get_j(script)
if j:
break
else:
raise Exception('no _sharedData!!')
else:
for try_ in range(4):
_wait(cw)
html = read_html(url, session, cw)
soup = Soup(html)
check_error(soup, cw, wait)
for script in soup.findAll('script'):
j = get_j(script)
if j:
break
else:
continue
break
else:
raise Exception('no _sharedData')
for script in soup.findAll('script'):
s = script.string
if s and 'window.__additionalDataLoaded(' in s:
s = cut_pair(s)
j_add = json.loads(s)
try:
j['entry_data']['PostPage'][0].update(j_add)
except:
j['entry_data']['ProfilePage'][0].update(j_add) #2900
# Challenge
challenge = j['entry_data'].get('Challenge')
if challenge:
for cont in challenge[0]['extraData']['content']:
title = cont.get('title')
if title:
break
else:
title = 'Err'
raise errors.LoginRequired(title)
# LoginAndSignupPage
login = j['entry_data'].get('LoginAndSignupPage')
if login:
raise errors.LoginRequired()
return j
def get_id(url):
j = get_sd(url)
if '/p/' in url:
id = j['entry_data']['PostPage'][0]['graphql']['shortcode_media']['owner']['id']
elif '/stories/' in url:
id = j['entry_data']['StoriesPage'][0]['user']['username'] # ???
else:
id = j['entry_data']['ProfilePage'][0]['graphql']['user']['id']
return id
def get_username(url):
j = get_sd(url, wait=False)
if '/p/' in url:
id = j['entry_data']['PostPage'][0]['graphql']['shortcode_media']['owner']['username']
elif '/stories/' in url:
id = j['entry_data']['StoriesPage'][0]['user']['username']
else:
id = j['entry_data']['ProfilePage'][0]['graphql']['user']['username']
return id
def get_name(url):
j = get_sd(url)
if '/p/' in url:
name = j['entry_data']['PostPage'][0]['graphql']['shortcode_media']['owner']['full_name']
elif '/stories/' in url:
id = get_id(url)
url = 'https://www.instagram.com/{}/'.format(id)
return get_name(url)
else:
name = j['entry_data']['ProfilePage'][0]['graphql']['user']['full_name']
return name
class Image(object):
def __init__(self, url, referer, filename, id=None):
self._url = url
self.url = LazyUrl(referer, self.get, self)
self.filename = filename
self.id = id
def get(self, referer):
wait_download()
return self._url
class Image_lazy(object):
def __init__(self, url, session=None, cw=None):
self.url = url
self.session = session
self.cw = cw
self.url = LazyUrl(url, self.get, self)
@try_n(4)
def get(self, url):
cw = self.cw
if cw and not cw.alive:
raise Exception('cw is dead')
node = Node(url, session=self.session, cw=cw)
img = node.imgs[0]
ext = os.path.splitext(url)[1]
wait_download()
url_img = img.url()
self.filename = img.filename
return url_img
@sleep_and_retry
@limits(1, 10)
def _wait(cw=None):
if cw and not cw.alive:
raise Exception('cw is dead while waiting')
##@sleep_and_retry
##@limits(1, 1)
def wait_download():
pass
@try_n(2)
def get_query(query_hash, variables, session, cw=None):
_wait(cw)
print_ = get_print(cw)
csrf_token = session.cookies.get('csrftoken', domain='.instagram.com')
if not csrf_token:
raise Exception('no csrftoken')
hdr = {
"X-CSRFToken" : csrf_token, #2849
"X-IG-App-ID" : "936619743392459",
"X-IG-WWW-Claim" : "0",
"X-Requested-With": "XMLHttpRequest",
}
url_ = update_url_query('https://www.instagram.com/graphql/query/', {'query_hash': query_hash, 'variables': json.dumps(variables)})
#print(len(edges), url_)
r = session.get(url_, headers=hdr)
try:
j = json.loads(r.text)
except Exception as e:
print(e)
j = {}
if not j or j.get('status') == 'fail':
msg = 'Fail: {} {}'.format(j.get('message') or 'Please wait a few minutes before you try again.', variables)
print_(msg)
sleep(60*30, cw)
raise Exception(msg)
return j
def get_imgs(url, n_max=2000, title=None, cw=None, session=None):
print_ = get_print(cw)
for try_ in range(4):
try:
html = read_html(url, session, cw)
m = re.search('"edge_owner_to_timeline_media":{"count":([0-9]+)', html)
if m is None:
raise Exception('Invalid page')
break
except Exception as e:
e_ = e
print_(print_error(e)[0])
else:
raise e_
n = int(m.groups()[0])
n = min(n, n_max)
data = get_sd(url, html=html, cw=cw)
uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
csrf_token = data['config']['csrf_token']#
session.cookies.set(name='ig_pr', value='1', path='/', domain='.instagram.com')
cursor = ''
edges = []
bad = 0
while True:
check_alive(cw)
variables = {
'id': uploader_id,
'first': 12,
}
if cursor:
variables['after'] = cursor
#print_(variables)#
media = None
try:
j = get_query('003056d32c2554def87228bc3fd9668a', variables, session, cw)
media = j['data']['user']['edge_owner_to_timeline_media']
sleep(2)#
except Exception as e:
if bad > 10:
raise Exception('no media')
else:
print_(u'no media.. retry... ({}) {}'.format(bad+1, print_error(e)[0]))
sleep(12*bad, cw)
bad += 1
continue
bad = 0
edges_new = media.get('edges')
if not edges_new or not isinstance(edges_new, list):
print('no edges_new')
break
edges += edges_new
s = u'{} {} ({}/{})'.format(tr_(u'읽는 중...'), title, len(edges), n)
if cw is not None:
cw.setTitle(s)
if not cw.alive:
return []
else:
print(s)
if len(edges) >= n:
break
page_info = media.get('page_info')
if not page_info:
break
if not page_info.get('has_next_page'):
break
cursor = page_info.get('end_cursor')
if not cursor:
break
if len(edges) <= n/2:
raise Exception(u'Too short: {} / {}'.format(len(edges), n))
imgs = []
for edge in edges:
node = edge['node']
type = node['__typename']
id = node['shortcode']
url = u'https://www.instagram.com/p/{}/'.format(id)
## if type in ['GraphVideo', 'GraphImage']:
## single = True
## else:
## single = False
for img in Node(url, session=session, cw=cw, media=node).imgs:
imgs.append(img)
if len(imgs) >= n_max:
break
return imgs
class Node(object):
def __init__(self, url, format=u'[%y-%m-%d] id_ppage', session=None, cw=None, media=None):
print('Node', url)
print_ = get_print(cw)
self.id = re.search(FORMAT_PIN, url).groups()[0]
self.imgs = []
self.session = session
if not media:
if False: # Original
j = get_sd(url, self.session, cw=cw)
data = j['entry_data']['PostPage'][0]['graphql']
else:
variables = {
"shortcode" : self.id,
"child_comment_count" : 3,
"fetch_comment_count" : 40,
"parent_comment_count" : 24,
"has_threaded_comments": True,
}
j = get_query('a9441f24ac73000fa17fe6e6da11d59d', variables, session, cw)
data = j['data']
media = data['shortcode_media']
if 'video_url' in media:
urls = [
media['video_url']]
elif 'edge_sidecar_to_children' in media:
edges = media['edge_sidecar_to_children']['edges']
urls = []
for edge in edges:
node = edge['node']
if 'video_url' in node:
url_ = node['video_url']
else:
url_ = node['display_resources'][(-1)]['src']
urls.append(url_)
else:
urls = [media['display_resources'][(-1)]['src']]
time = media['taken_at_timestamp']
self.date = datetime.fromtimestamp(time)
self.timeStamp = self.date.strftime(format).replace(':', u'\uff1a')
for p, img in enumerate(urls):
ext = os.path.splitext(img.split('?')[0].split('#')[0])[1]
filename = ('{}{}').format(self.timeStamp, ext).replace('id', str(self.id)).replace('page', str(p))
img = Image(img, url, filename)
self.imgs.append(img)
def get_imgs_all(url, title=None, cw=None, d=None, session=None, stories=True):
max_pid = get_max_range(cw)
url = clean_url(url)
if stories:
imgs_str = get_stories(url, title, cw=cw, session=session)
else:
imgs_str = []
max_pid = max(0, max_pid - len(imgs_str))
imgs = get_imgs(url, max_pid, title=title, cw=cw, session=session)
return imgs_str + imgs[:max_pid]
def get_imgs_single(url, session=None, cw=None):
node = Node(url, session=session, cw=cw)
return node.imgs
def get_stories(url, title=None, cw=None, session=None):
print_ = get_print(cw)
html = downloader.read_html(url, session=session)
data = get_sd(url, html=html, cw=cw)
uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
csrf_token = data['config']['csrf_token']#
session.cookies.set(name='ig_pr', value='1', path='/', domain='.instagram.com')
print('uploader_id:', uploader_id)
variables = {
'user_id': uploader_id,
'include_chaining': True,
'include_reel': True,
'include_suggested_users': False,
'include_logged_out_extras': False,
'include_highlight_reels': True,
'include_live_status': True,
}
j = get_query('d4d88dc1500312af6f937f7b804c68c3', variables, session, cw)
imgs = []
ids = set()
data = j['data']
hs = data['user']['edge_highlight_reels']
edges = hs['edges']
edges.insert(0, str(uploader_id))
for i, edge in enumerate(edges):
if isinstance(edge, str):
id = edge
hid = None
url_str = url
else:
id = None
hid = edge['node']['id']
url_str = 'https://www.instagram.com/stories/highlights/{}/'.format(hid)
try:
imgs_new = get_stories_single(url_str, id=id, cw=cw, session=session)
for img in imgs_new:
if img.id in ids:
print('duplicate: {}'.format(img.id))
continue
ids.add(img.id)
imgs.append(img)
print_('stories: {}'.format(hid))
except Exception as e:
print_(u'Failed to get stories: {}'.format(hid))
print(e)
msg = u'{} {} ({}/{})'.format(tr_(u'스토리 읽는 중...'), title, i+1, len(edges))
if cw:
if not cw.alive:
return
cw.setTitle(msg)
else:
print(msg)
imgs = sort_str(imgs)
return imgs
def sort_str(imgs):
imgs = sorted(imgs, key=lambda img: int(img.id), reverse=True)
return imgs
def get_stories_single(url, id=None, cw=None, session=None):
j = get_sd(url, session=session, cw=cw)
hid = re.find('/stories/highlights/([0-9]+)', url)
reel_ids = []
highlight_reel_ids = []
if hid is None:
if id is None:
id = get_id(url) # ???
reel_ids.append(str(id))
else:
highlight_reel_ids.append(str(hid))
print(id, hid)
variables = {
"reel_ids":reel_ids,
"tag_names":[],
"location_ids":[],
"highlight_reel_ids":highlight_reel_ids,
"precomposed_overlay":False,
"show_story_viewer_list":True,
"story_viewer_fetch_count":50,
"story_viewer_cursor":"",
"stories_video_dash_manifest":False
}
print(variables)
j = get_query('f5dc1457da7a4d3f88762dae127e0238', variables, session, cw)
data = j['data']
m = data['reels_media'][0]
items = m['items']
if not items:
raise Exception('no items')
imgs = []
for item in items:
id = item['id']
rs = item.get('video_resources') or item['display_resources']
r = rs[-1]
src = r['src']
ext = get_ext(src)
filename = u'stories_{}{}'.format(id, ext)
img = Image(src, url, filename, id=id)
imgs.append(img)
imgs = sort_str(imgs)
return imgs
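
get_query() above issues GET requests against Instagram's GraphQL endpoint with a query_hash and JSON-encoded variables. A standalone sketch of how such a URL is assembled, assuming update_url_query() amounts to appending an urlencoded query string; the uploader id is hypothetical:

import json
try:
    from urllib.parse import urlencode   # Python 3
except ImportError:
    from urllib import urlencode          # Python 2, which the decompiled sources still target

query_hash = '003056d32c2554def87228bc3fd9668a'   # the timeline query used in get_imgs() above
variables = {'id': '1234567890', 'first': 12}     # hypothetical uploader id
url_ = 'https://www.instagram.com/graphql/query/?' + urlencode(
    {'query_hash': query_hash, 'variables': json.dumps(variables)})
print(url_)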

View File

@@ -93,7 +93,13 @@ class Downloader_iwara(Downloader):
def read_channel(url, type_, cw=None):
print_ = get_print(cw)
username = re.find(r'/users/([^/]+)', url, err='no username')
html = downloader.read_html(url)
soup = Soup(html)
if soup.find('div', id='block-mainblocks-user-connect'):
username = re.find(r'''/messages/new\?user=(.+)['"]''', html, err='no username')
else:
username = re.find(r'/users/([^/]+)', url, err='no username')
print_('username: {}'.format(username))
info = {}
urls = []
urls_set = set()
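
The new branch above pulls the username out of the "send message" link when the profile block is present. A standalone sketch with the stdlib re module and hypothetical markup:

import re

html = '<a href="/messages/new?user=some_user">Send message</a>'  # hypothetical
print(re.search(r'''/messages/new\?user=(.+)['"]''', html).group(1))  # some_user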

View File

@@ -0,0 +1,79 @@
import downloader
from utils import Downloader, Soup, get_print, json_loads, compatstr, LazyUrl, format_filename, clean_title
import devtools
import js2py
import ree as re
from m3u8_tools import playlist2stream
from io import BytesIO
@Downloader.register
class Downloader_javfinder(Downloader):
type = 'javfinder'
URLS = ['javfinder.la']
single = True
display_name = 'JavFinder'
def read(self):
video = Video(self.url, cw=self.cw)
self.urls.append(video.url)
self.setIcon(video.thumb)
self.title = video.title
class Video(object):
def __init__(self, url, cw=None):
info = solve(url, cw=cw)
url_video = info['file']
stream = playlist2stream(url_video, n_thread=4)
self.url = LazyUrl(url, lambda x: stream, self)
self.title = info['title']
id = info['id']
self.filename = format_filename(self.title, id, '.mp4')
self.thumb = BytesIO()
downloader.download(info['url_thumb'], buffer=self.thumb)
def solve(url, cw=None):
print_ = get_print(cw)
info = {}
res = devtools.watch_network(url, cw=cw)
#html = res['html']
html = downloader.read_html(url) # ???
soup = Soup(html)
info['title'] = soup.find('h1').text.strip()
info['url_thumb'] = soup.find('meta', {'property': 'og:image'})['content'].strip()
for r in res['rs']:
url_player = r.url()
if 'streamsb.net/embed-' in url_player:
break
else:
raise Exception('no player')
print_('player: {}'.format(url_player))
info['id'] = ''#
html = downloader.read_html(url_player, url)
soup = Soup(html)
for script in soup.findAll('script'):
script = script.string or ''
if 'function(p,a,c,k,e,d)' in script:
break
else:
raise Exception('no function(p,a,c,k,e,d)')
js = script.strip()[5:-1].replace('function(p,a,c,k,e,d)', 'function hack(p,a,c,k,e,d)').replace('return p}', 'return p};hack')
context = js2py.EvalJs()
t = context.eval(js)
sources = re.find(r'sources *: *(\[\{.+?\}\])', t, err='no sources')
sources = json_loads(sources)
info['file'] = sources[0]['file']
return info
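
After the packed player script is unpacked via js2py, solve() above pulls the HLS source out of the resulting JavaScript text. A standalone sketch of that last step with stdlib re/json and a hypothetical unpacked snippet:

import json
import re

t = 'player.setup({sources:[{"file":"https://example.com/hls/master.m3u8"}],image:"x.jpg"});'  # hypothetical
sources = re.search(r'sources *: *(\[\{.+?\}\])', t).group(1)
print(json.loads(sources)[0]['file'])  # https://example.com/hls/master.m3u8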

View File

@@ -0,0 +1,207 @@
import downloader
from utils import Soup, urljoin, Downloader, fix_title, Session, get_print, LazyUrl, clean_title, get_imgs_already
import ree as re
from timee import sleep
from translator import tr_
import os
from constants import try_n, clean_url
import urllib, page_selector
import bs4
PATTERN = r'jmana[0-9]*.*/(comic_list_title|book)\?book'
PATTERN_ALL = r'jmana[0-9]*.*/(comic_list_title|book|bookdetail)\?book'
PATTERN_ID = '[?&]bookdetailid=([0-9]+)'
class Image(object):
def __init__(self, url, page, p):
self.url = LazyUrl(page.url, lambda _: url, self)
ext = '.jpg'
name = (u'{:04}{}').format(p, ext)
self.filename = (u'{}/{}').format(page.title, name)
class Page(object):
def __init__(self, title, url):
self.title = clean_title(title)
self.url = url
self.id = int(re.find(PATTERN_ID, url))
@Downloader.register
class Downloader_jmana(Downloader):
type = 'jmana'
URLS = ['regex:'+PATTERN_ALL]
MAX_CORE = 8
_soup = None
def init(self):
self.url = clean_url(self.url)
self.session = Session()
if re.search(PATTERN_ID, self.url): #1799
select = self.soup.find('select', class_='bookselect')
for i, op in enumerate(select.findAll('option')[::-1]):
if 'selected' in op.attrs:
break
else:
raise Exception('no selected option')
for a in self.soup.findAll('a'):
url = urljoin(self.url, a.get('href') or '')
if re.search(PATTERN, url):
break
else:
raise Exception('list not found')
self.url = self.fix_url(url)
self._soup = None
for i, page in enumerate(get_pages(self.url, self.session, self.soup)):
if page.id == int(op['value']):
break
else:
raise Exception('can not find page')
self.cw.range_p = [i]
@classmethod
def fix_url(cls, url):
return url
@property
def soup(self):
if self._soup is None:
html = downloader.read_html(self.url, session=self.session)
soup = Soup(html)
self._soup = soup
return self._soup
@property
def name(self):
title = get_title(self.soup)
artist = get_artist(self.soup)
title = fix_title(self, title, artist)
return title
def read(self):
title = self.name
artist = get_artist(self.soup)
self.artist = artist
for img in get_imgs(self.url, title, self.session, soup=self.soup, cw=self.cw):
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = self.name
def get_title(soup):
a = soup.find('a', class_='tit')
if a:
return a.text.strip()
return re.find(r'제목 *: *(.+)', soup.find('a', class_='tit').text, err='no title')
def get_artist(soup):
return re.find(r'작가 *: *(.+)', soup.text, default='').strip() or 'N/A'
@try_n(4, sleep=60)
def get_imgs_page(page, referer, session, cw=None):
print_ = get_print(cw)
sleep(5, cw) #2017
html = downloader.read_html(page.url, referer, session=session)
inserted = re.find(r'''var *inserted *= *['"](.*?)['"]''', html)
print_('inserted: {}'.format(inserted))
inserted = set(int(i) for i in inserted.split(',')) if inserted else set()
soup = Soup(html)
view = soup.find(class_='pdf-wrap')
imgs = []
for i, img in enumerate(child for child in view.children if isinstance(child, bs4.element.Tag)):
src = img.get('data-src') or img.get('src') or ''
if i in inserted:
print_('remove: {}'.format(src))
continue
if not src:
continue
src = urljoin(page.url, src.strip())
if '/adimg/' in src:
print('adimg:', src)
continue
if '/notice' in src:
print('notice:', src)
continue
img = Image(src, page, len(imgs))
imgs.append(img)
return imgs
def get_pages(url, session=None, soup=None):
if soup is None:
html = downloader.read_html(url, session=session)
soup = Soup(html)
pages = []
for inner in soup.findAll('div', class_='inner'):
a = inner.find('a')
if not a:
continue
href = a.attrs.get('href', '')
if not re.search(PATTERN_ID, href):
continue
if a.find('img'):
print('skip img', a.attrs.get('href'))
continue
href = urljoin(url, href)
title_page = a.text
page = Page(title_page, href)
pages.append(page)
pages = list(reversed(pages))
return pages
@page_selector.register('jmana')
@try_n(4)
def f(url):
if re.search(PATTERN_ID, url):
raise Exception(tr_(u'목록 주소를 입력해주세요'))
session = Session()
pages = get_pages(url, session=session)
return pages
def get_imgs(url, title, session, soup=None, cw=None):
print_ = get_print(cw)
if soup is None:
html = downloader.read_html(url, session=session)
soup = Soup(html)
pages = get_pages(url, soup=soup)
print_('pages: {}'.format(len(pages)))
pages = page_selector.filter(pages, cw)
imgs = []
for i, page in enumerate(pages):
imgs_already = get_imgs_already('jmana', title, page, cw)
if imgs_already:
imgs += imgs_already
continue
imgs += get_imgs_page(page, url, session, cw)
if cw is not None:
if not cw.alive:
return
cw.setTitle((u'{} {} / {} ({} / {})').format(tr_(u'읽는 중...'), title, page.title, i + 1, len(pages)))
if not imgs:
raise Exception('no imgs')
return imgs
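
get_imgs_page() above drops ad images whose positions are listed in the page's inserted variable. A tiny standalone sketch of that filtering with hypothetical values:

inserted = '3,7'  # hypothetical value from "var inserted = '3,7'"
inserted = set(int(i) for i in inserted.split(',')) if inserted else set()
srcs = ['001.jpg', '002.jpg', '003.jpg', 'ad.jpg', '004.jpg', '005.jpg', '006.jpg', 'ad2.jpg']
kept = [src for i, src in enumerate(srcs) if i not in inserted]
print(kept)  # the two ad entries at indices 3 and 7 are removed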

View File

@@ -0,0 +1,192 @@
import downloader
import ree as re
from utils import Session, LazyUrl, Soup, Downloader, try_n, get_print, clean_title, print_error, urljoin
from time import sleep
from translator import tr_
import page_selector
import json
UA = downloader.hdr['User-Agent']
class Page(object):
def __init__(self, id_, title):
self.id_ = id_
self.title = title
self.url = 'https://page.kakao.com/viewer?productId={}'.format(id_)
class Image(object):
def __init__(self, url, page, p):
self.url = LazyUrl('https://page.kakao.com/', lambda _: url, self)
ext = '.jpg'
self.filename = '{}/{:04}{}'.format(clean_title(page.title), p, ext)
@Downloader.register
class Downloader_kakaopage(Downloader):
type = 'kakaopage'
URLS = ['page.kakao.com/home']
MAX_CORE = 8
MAX_SPEED = 4.0
display_name = 'KakaoPage'
def init(self):
self.session = Session()
self.session.headers['User-Agent'] = UA
@classmethod
def fix_url(cls, url):
id = re.find('/home/.+?/([0-9]+)', url)
if id is not None:
url = id
if url.isdecimal():
url = 'https://page.kakao.com/home?seriesId={}'.format(url)
return url
def read(self):
info = get_info(self.url, self.session, cw=self.cw)
for img in info['imgs']:
self.urls.append(img.url)
self.artist = info['artist']
self.title = clean_title('[{}] {}'.format(info['artist'], info['title']))
def get_id(url):
id_ = re.find('seriesId=([0-9]+)', url, err='No seriesId')
return id_
def get_pages(url, session):
id_ = get_id(url)
pages = []
ids = set()
for p in range(100):
url_api = 'https://api2-page.kakao.com/api/v5/store/singles'
data = {
'seriesid': id_,
'page': str(p),
'direction': 'asc',
'page_size': '20',
'without_hidden': 'true',
}
r = session.post(url_api, data=data, headers={'Referer': url})
print(p, r)
data = r.json()
singles = data['singles']
if not singles:
print('no singles')
break
for single in singles:
title_page = single['title']
id_page = single['id']
if id_page in ids:
print('dup id')
continue
ids.add(id_page)
page = Page(id_page, title_page)
pages.append(page)
sleep(.5)
return pages
@try_n(2)
def get_imgs_page(page, session):
html = downloader.read_html(page.url, session=session)
did = re.find('"did" *: *"(.+?)"', html, err='no did')
url_api = 'https://api2-page.kakao.com/api/v1/inven/get_download_data/web'
data = {
'productId': page.id_,
'device_mgr_uid': 'Windows - Chrome',
'device_model': 'Windows - Chrome',
'deviceId': did,
}
print(data)
r = session.post(url_api, data=data, headers={'Referer': page.url})
data = r.json()
if data['result_code']:
raise Exception(data['message'])
imgs = []
for file in data['downloadData']['members']['files']:
url = file['secureUrl']
url = urljoin('https://page-edge-jz.kakao.com/sdownload/resource/', url)
img = Image(url, page, len(imgs))
imgs.append(img)
return imgs
def get_info(url, session, cw=None):
print_ = get_print(cw)
pages = get_pages(url, session)
pages = page_selector.filter(pages, cw)
if not pages:
raise Exception('no pages')
info = {}
html = downloader.read_html(url, session=session)
soup = Soup(html)
__NEXT_DATA__ = soup.find('script', id='__NEXT_DATA__')
if __NEXT_DATA__:
data = json.loads(__NEXT_DATA__.string)
tid = data['props']['initialState']['common']['constant']['tid']
print_('tid: {}'.format(tid))
session.cookies['_kptid'] = tid
html = downloader.read_html(url, session=session)
soup = Soup(html)
title = soup.find('h2').text.strip()
info['title'] = title
artist = soup.find('meta', {'name': 'author'})['content']
for x in [' ,', ', ']:
while x in artist:
artist = artist.replace(x, ',')
artist = artist.replace(',', ', ')
info['artist'] = artist
imgs = []
for i, page in enumerate(pages):
if cw is not None:
if not cw.alive:
return
cw.setTitle('{} {} / {} ({} / {})'.format(tr_('읽는 중...'), title, page.title, i + 1, len(pages)))
try:
_imgs = get_imgs_page(page, session)
e_msg = None
except Exception as e:
_imgs = []
e_msg = print_error(e)[0]
print_('{} {}'.format(page.title, len(_imgs)))
if e_msg:
print_(e_msg)
imgs += _imgs
sleep(.2)
if not imgs:
raise Exception('no imgs')
info['imgs'] = imgs
return info
@page_selector.register('kakaopage')
@try_n(4)
def f(url):
if 'seriesId=' not in url:
raise Exception(tr_('목록 주소를 입력해주세요'))
pages = get_pages(url, Session())
return pages
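
A standalone sketch of fix_url() above with the stdlib re module (ree.find returns the first capture group or None); the series id is hypothetical:

import re

def fix_url(url):
    m = re.search('/home/.+?/([0-9]+)', url)
    if m is not None:
        url = m.group(1)
    if url.isdecimal():
        url = 'https://page.kakao.com/home?seriesId={}'.format(url)
    return url

print(fix_url('https://page.kakao.com/home/some-title/12345678'))  # hypothetical
print(fix_url('12345678'))  # both resolve to .../home?seriesId=12345678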

View File

@@ -0,0 +1,55 @@
import downloader
import ytdl
from utils import Downloader, try_n, LazyUrl, get_ext, format_filename
from io import BytesIO as IO
from m3u8_tools import M3u8_stream
@Downloader.register
class Downloader_vlive(Downloader):
type = 'kakaotv'
URLS = ['tv.kakao']
single = True
display_name = 'KakaoTV'
@classmethod
def fix_url(cls, url):
return url.split('?')[0].strip('/')
def read(self):
video = Video(self.url)
video.url()#
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
class Video(object):
_url = None
def __init__(self, url):
self.url = LazyUrl(url, self.get, self)
@try_n(2)
def get(self, url):
if self._url:
return self._url
ydl = ytdl.YoutubeDL()
info = ydl.extract_info(url)
fs = [f for f in info['formats'] if f['ext'] == 'mp4']
f = sorted(fs, key=lambda f: f['height'])[-1]
self._url = f['url']
self.thumb_url = info['thumbnails'][0]['url']
self.thumb = IO()
downloader.download(self.thumb_url, buffer=self.thumb)
self.title = info['title']
ext = get_ext(self._url)
self.filename = format_filename(self.title, info['id'], ext)
return self._url

View File

@@ -0,0 +1,72 @@
import downloader
from utils import Soup, urljoin, Downloader, LazyUrl, Session, try_n, format_filename, clean_title
from timee import sleep
import ree as re
from io import BytesIO
import clf2
@Downloader.register
class Downloader_kissjav(Downloader):
type = 'kissjav'
URLS = ['kissjav.com']
single = True
display_name = 'KissJAV'
def read(self):
video = get_video(self.url)
self.urls.append(video.url)
self.setIcon(video.thumb)
self.session = get_session(self.url, cw=self.cw)
self.enableSegment(1024*1024//2)
self.title = video.title
def get_video(url):
html = downloader.read_html(url)
soup = Soup(html)
view = soup.find('div', id='player-container-fluid')
src_best = None
res_best = -1
for source in view.findAll('source'):
src = urljoin(url, source.attrs['src'])
res = re.find('([0-9]+)p', source.attrs['title'])
res = int(res) if res else 0
if res > res_best:
src_best = src
res_best = res
if src_best is None:
raise Exception('No source')
title = soup.find('h1').text.strip()
id = soup.find('div', id='video').attrs['data-id']
url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
#src_best = downloader.real_url(src_best)
video = Video(src_best, url_thumb, url, title, id)
return video
class Video(object):
def __init__(self, url, url_thumb, referer, title, id):
self.title = title
self.filename = format_filename(title, id, '.mp4')
self.url = LazyUrl(referer, lambda x: url, self)
self.thumb = BytesIO()
self.url_thumb = url_thumb
downloader.download(url_thumb, buffer=self.thumb)
@try_n(2)
def get_session(url, cw=None):
session = Session()
clf2.solve(url, session=session, cw=cw)
return session

View File

@@ -0,0 +1,165 @@
#coding:utf8
import downloader
from utils import Soup, urljoin, LazyUrl, Downloader, try_n, Session, clean_title, get_print
import os
from translator import tr_
import page_selector
import clf2
import utils
import base64
from image_reader import QPixmap
class Image(object):
def __init__(self, url, page, p):
self._url = url
self.url = LazyUrl(page.url, self.get, self)#, pp=self.pp)
ext = os.path.splitext(url)[1]
if ext.lower()[1:] not in ['jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp']:
ext = '.jpg'
self.filename = u'{}/{:04}{}'.format(page.title, p, ext)
def get(self, _):
return self._url
## def pp(self, filename):
## pixmap = QPixmap(filename)
## pixmap.save(filename)
## return filename
class Page(object):
def __init__(self, title, url):
self.title = clean_title(title)
self.url = url
@Downloader.register
class Downloader_lhscan(Downloader):
type = 'lhscan'
URLS = ['lhscan.net', 'loveheaven.net', 'lovehug.net']
MAX_CORE = 16
display_name = 'LHScan'
_soup = None
def init(self):
self.url = self.url.replace('lhscan.net', 'loveheaven.net')
self.session = Session()
#clf2.solve(self.url, session=self.session, cw=self.cw)
soup = self.soup
if not soup.find('ul', class_='manga-info'):
self.Invalid(u'{}: {}'.format(tr_(u'목록 주소를 입력해주세요'), self.url))
@property
def soup(self):
if self._soup is None:
for try_ in range(8):
try:
html = downloader.read_html(self.url, session=self.session)
break
except Exception as e:
print(e)
else:
raise
self._soup = Soup(html)
return self._soup
@property
def name(self):
title = self.soup.findAll('span', {'itemprop': 'name'})[-1].text.strip()
return clean_title(title)
def read(self):
self.title = tr_(u'읽는 중... {}').format(self.name)
imgs = get_imgs(self.url, self.name, self.session, self.soup, self.cw)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
@try_n(8)
def get_imgs_page(page, session, cw=None):
print_ = get_print(cw)
print_(page.title)
html = downloader.read_html(page.url, session=session)
soup = Soup(html)
view = soup.find('div', class_='chapter-content')
if not view:
raise Exception('no chapter-content')
imgs = []
for img in soup.findAll('img', class_='chapter-img'):
src = img.get('data-pagespeed-lazy-src') or img.get('data-src') or img.get('data-srcset') or img.get('data-aload') or img['src']
try:
src = base64.b64decode(src).strip().decode('utf8')
except:
pass
src = urljoin(page.url, src)
if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
continue
if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
continue
if 'LoveHug_600cfd96e98ff.jpg' in src:
continue
img = Image(src.strip(), page, len(imgs))
imgs.append(img)
return imgs
def get_pages(url, session, soup=None, cw=None):
if soup is None:
html = downloader.read_html(url, session=session)
soup = Soup(html)
tab = soup.find('ul', class_='list-chapters')
pages = []
for li in tab.findAll('li'):
text = li.find('div', class_='chapter-name').text.strip()
href = li.parent['href']
href = urljoin(url, href)
page = Page(text, href)
pages.append(page)
if not pages:
raise Exception('no pages')
return pages[::-1]
@page_selector.register('lhscan')
@try_n(4)
def f(url):
session = Session()
#clf2.solve(url, session=session)
pages = get_pages(url, session)
return pages
@try_n(2)
def get_imgs(url, title, session, soup=None, cw=None):
if soup is None:
html = downloader.read_html(url, session=session)
soup = Soup(html)
pages = get_pages(url, session, soup, cw)
pages = page_selector.filter(pages, cw)
imgs = []
for i, page in enumerate(pages):
imgs += get_imgs_page(page, session, cw)
s = u'{} {} / {} ({} / {})'.format(tr_(u'읽는 중...'), title, page.title, i+1, len(pages))
if cw is not None:
if not cw.alive:
return
cw.setTitle(s)
else:
print(s)
return imgs
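
Some chapter images above carry a base64-encoded URL in their data attributes; get_imgs_page() tries to decode and silently falls back to the raw value when decoding fails. A short sketch of the decode with a hypothetical encoded value:

import base64

src = 'aHR0cHM6Ly9leGFtcGxlLmNvbS8wMDEuanBn'  # hypothetical data-src value
print(base64.b64decode(src).strip().decode('utf8'))  # https://example.com/001.jpg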

View File

@@ -0,0 +1,119 @@
import downloader
from utils import Session, Downloader, get_ext, LazyUrl, get_print
import ree as re
import json
from io import BytesIO
from translator import tr_
@Downloader.register
class Downloader_likee(Downloader):
type = 'likee'
URLS = ['likee.video']
single = True
display_name = 'Likee'
def init(self):
self.session = Session()
def read(self):
info = get_info(self.url, self.session, self.cw)
self.print_('type: {}'.format(info['type']))
self.artist = info['artist']
if info['type'] != 'single':
video = self.process_playlist(info['title'], info['videos'])
else:
video = info['videos'][0]
video.url()
self.urls.append(video.url)
self.title = info['title']
thumb = BytesIO()
downloader.download(video.url_thumb, referer=self.url, buffer=thumb)
self.setIcon(thumb)
def get_info(url, session, cw=None):
print_ = get_print(cw)
info = {}
info['videos'] = []
if '/video/' in url:
info['type'] = 'single'
video = Video(url, session)
video.url()
info['videos'].append(video)
info['title'] = video.id_
info['artist'] = video.artist
return info
info['type'] = 'channel'
html = downloader.read_html(url, session=session)
data_raw = html.split('window.data = ')[1].split('};')[0]+'}'
data = json.loads(data_raw)
info['uid'] = data['userinfo']['uid']
info['username'] = data['userinfo']['yyuid']
info['artist'] = data['userinfo']['nick_name']
info['title'] = '{} (likee_{})'.format(info['artist'], info['username'])
lastPostId = ''
urls = set()
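# Page through the channel with the getUserVideo endpoint, using the last post id as a cursor;
# stop when the API returns an empty video list.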
while True:
url_api = 'https://likee.video/official_website/VideoApi/getUserVideo'
r = session.post(url_api, data={'uid': info['uid'], 'count': '30', 'lastPostId': lastPostId})
data = json.loads(r.text)
videos = data['data']['videoList']
if not videos:
break
for data in videos:
url_post = 'https://likee.video/@{}/video/{}'.format(data['likeeId'], data['postId'])
if url_post in urls:
print_('duplicate: {}'.format(url_post))
continue
urls.add(url_post)
video = Video(url_post, session, data)
video.url()
info['videos'].append(video)
lastPostId = data['postId']
msg = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(info['videos']))
if cw:
if not cw.alive:
return
cw.setTitle(msg)
else:
print(msg)
return info
class Video(object):
def __init__(self, url, session, data=None):
self.id_ = re.find('/video/([0-9]+)', url, err='no id')
self._session = session
self._data = data
self.url = LazyUrl(url, self.get, self)
def get(self, url):
if self._data:
video = self._data
else:
url_api = 'https://likee.video/official_website/VideoApi/getVideoInfo'
r = self._session.post(url_api, data={'postIds': str(self.id_)})
data = json.loads(r.text)
video = data['data']['videoList'][0]
url_video = video['videoUrl']
self.url_thumb = video['coverUrl']
self.artist = video['nickname']
ext = get_ext(url_video)
self.title = self.id_
self.filename = '{}{}'.format(self.id_, ext)
return url_video

View File

@@ -0,0 +1,145 @@
#coding:utf8
import downloader
from utils import Soup, Downloader, LazyUrl, urljoin, try_n, get_outdir, clean_title
import ree as re
import os
from timee import sleep
from translator import tr_
from io import BytesIO
import json
class Image(object):
def __init__(self, item, referer):
self.item = item
self.id = str(item['id'])
self.referer = referer
self.url = LazyUrl(referer, self.get, self)
def get(self, url):
img = urljoin(url, self.item['url_to_original'])
ext = os.path.splitext(img.split('?')[0])[1]
self.filename = u'{}{}'.format(self.id, ext)
return img
class Video(object):
def __init__(self, url, title, url_thumb):
self.url = url
self.title = title
ext = os.path.splitext(url.split('?')[0])[1]
self.filename = u'{}{}'.format(clean_title(title), ext)
self.url_thumb = url_thumb
self.thumb = BytesIO()
downloader.download(self.url_thumb, buffer=self.thumb)
@Downloader.register
class Downloader_luscious(Downloader):
type = 'luscious'
URLS = ['luscious.net']
MAX_CORE = 4
@classmethod
def fix_url(cls, url):
url = url.replace('members.luscious.', 'www.luscious.')
return url
def read(self):
url = fix_url(self.url)
for try_ in range(8):
try:
html = downloader.read_html(url)
break
except Exception as e:
print(e)
self.print_('retry...')
else:
raise
soup = Soup(html)
title = clean_title(get_title(soup))
self.title = tr_(u'읽는 중... {}').format(title)
if '/videos/' in url:
video = get_video(url, soup)
imgs = [video]
self.setIcon(video.thumb)
else:
imgs = get_imgs(url, soup, self.cw)
dir = os.path.join(get_outdir(self.type), title)
names = {}
try:
for name in os.listdir(dir):
id = os.path.splitext(name)[0]
names[id] = name
except:
pass
for img in imgs:
if img.id in names:
url = os.path.join(dir, names[img.id])
else:
url = img.url
self.urls.append(url)
self.title = title#
def update(cw, title, imgs):
s = u'{} {} ({})'.format(tr_(u'읽는 중...'), title, len(imgs))
if cw is not None:
cw.setTitle(s)
else:
print(s)
def fix_url(url):
url = re.sub(r'[^./]+\.luscious', 'legacy.luscious', url)
return url
def get_imgs(url, soup=None, cw=None):
url = fix_url(url)
if soup is None:
html = downloader.read_html(url)
soup = Soup(html)
title = get_title(soup)
imgs = []
for p in range(1, 81):
imgs_new = get_imgs_p(url, p)
if not imgs_new:
break
imgs += imgs_new
update(cw, title, imgs)
return imgs
@try_n(4, sleep=30)
def get_imgs_p(url, p=1):
id = re.find('/albums/[^/]+?([0-9]+)/', url+'/')
print(url, id)
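# url_api below is a URL-encoded GraphQL query (AlbumListOwnPictures) filtered by album_id;
# the two format placeholders are the album id and the page number.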
url_api = 'https://api.luscious.net/graphql/nobatch/?operationName=AlbumListOwnPictures&query=+query+AlbumListOwnPictures%28%24input%3A+PictureListInput%21%29+%7B+picture+%7B+list%28input%3A+%24input%29+%7B+info+%7B+...FacetCollectionInfo+%7D+items+%7B+...PictureStandardWithoutAlbum+%7D+%7D+%7D+%7D+fragment+FacetCollectionInfo+on+FacetCollectionInfo+%7B+page+has_next_page+has_previous_page+total_items+total_pages+items_per_page+url_complete+%7D+fragment+PictureStandardWithoutAlbum+on+Picture+%7B+__typename+id+title+created+like_status+number_of_comments+number_of_favorites+status+width+height+resolution+aspect_ratio+url_to_original+url_to_video+is_animated+position+tags+%7B+category+text+url+%7D+permissions+url+thumbnails+%7B+width+height+size+url+%7D+%7D+&variables=%7B%22input%22%3A%7B%22filters%22%3A%5B%7B%22name%22%3A%22album_id%22%2C%22value%22%3A%22{}%22%7D%5D%2C%22display%22%3A%22position%22%2C%22page%22%3A{}%7D%7D'.format(id, p)
data_raw = downloader.read_html(url_api, referer=url)
data = json.loads(data_raw)
has_next_page = data['data']['picture']['list']['info']['has_next_page']
imgs = []
for item in data['data']['picture']['list']['items']:
img = Image(item, url)
imgs.append(img)
return imgs
def get_video(url, soup):
url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
title = re.find('videos/([^/]+)', url)
video = soup.find('video')
url = video.source.attrs['src']
video = Video(url, title, url_thumb)
return video
def get_title(soup):
return soup.find('h2').text.strip()

View File

@@ -0,0 +1,33 @@
from utils import Downloader, LazyUrl, clean_title
from m3u8_tools import playlist2stream, M3u8_stream
import os
@Downloader.register
class Downloader_m3u8(Downloader):
type = 'm3u8'
URLS = ['.m3u8']
single = True
display_name = 'M3U8'
def init(self):
if '://' not in self.url:
self.url = 'http://' + self.url
def read(self):
video = Video(self.url)
self.urls.append(video.url)
self.title = video.title
class Video(object):
def __init__(self, url):
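# Try to interpret the URL as a variant playlist first; fall back to treating it as a plain
# media playlist (assumed division of labour between playlist2stream and M3u8_stream).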
try:
m = playlist2stream(url)
except:
m = M3u8_stream(url)
self.url = LazyUrl(url, lambda _: m, self)
self.title = os.path.splitext(os.path.basename(url))[0]
self.filename = clean_title(self.title, n=-4) + '.mp4'

View File

@@ -0,0 +1,211 @@
#coding:utf8
import downloader
from utils import Soup, urljoin, LazyUrl, Downloader, query_url, try_n, Session, get_print, clean_title
import os
from translator import tr_
from timee import sleep
import requests
import ree as re
import clf2#
class Image(object):
def __init__(self, url, p, page):
ext = os.path.splitext(url)[1]
if ext.lower()[1:] not in ['jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp']:
ext = '.jpg'
self.filename = u'{:04}{}'.format(p, ext)
if page.title is not None:
self.filename = u'{}/{}'.format(page.title, self.filename)
def f(_):
return url
self.url = LazyUrl(page.url, f, self)
class Page(object):
def __init__(self, title, url, soup=None):
self.title = clean_title(title)
self.url = url
self.soup = soup
@Downloader.register
class Downloader_mrm(Downloader):
type = 'mrm'
URLS = ['myreadingmanga.info']
_soup = None
MAX_CORE = 16
display_name = 'MyReadingManga'
def init(self):
self.session = get_session(self.url, self.cw)
@classmethod
def fix_url(cls, url):
return re.find('https?://myreadingmanga.info/[^/]+', url, err='err')
@property
def soup(self):
if self._soup is None:
for try_ in range(8):
try:
html = read_html(self.url, session=self.session, cw=self.cw)
break
except Exception as e:
e_ = e
self.print_(e)
else:
raise e_
self._soup = Soup(html)
return self._soup
@property
def name(self):
title = get_title(self.soup)
return title
def read(self):
self.title = u'읽는 중... {}'.format(self.name)
imgs = get_imgs(self.url, self.soup, self.session, self.cw)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
def get_title(soup):
title = soup.find('h1', class_='entry-title').text.strip()
title = fix_title(title)
title = clean_title(title)
return title
def get_imgs(url, soup=None, session=None, cw=None):
if soup is None:
html = read_html(url, session=session, cw=cw)
soup = Soup(html)
title = get_title(soup)
pagination = soup.find('div', class_='pagination')
if pagination is None:
page = Page(None, url, soup)
imgs = get_imgs_page(page, session=session)
else:
pages = get_pages(url, soup, session=session)
imgs = []
for i, page in enumerate(pages):
s = u'{} {} / {} ({} / {})'.format(tr_(u'읽는 중...'), title, page.title, i+1, len(pages))
if cw:
if not cw.alive:
return
cw.setTitle(s)
else:
print(s)
imgs += get_imgs_page(page, session=session)
if not imgs:
raise Exception('no imgs')
return imgs
def get_pages(url, soup=None, session=None):
if soup is None:
html = read_html(url, session=session, cw=None)
soup = Soup(html)
pagination = soup.find('div', class_='pagination')
pages = []
hrefs = set()
for a in pagination.findAll('a'):
href = a.attrs.get('href', '')
href = urljoin(url, href)
if not href.startswith(url):
print('not match', href)
continue
while href.endswith('/'):
href = href[:-1]
if href in hrefs:
print('duplicate', href)
continue
hrefs.add(href)
text = a.text.strip()
page = Page(text, href)
pages.append(page)
if url not in hrefs:
page = Page('1', url, soup)
pages.insert(0, page)
return pages
@try_n(4)
def get_imgs_page(page, session=None):
url = page.url
soup = page.soup
if soup is None:
html = read_html(url, session=session, cw=None)
soup = Soup(html)
page.soup = soup
view = soup.find('div', class_='entry-content')
imgs = []
for img in view.findAll('img'):
img = img.attrs.get('data-lazy-src') or img.attrs.get('data-src')
if img is None:
continue
img = urljoin(url, img)
img = Image(img, len(imgs), page)
imgs.append(img)
print(page.title, len(imgs), page.url)
return imgs
def fix_title(title):
title = re.sub(r'\(?[^()]*?c\.[^() ]+\)?', '', title)
while ' ' in title:
title = title.replace(' ', ' ')
return title
def read_html(url, session, cw):
## html = downloader.read_html(url, session=session)
## soup = Soup(html)
##
## cf = soup.find('div', class_='cf-browser-verification')
## if cf is None:
## return html
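# The commented-out direct fetch is kept for reference; pages are always loaded through
# clf2.solve to clear the Cloudflare protection, reusing the session from get_session.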
r = clf2.solve(url, cw=cw, session=session)
return r['html']
@try_n(4)
def get_session(url, cw=None):
print_ = get_print(cw)
## html = downloader.read_html(url)
## soup = Soup(html)
##
## cf = soup.find('div', class_='cf-browser-verification')
## if cf is None:
## print_('no cf protection')
## return None
print_('cf protection')
r = clf2.solve(url, cw=cw)
session = r['session']
return session

View File

@@ -0,0 +1,170 @@
#coding:utf-8
import downloader
import re
from utils import urljoin, Downloader, Soup, LazyUrl, clean_title
import json
from timee import sleep
import collections
PATTERNS = ['.*blog.naver.com/(?P<username>.+)/(?P<pid>[0-9]+)',
'.*blog.naver.com/.+?blogId=(?P<username>[^&]+).+?logNo=(?P<pid>[0-9]+)',
'.*?(?P<username>[0-9a-zA-Z_-]+)\.blog\.me/(?P<pid>[0-9]+)']
HDR = {
'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'ko, en-US; q=0.7, en; q=0.3',
'Connection': 'Keep-Alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
}
def get_id(url):
for pattern in PATTERNS:
m = re.match(pattern, url)
if m is None:
continue
username = m.group('username')
pid = m.group('pid')
break
else:
username, pid = None, None
return username, pid
@Downloader.register
class Downloader_naver(Downloader):
type = 'naver'
URLS = ['blog.naver.', '.blog.me']
display_name = 'Naver Blog'
def init(self):
username, pid = get_id(self.url)
if username is None:
return self.Invalid('Invalid format')
self.url = 'https://blog.naver.com/{}/{}'.format(username, pid)
self.headers = {'User-Agent': downloader.hdr['User-Agent']}
@property
def name(self):
username, pid = get_id(self.url)
return clean_title(u'{}/{}'.format(username, pid))
def read(self):
self.title = u'읽는 중... {}'.format(self.name)
imgs = get_imgs(self.url)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
class Image(object):
def __init__(self, url):
self.url = url
class Video(object):
def __init__(self, url, referer, p):
self.url = LazyUrl(referer, lambda _: url, self)
self.filename = 'video_{}.mp4'.format(p)
def read_page(url, depth=0):
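# Follow the blog's frame (or fall back to the mobile PostView URL for short responses)
# until the page containing viewTypeSelector is reached; depth guards against redirect loops.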
print('read_page', url, depth)
if depth > 10:
raise Exception('Too deep')
html = downloader.read_html(url, header=HDR)
if len(html) < 5000:
id = re.findall('logNo=([0-9]+)', html)[0]
usernames = re.findall('blog.naver.com/([0-9a-zA-Z]+)', url)
if not usernames:
usernames = re.findall('blogId=([0-9a-zA-Z]+)', url)
username = usernames[0]
url = 'https://m.blog.naver.com/PostView.nhn?blogId={}&logNo={}&proxyReferer='.format(username, id)
print('###', username, id, url)
soup = Soup(html)
if soup.find('div', {'id': 'viewTypeSelector'}):
return url, soup
frame = soup.find('frame')
if frame is None:
print('frame is None')
return read_page(url, depth+1)
return read_page(urljoin('https://blog.naver.com', frame.attrs['src']), depth+1)
def get_imgs(url):
url = url.replace('blog.naver', 'm.blog.naver')
url_frame, soup = read_page(url)
imgs = []
urls = set()
view = soup.find('div', {'id': 'viewTypeSelector'})
print('view', view is not None)
imgs_ = view.findAll('span', class_='_img') + view.findAll('img')
for img in imgs_:
url = img.attrs.get('src', None)
if url is None:
url = img.attrs.get('thumburl', None)
if url is None:
print(u'invalid img: {}'.format(url))
continue
if 'ssl.pstatic.net' in url: #
continue
if 'blogpfthumb-phinf.pstatic.net' in url: # profile
continue
if 'dthumb-phinf.pstatic.net' in url: # link
continue
if 'storep-phinf.pstatic.net' in url: # emoticon
continue
url = url.replace('mblogthumb-phinf', 'blogfiles')
#url = re.sub('\?type=[a-zA-Z0-9]*', '?type=w1@2x', url)
#url = re.sub('\?type=[a-zA-Z0-9]*', '', url)
url = url.split('?')[0]
if url in urls:
print('### Duplicate:', url)
continue
urls.add(url)
#url = url.split('?type=')[0]
img = Image(url)
imgs.append(img)
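# Collect (vid, inkey) pairs from '_naverVideo' spans and '__se_module_data' scripts,
# then resolve each pair to its largest file via the rmcnmv VOD API.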
pairs = []
for video in soup.findAll('span', class_='_naverVideo'):
vid = video.attrs['vid']
key = video.attrs['key']
pairs.append((vid, key))
for script in soup.findAll('script', class_='__se_module_data'):
data_raw = script['data-module']
data = json.loads(data_raw)['data']
vid = data.get('vid')
if not vid:
continue
key = data['inkey']
pairs.append((vid, key))
videos = []
for vid, key in pairs:
url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key)
data_raw = downloader.read_html(url_api)
data = json.loads(data_raw)
fs = data['videos']['list']
fs = sorted(fs, key=lambda f: f['size'], reverse=True)
video = Video(fs[0]['source'], url_frame, len(videos))
videos.append(video)
return imgs + videos

View File

@@ -0,0 +1,244 @@
# uncompyle6 version 3.5.0
# Python bytecode 2.7 (62211)
# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
# Embedded file name: navertoon_downloader.pyo
# Compiled at: 2019-10-03 10:19:35
import downloader
from utils import Soup, urljoin, Downloader, LazyUrl, get_imgs_already, clean_title, get_ext, get_print
from constants import try_n
import ree as re, os
from timee import sleep
import page_selector
from translator import tr_
import json
class Page(object):
def __init__(self, url, title, p):
self.url = url
self.title = title
self.p = p
class Image(object):
def __init__(self, url, page, p):
ext = get_ext(url)
self.filename = (u'{}/{:04}{}').format(clean_title(page.title), p, ext)
self.url = LazyUrl(page.url, lambda _: url, self)
class Info(object):
def __init__(self, id, title, artist):
self.id = id
self.title = title
self.artist = artist
@Downloader.register
class Downloader_navertoon(Downloader):
type = 'navertoon'
URLS = ['comic.naver.com']
MAX_CORE = 8
MAX_SPEED = 4.0
display_name = 'Naver Webtoon'
def init(self):
self.url = get_main(self.url)
self.__info, _ = get_pages(self.url, self.cw)
@property
def name(self):
id = self.__info.id
title = self.__info.title
artist = self.__info.artist
title = self.format_title('N/A', id, title, artist, 'N/A', 'N/A', 'Korean', prefix='navertoon_')
return clean_title(title)
def read(self):
self.title = tr_(u'읽는 중... {}').format(self.name)
imgs = get_imgs_all(self.url, self.name, cw=self.cw)
for img in imgs:
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = self.name
def get_main(url):
url_main = re.sub('[?&]page=[0-9]+', '', re.sub('[?&]no=[0-9]+', '', url)).replace('detail.nhn', 'list.nhn').replace('m.comic.naver.', 'comic.naver.')
while url_main.endswith('#'):
url_main = url_main[:-1]
return url_main
def set_no(url, p):
if '&no=' not in url:
url = url + ('&no={}').format(p)
return url
url = re.sub('&no=[0-9]+', ('&no={}').format(p), url)
return url
def get_id(url):
return int(url.lower().split('titleid=')[1].split('&')[0])
def set_page(url, p):
if '&page=' in url:
url = re.sub('&page=[0-9]+', ('&page={}').format(p), url)
else:
url += ('&page={}').format(p)
return url
@try_n(4)
def get_pages(url, cw=None):
print_ = get_print(cw)
url = get_main(url).replace('comic.naver.', 'm.comic.naver.')
id = get_id(url)
print('id:', id)
print(url)
html = downloader.read_html(url)
soup = Soup(html)
try:
info = soup.find('div', class_='area_info')
artist = info.find('span', class_='author').text.strip()
except Exception as e:
print(e)
try:
title = ('\n').join(soup.find('div', class_='title').text.strip().split('\n')[:-1]).strip()
except:
title = 'artist not found'
raise Exception(title)
print('artist:', artist)
title = soup.find('meta', {'property': 'og:title'}).attrs['content']
pages = []
nos = set()
for p in range(1, 100):
if p == 1:
url_page = url
else:
url_page = set_page(url, p)
html = downloader.read_html(url_page)
print('read page:', url_page)
soup = Soup(html)
view = soup.findAll('ul', class_='section_episode_list')[(-1)]
for lst in view.findAll('li'):
url_page = urljoin(url, lst.find('a').attrs['href'])
if 'detail.nhn' not in url_page.lower():
continue
print_('url_page: {}'.format(url_page))
text = lst.find('strong', class_='title').find('span', class_='name').text.strip()
no = int(re.findall('[?&]no=([0-9]+)', url_page)[0])
if no in nos:
print('duplicate no: {}'.format(no))
continue
nos.add(no)
text = '{:04} - {}'.format(no, text)
page = Page(url_page, text, p)
pages.append(page)
btn_next = soup.find('a', class_='btn_next')
if btn_next is None or btn_next.attrs['href'] == '#':
print('end of page')
break
info = Info(id, title, artist)
return (
info, pages)
@page_selector.register('navertoon')
@try_n(4)
def f(url):
url = get_main(url)
info, pages = get_pages(url)
return pages
@try_n(6)
def get_imgs(page, cw=None):
print_ = get_print(cw)
html = downloader.read_html(page.url)
soup = Soup(html)
type_ = re.find('''webtoonType *: *['"](.+?)['"]''', html)
print_('type: {}'.format(type_))
imgs = []
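# Mobile viewer pages come in several types: DEFAULT (plain image list), CUTTOON (swiper slides),
# EFFECTTOON (JSON asset document), with an sImageUrl regex fallback for anything else.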
if type_ == 'DEFAULT': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=715772
view = soup.find('div', class_='toon_view_lst')
for img in view.findAll('img'):
img = img.attrs.get('data-src')
if not img:
continue
img = urljoin(page.url, img)
img = Image(img, page, len(imgs))
imgs.append(img)
elif type_ == 'CUTTOON': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=752803
view = soup.find('div', class_='swiper-wrapper')
for div in view.findAll('div', class_='swiper-slide'):
if div.parent != view:
continue
if div.find('div', class_='cut_viewer_last'):
print('cut_viewer_last')
continue
if div.find('div', class_='cut_viewer_recomm'):
print('cut_viewer_recomm')
continue
img = div.find('img')
img = img.attrs['data-src']
img = urljoin(page.url, img)
img = Image(img, page, len(imgs))
imgs.append(img)
elif type_ == 'EFFECTTOON': #2313; https://m.comic.naver.com/webtoon/detail.nhn?titleId=670144
img_base = re.find('''imageUrl *: *['"](.+?)['"]''', html) + '/'
print('img_base:', img_base)
url_api = re.find('''documentUrl *: *['"](.+?)['"]''', html)
data_raw = downloader.read_html(url_api, page.url)
data = json.loads(data_raw)
for img in data['assets']['stillcut'].values(): # ordered in python3.7+
img = urljoin(img_base, img)
img = Image(img, page, len(imgs))
imgs.append(img)
else:
_imgs = re.findall('sImageUrl *: *[\'"](.+?)[\'"]', html)
if not _imgs:
raise Exception('no imgs')
for img in _imgs:
img = urljoin(page.url, img)
img = Image(img, page, len(imgs))
imgs.append(img)
return imgs
def get_imgs_all(url, title, cw=None):
print_ = get_print(cw)
info, pages = get_pages(url, cw)
pages = page_selector.filter(pages, cw)
imgs = []
for p, page in enumerate(pages):
imgs_already = get_imgs_already('navertoon', title, page, cw)
if imgs_already:
imgs += imgs_already
continue
imgs_new = get_imgs(page, cw)
print_('{}: {}'.format(page.title, len(imgs_new)))
imgs += imgs_new
if cw is not None:
cw.setTitle(tr_(u'읽는 중... {} / {} ({}/{})').format(title, page.title, p + 1, len(pages)))
if not cw.alive:
break
return imgs

View File

@@ -0,0 +1,63 @@
import downloader
import ree as re
from io import BytesIO as IO
import os
from constants import try_n
from error_printer import print_error
from utils import Downloader, compatstr, LazyUrl, get_ext, format_filename, clean_title
import ytdl
@Downloader.register
class Downloader_navertv(Downloader):
type = 'navertv'
single = True
URLS = ['tv.naver.com']
display_name = 'Naver TV'
def init(self):
if not re.match('https?://.+', self.url, re.IGNORECASE):
self.url = 'https://tv.naver.com/v/{}'.format(self.url)
def read(self):
video = Video(self.url)
video.url()#
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
class Video(object):
_url = None
def __init__(self, url):
self.url = LazyUrl(url, self.get, self)
@try_n(4)
def get(self, url):
if self._url:
return self._url
ydl = ytdl.YoutubeDL()
info = ydl.extract_info(url)
fs = [f for f in info['formats'] if f['protocol'] in ['http', 'https']]
fs = sorted(fs, key=lambda f: int(f.get('width', 0)), reverse=True)
if not fs:
raise Exception('No MP4 videos')
f = fs[0]
self._url = f['url']
self.thumb_url = info['thumbnails'][0]['url']
self.thumb = IO()
downloader.download(self.thumb_url, buffer=self.thumb)
self.title = info['title']
id = info['id']
ext = get_ext(self._url)
self.filename = format_filename(self.title, id, ext)
return self._url

View File

@@ -0,0 +1,97 @@
#coding:utf8
import downloader
import nndownload
from io import BytesIO
import ree as re
from utils import Downloader, get_print, compatstr, format_filename, clean_title, try_n
from nico_login import login, logout
def get_id(url):
if '/watch/' in url:
id = re.findall('/watch/([a-zA-Z0-9]+)', url)[0]
else:
id = url
return id
class Video(object):
def __init__(self, session, info):
self.session = session
self.info = info
self.url = info['url']
self.title = info['title']
self.ext = info['ext']
self.id = info['id']
self.fileName = format_filename(self.title, self.id, self.ext)
self.url_thumb = info['thumbnail_url']
print('thumb:', self.url_thumb)
self.thumb = BytesIO()
downloader.download(self.url_thumb, buffer=self.thumb)
def __repr__(self):
return u'Video({})'.format(self.id)
@Downloader.register
class Downloader_nico(Downloader):
type = 'nico'
single = True
URLS = ['nicovideo.jp']
display_name = 'Niconico'
def init(self):
if not re.match('https?://.+', self.url, re.IGNORECASE):
self.url = 'https://www.nicovideo.jp/watch/{}'.format(self.url)
@property
def id_(self):
return get_id(self.url)
def read(self):
ui_setting = self.ui_setting
if ui_setting.nicoBox.isChecked():
username = compatstr(ui_setting.nico_id.text())
password = compatstr(ui_setting.nico_pw.text())
else:
username = ''
password = ''
try:
session = login(username, password)
except Exception as e:
logout()
return self.Invalid(u'Failed to login: {}'.format(self.url), fail=True)
self.session = session
try:
video = get_video(session, self.id_, cw=self.cw)
except Exception as e:
logout()
raise
self.urls.append(video.url)
self.filenames[video.url] = video.fileName
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
@try_n(2)
def get_video(session, id, cw=None):
print_ = get_print(cw)
try:
info = nndownload.request_video(session, id)
except:
raise Exception('Err')
video = Video(session, info)
return video

View File

@@ -0,0 +1,164 @@
#coding: utf-8
import downloader
from utils import Downloader, urljoin, get_max_range, query_url, Soup, Session, LazyUrl, get_print, clean_title, try_n, get_ext
from translator import tr_
from constants import clean_url
import ree as re
from errors import LoginRequired
def get_id(url):
return re.find('id=([0-9]+)', url)
def get_name(soup):
return soup.find('p', class_='user_icon').find('a', class_='name').text.strip()
def isLogin(soup):
if soup.find('ul', id="sub-menu"):
return True
return False
@Downloader.register
class Downloader_nijie(Downloader):
type = 'nijie'
URLS = ['nijie.info']
MAX_CORE = 4
display_name = 'ニジエ'
def init(self):
if 'members.php' not in self.url and 'members_illust.php' not in self.url:
raise NotImplementedError()
id = get_id(self.url)
html = downloader.read_html('https://nijie.info/members.php?id={}'.format(id))
self.soup = Soup(html)
if not isLogin(self.soup):
raise LoginRequired()
@classmethod
def fix_url(cls, url):
if 'nijie.info' not in url.lower():
url = 'https://nijie.info/members.php?id={}'.format(url)
return url.replace('http://', 'https://')
@property
def name(self):
name = u'{} (nijie_{})'.format(get_name(self.soup), get_id(self.url))
return clean_title(name)
def read(self):
self.title = self.name
imgs = get_imgs(self.url, self.name, cw=self.cw)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
class Image(object):
def __init__(self, id, url, p, lazy=True, img=None):
self.id = id
self.p = p
if lazy:
self.url = LazyUrl(url, self.get_single, self)
else:
self.url = LazyUrl(url, lambda _:img, self)
ext = get_ext(img)
self.filename = '{}_p{}{}'.format(id, p, ext)
def get_single(self, url): # single
img = get_imgs_post(self.id, url)[0].url()
ext = get_ext(img)
self.filename = '{}_p{}{}'.format(self.id, self.p, ext)
return img
@try_n(8, sleep=10)
def get_imgs_post(id, url):
#print('get_imgs_post', id, url)
html = downloader.read_html(url)
soup = Soup(html)
view = soup.find('div', id='gallery')
imgs = []
for img in view.findAll(class_='mozamoza'):
url_img = urljoin(url, img['src'])
url_img = re.sub('__rs_l[0-9]+x[0-9]+/', '', url_img)
img = Image(id, url, len(imgs), False, url_img)
imgs.append(img)
return imgs
def setPage(url, page):
# Always use HTTPS
url = url.replace('http://', 'https://')
# Change the page
if 'p=' in url:
url = re.sub('p=[0-9]*', 'p={}'.format(page), url)
else:
url += '&p={}'.format(page)
return url
def get_imgs(url, title=None, cw=None):
print_ = get_print(cw)
url = clean_url(url)
id = get_id(url)
url = u'https://nijie.info/members_illust.php?id={}'.format(id)
# Range
max_pid = get_max_range(cw)
imgs = []
url_imgs = set()
for p in range(1, 1+100):
url = setPage(url, p)
print_(url)
html = downloader.read_html(url)
soup = Soup(html)
posts = soup.findAll('div', class_='nijie')
if not posts:
print('no posts')
break
c = 0
for post in posts:
url_img = urljoin(url, post.a.attrs['href'])
if url_img in url_imgs:
print('duplicate:', url_img)
continue
url_imgs.add(url_img)
id = int(re.find('[?&]id=([0-9]+)', url_img))
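# Posts marked with a thumbnail-icon hold multiple images and are fetched immediately;
# single-image posts are resolved lazily via Image.get_single.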
multi = post.find('div', class_='thumbnail-icon')
if multi:
imgs_ = get_imgs_post(id, url_img)#
else:
imgs_ = [Image(id, url_img, 0)]
imgs += imgs_
c += 1
if len(imgs) >= max_pid:
break
msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
if cw:
if not cw.alive:
return
cw.setTitle(msg)
else:
print(msg)
if len(imgs) >= max_pid or c == 0:
break
return imgs

View File

@@ -0,0 +1,109 @@
import downloader
from utils import Session, Soup, LazyUrl, get_print, Downloader, get_ext, try_n, format_filename, clean_title
import ree as re
import json
from io import BytesIO
class EmbedUrlError(Exception): pass
@Downloader.register
class Downloader_pandoratv(Downloader):
type = 'pandoratv'
URLS = ['pandora.tv']
single = True
display_name = 'Pandora TV'
@classmethod
def fix_url(cls, url):
return url.split('#')[0]
def read(self):
video = Video(self.url, format, cw=self.cw)
try:
video.url()#
except EmbedUrlError as e:
return self.Invalid(e.args[0])
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
def extract(name, html, cw=None):
print_ = get_print(cw)
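# Player variables are embedded in the page as either quoted scalars or JSON arrays;
# try the scalar pattern first, then the array form.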
value = re.find(r'''{} *= *['"](.*?)['"]'''.format(name), html)
if value is None:
value = json.loads(re.find(r'''{} *= *(\[.*?\])'''.format(name), html))
print_('{}: {}'.format(name, value))
if value is None:
raise Exception('No {}'.format(name))
return value
class Video(object):
_url_video = None
def __init__(self, url, format='title', cw=None):
self.url = LazyUrl(url, self.get, self)
self.format = format
self.cw = cw
@try_n(2)
def get(self, url):
if self._url_video:
return self._url_video
cw = self.cw
print_ = get_print(cw)
html = downloader.read_html(url)
soup = Soup(html)
embedUrl = extract('embedUrl', html, cw)
if embedUrl:
raise EmbedUrlError('[pandoratv] EmbedUrl: {}'.format(embedUrl))
uid = extract('strLocalChUserId', html, cw)
pid = extract('nLocalPrgId', html, cw)
fid = extract('strFid', html, cw)
resolType = extract('strResolType', html, cw)
resolArr = extract('strResolArr', html, cw)
vodSvr = extract('nVodSvr', html, cw)
resols = extract('nInfo', html, cw)
runtime = extract('runtime', html, cw)
url_api = 'http://www.pandora.tv/external/getExternalApi/getVodUrl/'
data = {
'userId': uid,
'prgId': pid,
'fid': fid,
'resolType': resolType,
'resolArr': ','.join(map(str, resolArr)),
'vodSvr': vodSvr,
'resol': max(resols),
'runtime': runtime,
'tvbox': 'false',
'defResol': 'true',
'embed': 'false',
}
session = Session()
r = session.post(url_api, headers={'Referer': url}, data=data)
data = json.loads(r.text)
self._url_video = data['src']
self.title = soup.find('meta', {'property': 'og:description'})['content']
ext = get_ext(self._url_video)
self.filename = format_filename(self.title, pid, ext)
self.url_thumb = soup.find('meta', {'property': 'og:image'})['content']
self.thumb = BytesIO()
downloader.download(self.url_thumb, buffer=self.thumb)
return self._url_video

View File

@@ -0,0 +1,216 @@
# uncompyle6 version 3.5.0
# Python bytecode 2.7 (62211)
# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
# Embedded file name: pinter_downloader.pyo
# Compiled at: 2019-10-21 07:44:55
import downloader
from utils import Session, Downloader, LazyUrl, clean_url, try_n, Soup, clean_title
import json, os, ree as re
from timee import sleep
from translator import tr_
import urllib
import constants
from ratelimit import limits, sleep_and_retry
BASE_URL = 'https://www.pinterest.com'
def get_info(username, board, api):
if '/' in board:
section = (u'/').join(board.split('/')[1:])
board = board.split('/')[0]
info = api.board(username, board)
for s in api.board_sections(info['id']):
print(s['slug'].lower(), section)
if s['slug'].lower() == section.lower():
break
else:
raise Exception('Invalid section')
title = s['title']
info.update(s)
info['name'] = (u'{}/{}').format(info['name'], title)
print('section_id:', info['id'])
else:
info = api.board(username, board)
#info = board_info(username, board)
return info
def board_info(username, board):
url = u'https://www.pinterest.com/{}/{}/'.format(username, board)
html = downloader.read_html(url)
soup = Soup(html)
data = soup.find('script', id='initial-state').text
data = json.loads(data)['resourceResponses']
info = data[0]['response']['data']
return info
@Downloader.register
class Downloader_pinter(Downloader):
type = 'pinter'
URLS = ['pinterest.']
type_pinter = 'board'
display_name = 'Pinterest'
@try_n(4)
def init(self):
if 'pinterest.' not in self.url:
self.url = u'https://www.pinterest.com/{}'.format(self.url)
self.api = PinterestAPI()
username, board = get_username_board(self.url)
if '/' in board:
self.type_pinter = 'section'
self.print_(('type: {}').format(self.type_pinter))
self.info = get_info(username, board, self.api)
@property
def name(self):
username = self.info['owner']['username']
name = self.info['name']
return clean_title((u'{}/{}').format(username, name))
def read(self):
self.title = self.name
id = self.info['id']
imgs = get_imgs(id, self.api, cw=self.cw, title=self.name, type=self.type_pinter)
for img in imgs:
self.urls.append(img.url)
self.title = self.name
class PinterestAPI:
HEADERS = {'Accept': 'application/json, text/javascript, */*, q=0.01',
'Accept-Language': 'en-US,en;q=0.5',
'X-Pinterest-AppState': 'active',
'X-APP-VERSION': 'cb1c7f9',
'X-Requested-With': 'XMLHttpRequest',
'Origin': BASE_URL + '/'}
def __init__(self):
self.session = Session()
self.session.headers.update(self.HEADERS)
def pin(self, pin_id):
options = {'id': pin_id, 'field_set_key': 'detailed'}
return self._call('Pin', options)['resource_response']['data']
def pin_related(self, pin_id):
options = {'pin': pin_id, 'add_vase': True, 'pins_only': True}
return self._pagination('RelatedPinFeed', options)
def board(self, user, board):
options = {'slug': board, 'username': user, 'field_set_key': 'detailed'}
return self._call('Board', options)['resource_response']['data']
def board_pins(self, board_id):
options = {'board_id': board_id}
return self._pagination('BoardFeed', options)
def board_related(self, board_id):
options = {'board_id': board_id, 'add_vase': True}
return self._pagination('BoardRelatedPixieFeed', options)
def board_sections(self, board_id):
options = {'board_id': board_id}
return self._pagination('BoardSections', options)
def board_section_pins(self, section_id):
options = {'section_id': section_id}
return self._pagination('BoardSectionPins', options)
@try_n(4)
@sleep_and_retry
@limits(1, 4) # 1 call per 4 seconds (~900 calls per hour)
def _call(self, resource, options):
url = ('{}/resource/{}Resource/get/').format(BASE_URL, resource)
params = {'data': json.dumps({'options': options}), 'source_url': ''}
print('_call: {}, {}'.format(url, params))
r = self.session.get(url, params=params)
print(r)
s = r.text
status_code = r.status_code
try:
data = json.loads(s)
except ValueError:
data = {}
else:
if status_code < 400 and not r.history:
return data
if status_code == 404 or r.history:
raise Exception('Not Found')
raise Exception('API request failed: {}'.format(status_code))
def _pagination(self, resource, options):
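# Resource endpoints paginate with 'bookmarks' cursors; an empty list, '-end-',
# or a 'Y2JOb25lO...' token marks the final page.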
while True:
data = self._call(resource, options)
for x in data['resource_response']['data']:
yield x
try:
bookmarks = data['resource']['options']['bookmarks']
if not bookmarks or bookmarks[0] == '-end-' or bookmarks[0].startswith('Y2JOb25lO'):
return
options['bookmarks'] = bookmarks
except KeyError:
return
class Image(object):
def __init__(self, img):
self.id = img['id']
print(self.id)
self.url0 = img['images']['orig']['url']
def f(_):
return self.url0
self.url = LazyUrl(('{}/pin/{}/').format(BASE_URL, self.id), f, self)
ext = os.path.splitext(self.url0.split('?')[0].split('#')[0])[1]
self.filename = ('{}{}').format(self.id, ext)
def get_imgs(id, api, cw=None, title=None, type='board'):
imgs = []
ids = set()
print('get_imgs: type={}'.format(type))
if type == 'board':
gen = api.board_pins(id)
elif type == 'section':
gen = api.board_section_pins(id)
else:
raise Exception((u'Type "{}" is not supported').format(type))
for img in gen:
if 'images' not in img:
print('skip img:', img['id'])
continue
img = Image(img)
if img.id in ids:
print('duplicate:', img.id)
continue
ids.add(img.id)
print(img.url)
print(img.filename)
print()
imgs.append(img)
if cw is not None:
if not cw.alive:
return []
cw.setTitle((u'{} {} ({})').format(tr_(u'읽는 중...'), title, len(imgs)))
return imgs
def get_username_board(url):
url = clean_url(url)
m = re.search('pinterest.[a-zA-Z.]+?/([^/]+)/([^#\\?]+)', url)
username, board = m.groups()
board = urllib.parse.unquote(board).strip()
while board.endswith('/'):
board = board[:-1].strip()
return (username, board)

View File

@@ -14,6 +14,10 @@ except ImportError:
import constants
from datetime import datetime
import requests
from timee import sleep
from collections import deque
from locker import lock
import threading
FORCE_LOGIN = True
LIMIT = 48
for header in ['pixiv_illust', 'pixiv_bmk', 'pixiv_search', 'pixiv_following', 'pixiv_following_r18']:
@@ -27,6 +31,7 @@ class Downloader_pixiv(Downloader):
type = 'pixiv'
MAX_CORE = 16
keep_date = True
STEP = 8, 32
@classmethod
def fix_url(cls, url):
@@ -107,10 +112,10 @@ class PixivAPI():
def profile(self, id_):
return self.call('user/{}/profile/all?lang=en'.format(id_))
def bookmarks(self, id_, offset=0, limit=None):
def bookmarks(self, id_, offset=0, limit=None, rest='show'):
if limit is None:
limit = LIMIT
return self.call('user/{}/illusts/bookmarks?tag=&offset={}&limit={}&rest=show&lang=en'.format(id_, offset, limit))
return self.call('user/{}/illusts/bookmarks?tag=&offset={}&limit={}&rest={}&lang=en'.format(id_, offset, limit, rest))
def search(self, q, order='date_d', mode='all', p=1, s_mode='s_tag', type_='all'):
return self.call('search/artworks/{0}?word={0}&order={1}&mode={2}&p={3}&s_mode={4}&type={5}&lang=en'.format(quote(q), order, mode, p, s_mode, type_))
@@ -254,13 +259,17 @@ def get_info(url, cw=None, depth=0):
id_ = api.user_id(url)
if id_ is None: #
id_ = my_id()
if id_ == my_id():
rest = 'all'
else:
rest = 'show'
process_user(id_, info, api)
info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'], info['artist_id'])
ids = []
ids_set = set()
offset = 0
while len(ids) < max_pid:
data = api.bookmarks(id_, offset)
data = api.bookmarks(id_, offset, rest=rest)
c = 0
for id in [work['id'] for work in data['works']]:
if id in ids_set:
@@ -359,15 +368,54 @@ def process_user(id_, info, api):
def process_ids(ids, info, imgs, cw, depth=0):
print_ = get_print(cw)
max_pid = get_max_range(cw)
for i, id_illust in enumerate(ids):
try:
info_illust = get_info('https://www.pixiv.net/en/artworks/{}'.format(id_illust), cw, depth=depth+1)
except Exception as e:
if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました' or type(e) == errors.LoginRequired): # logout during extraction
raise e
print_('process_ids error ({}):\n{}'.format(depth, print_error(e)[0]))
continue
imgs += info_illust['imgs']
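# Parallel fetch: worker threads pop illust ids from a shared deque and store each
# result (or exception) in a per-chunk result list.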
class Thread(threading.Thread):
alive = True
rem = 0
def __init__(self, queue):
super().__init__(daemon=True)
self.queue = queue
@classmethod
@lock
def add_rem(cls, x):
cls.rem += x
def run(self):
while self.alive:
try:
id_, res, i = self.queue.popleft()
except Exception as e:
sleep(.1)
continue
try:
info_illust = get_info('https://www.pixiv.net/en/artworks/{}'.format(id_), cw, depth=depth+1)
res[i] = info_illust['imgs']
except Exception as e:
if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました' or type(e) == errors.LoginRequired): # logout during extraction
res[i] = e
print_('process_ids error ({}):\n{}'.format(depth, print_error(e)[0]))
finally:
Thread.add_rem(-1)
queue = deque()
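# STEP = (worker thread count, ids per chunk); each chunk is queued, drained via the
# shared rem counter, then merged into imgs in order.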
n, step = Downloader_pixiv.STEP
print_('{} / {}'.format(n, step))
ts = []
for i in range(n):
t = Thread(queue)
t.start()
ts.append(t)
for i in range(0, len(ids), step):
res = [[]]*step
for j, id_illust in enumerate(ids[i:i+step]):
queue.append((id_illust, res, j))
Thread.add_rem(1)
while Thread.rem:
sleep(.001, cw)
for imgs_ in res:
if isinstance(imgs_, Exception):
raise imgs_
imgs += imgs_
s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
if cw:
cw.setTitle(s)
@@ -377,3 +425,5 @@ def process_ids(ids, info, imgs, cw, depth=0):
break
if depth == 0:
check_alive(cw)
for t in ts:
t.alive = False

View File

@@ -0,0 +1,530 @@
#coding:utf8
'''
Pornhub Downloader
'''
from __future__ import division, print_function, unicode_literals
from io import BytesIO
import os
import js2py
import downloader
import ree as re
from utils import (Downloader, Soup, try_n, LazyUrl, urljoin, get_print,
Session, get_max_range, filter_range, get_ext,
lock, format_filename, clean_title, get_resolution)
import clf2
import utils
from m3u8_tools import playlist2stream, M3u8_stream
class File(object):
'''
File
'''
def __init__(self, id_, title, url, url_thumb):
self.id_ = id_
self.title = clean_title('{}'.format(title))
self.url = url
ext = get_ext(self.url)
if ext.lower() == '.m3u8':
try:
self.url = playlist2stream(self.url, n_thread=4)
except:
self.url = M3u8_stream(self.url, n_thread=4)
self.url_thumb = url_thumb
self.thumb = BytesIO()
downloader.download(self.url_thumb, buffer=self.thumb)
if ext.lower() == '.m3u8':
ext = '.mp4'
self.filename = format_filename(self.title, self.id_, ext)
print('filename:', self.filename)
class Video(object):
'''
Video
'''
_url = None
filename = None
thumb = None
def __init__(self, url, cw, session):
self.url = LazyUrl(url, self.get, self)
self.cw = cw
self.session = session
def get(self, url):
'''
get
'''
cw = self.cw
session = self.session
print_ = get_print(cw)
if self._url:
return self._url
id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
re.find(r'/embed/(\w+)', url, re.IGNORECASE)
print('id: {}'.format(id_))
if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))
html = downloader.read_html(url, session=session)
soup = Soup(html)
soup = fix_soup(soup, url, session, cw)
html = str(soup)
# removed
if soup.find('div', class_='removed'):
raise Exception('removed')
gif = soup.find('div', {'id': 'gifImageSection'})
if gif:
print_('GIF')
id_ = url.split('/gif/')[1]
id_ = re.findall('[0-9a-zA-Z]+', id_)[0]
jss = list(gif.children)
for js in jss:
if 'data-mp4' in getattr(js, 'attrs', {}):
break
else:
raise Exception('gif mp4 url not found')
title = js['data-gif-title']
url = js['data-mp4']
url_thumb = re.find(r'https?://.+?.phncdn.com/pics/gifs/.+?\.jpg', html, err='no thumb')
file = File('gif_{}'.format(id_), title, url, url_thumb)
else:
if id_ is None:
raise Exception('no id')
print_('Video')
j = decode(html, cw)
# 1968
#title = j['video_title']
title = soup.find('h1', class_='title').text.strip()
url_thumb = j['image_url']
videos = []
for video in j['mediaDefinitions']:
url_ = video.get('videoUrl').strip()
ext = get_ext(url_)
if ext.lower() not in ['.mp4', '.m3u8']:
print('not mp4: {}'.format(ext))
continue
quality = video.get('quality', 0)
if isinstance(quality, list):
quality = quality[0]
video['quality'] = int(quality)
print_('[{}p] {}'.format(quality, url_))
videos.append(video)
if not videos:
raise Exception('No videos')
videos = sorted(videos, key=lambda video: video['quality'])
res = get_resolution()
videos_good = [video for video in videos if video['quality'] <= res]
if videos_good:
video = videos_good[-1]
else:
video = videos[0]
print_('\n[{}p] {}'.format(video['quality'], video['videoUrl']))
file = File(id_, title, video['videoUrl'].strip(), url_thumb)
self._url = file.url
self.title = file.title
self.filename = file.filename
self.thumb = file.thumb
return self._url
def is_login(session, cw=None, n=2):
'''
is_login
'''
print_ = get_print(cw)
print_('is_login {}'.format(n))
if n <= 0:
return False
url = 'https://www.pornhubpremium.com'
soup = downloader.read_soup(url, session=session)
soup = fix_soup(soup, url, session, cw)
html = str(soup)
if soup.find('ul', id='profileMenuDropdown'):
return True
return is_login(session, cw, n-1)
@Downloader.register
class Downloader_pornhub(Downloader):
'''
Downloader
'''
type = 'pornhub'
single = True
strip_header = False
URLS = ['pornhub.com', 'pornhubpremium.com']
def init(self):
self.session = Session() # 1791
if 'pornhub_gif_' in self.url:
self.url = 'https://www.pornhub.com/gif/{}'.format(
self.url.replace('pornhub_gif_', ''))
elif 'pornhub_album_' in self.url:
self.url = 'https://www.pornhub.com/album/{}'.format(
self.url.replace('pornhub_album_', ''))
elif 'pornhub_' in self.url:
self.url = 'https://www.pornhub.com/view_video.php?viewkey={}'\
.format(self.url.replace('pornhub_', ''))
if 'pornhubpremium.com' in self.url.lower() and\
not is_login(self.session, self.cw):
return self.Invalid('[Pornhub] Login cookies required')
@classmethod
def key_id(cls, url):
for domain in cls.URLS:
if domain in url:
id_ = domain + url.split(domain)[1]
break
else:
raise Exception('no id')
return id_.split('#')[0]
def read(self):
cw = self.cw
session = self.session
videos = []
tab = ''.join(self.url.replace('pornhubpremium.com', 'pornhub.com', 1).split('?')[0].split('#')[0].split('pornhub.com/')[-1].split('/')[2:3])
if '/album/' in self.url:
self.print_('Album')
info = read_album(self.url, session=session)
self.single = False
for photo in info['photos']:
self.urls.append(photo.url)
self.title = clean_title(info['title'])
elif '/photo/' in self.url:
self.print_('Photo')
info = read_photo(self.url, session=session)
for photo in info['photos']:
self.urls.append(photo.url)
self.title = info['title']
elif tab not in ['', 'videos']:
raise NotImplementedError(tab)
elif 'viewkey=' not in self.url.lower() and\
'/embed/' not in self.url.lower() and\
'/gif/' not in self.url.lower():
self.print_('videos')
info = get_videos(self.url, cw)
hrefs = info['hrefs']
self.print_('videos: {}'.format(len(hrefs)))
if not hrefs:
raise Exception('no hrefs')
videos = [Video(href, cw, session) for href in hrefs]
video = self.process_playlist(info['title'], videos)
self.setIcon(video.thumb)
self.enableSegment()
else:
video = Video(self.url, cw, session)
video.url()
self.urls.append(video.url)
self.setIcon(video.thumb)
self.title = video.title
self.enableSegment()
def fix_soup(soup, url, session=None, cw=None):
'''
fix_soup
'''
print_ = get_print(cw)
if soup.find('div', class_='logo'):
return soup
print_('invalid soup: {}'.format(url))
res = clf2.solve(url, session=session, cw=cw)
return Soup(res['html'])
class Photo(object):
'''
Photo
'''
def __init__(self, id_, url, referer):
self.id_ = id_
self.url = LazyUrl(referer, lambda x: url, self)
ext = os.path.splitext(url.split('?')[0])[1]
self.filename = '{}{}'.format(id_, ext)
@try_n(8)
def read_album(url, session=None):
'''
read_album
'''
soup = downloader.read_soup(url, session=session)
id_album = re.find('/album/([0-9]+)', url, err='no album id')
url_json = 'https://www.pornhub.com/album/show_album_json?album={}'.format(id_album)
data = downloader.read_json(url_json, url, session=session)
block = soup.find('div', class_='photoAlbumListBlock')
href = block.a.attrs['href']
id_ = re.find('/photo/([0-9]+)', href, err='no photo id')
ids = [id_]
while True:
item = data[id_]
id_ = item['next']
if id_ in ids:
break
ids.append(id_)
photos = []
for id_ in ids:
item = data[id_]
img = item['img_large']
referer = 'https://www.pornhub.com/photo/{}'.format(id_)
photo = Photo(id_, img, referer)
photos.append(photo)
info = {}
title = clean_title(soup.find('h1', class_='photoAlbumTitleV2').text)
info['title'] = format_filename(title, 'album_{}'.format(id_album))
info['photos'] = photos
return info
@try_n(8)
def read_photo(url, session=None):
'''
read_photo
'''
id_ = re.find('/photo/([0-9]+)', url, err='no photo id')
soup = downloader.read_soup(url, session=session)
div = soup.find('div', id='thumbSlider')
href = urljoin(url, div.find('a').attrs['href'])
info = read_album(href)
photos = []
for photo in info['photos']:
if str(photo.id_) == id_:
photos.append(photo)
info['photos'] = photos
info['title'] = '{} - {}'.format(info['title'], photos[0].filename)
return info
@try_n(4)
def get_videos(url, cw=None):
'''
get_videos
'''
print_ = get_print(cw)
if '/users/' in url:
mode = 'users'
username = url.split('/users/')[1].split('/')[0]
elif '/pornstar/' in url:
mode = 'pornstar'
username = url.split('/pornstar/')[1].split('/')[0]
elif '/model/' in url:
mode = 'model'
username = url.split('/model/')[1].split('/')[0]
elif '/channels/' in url:
mode = 'channels'
username = url.split('/channels/')[1].split('/')[0]
elif '/playlist/' in url:
mode = 'playlist'
username = url.split('/playlist/')[1].split('/')[0]
else:
raise Exception('Not supported url')
username = username.split('?')[0].split('#')[0]
session = Session()
if mode in ['pornstar']:
url_main = 'https://www.pornhub.com/{}/{}'.format(mode, username)
html = downloader.read_html(url_main, session=session)
soup = Soup(html)
soup = fix_soup(soup, url_main, session, cw)
for a in soup.findAll('a'):
if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
free = True
break
else:
free = False
print_('free: {}'.format(free))
# Range
max_pid = get_max_range(cw, 500)
max_pid = min(max_pid, 2000)#
html = downloader.read_html(url, session=session)
soup = fix_soup(Soup(html), url, session, cw)
info = {}
# get title
h1 = soup.find('h1')
if h1:
header = 'Playlist'
title = h1.find(id='watchPlaylist')
else:
title = None
if not title:
header = 'Channel'
profile = soup.find('div', class_='profileUserName')
wrapper = soup.find('div', class_='titleWrapper')
bio = soup.find('div', class_='withBio')
title = soup.find('h1', {'itemprop':'name'})
if not title and profile:
title = profile.a
if not title and wrapper:
title = wrapper.h1
if not title and bio:
title = bio.h1
if not title:
raise Exception('No title')
#print(title)
info['title'] = '[{}] {}'.format(header, title.text.strip())
token = re.find('''token *= *['"](.*?)['"]''', html)
print_('token: {}'.format(token))
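# The page token is only used for playlist pagination (playlist/viewChunked);
# the other modes page through their own ajax endpoints below.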
# get links
hrefs = []
fail = 0
for p in range(1, 1+100):
try:
if mode in ['users', 'model']:
if mode == 'users':
url_api = 'https://www.pornhub.com/users/{}/videos/public/'\
'ajax?o=mr&page={}'.format(username, p)
elif mode == 'model':
url_api = 'https://www.pornhub.com/model/{}/videos/upload/'\
'ajax?o=mr&page={}'.format(username, p)
r = session.post(url_api)
soup = Soup(r.text)
if soup.find('h1'):
print('break: h1')
break
elif mode in ['pornstar']:
if free:
url_api = 'https://www.pornhub.com/{}/{}/videos/upload'\
'?page={}'.format(mode, username, p)
soup = downloader.read_soup(url_api, session=session)
soup = fix_soup(soup, url_api, session, cw)
soup = soup.find('div', class_='videoUList')
else:
url_api = 'https://www.pornhub.com/{}/{}?page={}'.format(mode, username, p)
soup = downloader.read_soup(url_api, session=session)
soup = fix_soup(soup, url_api, session, cw)
soup = soup.find('ul', class_='pornstarsVideos')
elif mode in ['channels']:
url_api = 'https://www.pornhub.com/{}/{}/videos?page={}'.format(mode, username, p)
soup = downloader.read_soup(url_api, session=session)
soup = fix_soup(soup, url_api, session, cw)
try:
soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
except:
break
elif mode in ['playlist']:
#url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(username, len(hrefs))
if token is None:
raise Exception('no token')
url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&token={}&page={}'.format(username, token, p)
soup = downloader.read_soup(url_api, session=session)
else:
raise NotImplementedError(mode)
fail = 0
except Exception as e:
print_(e)
fail += 1
if fail < 2:
continue
else:
break
finally:
print_('{} ({})'.format(url_api, len(hrefs)))
if cw and not cw.alive:
return
lis = soup.findAll('li', class_='videoblock')
if not lis:
print_('break: no lis')
break
if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
print_('Page Not Found')
break
c = 0
for li in lis:
a = li.find('a')
href = a.attrs['href']
href = urljoin(url, href)
if href in hrefs:
continue
c += 1
if href.startswith('javascript:'): # Remove Pornhub Premium
print(href)
continue
hrefs.append(href)
if c == 0:
print('c==0')
break
print(c) # 1320
if len(hrefs) >= max_pid:
break
if cw:
hrefs = filter_range(hrefs, cw.range)
info['hrefs'] = hrefs
return info
@lock
def decode(html, cw=None):
'''
decode
'''
print_ = get_print(cw)
print_('decode')
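# Find the inline player script that defines videoUrl, then evaluate its flashvars object
# with js2py to recover the mediaDefinitions JSON.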
soup = Soup(html)
for script in soup.findAll('script'):
script = script.text or script.string or ''
script = script.strip()
if 'videoUrl' in script:
break
else:
raise Exception('No script')
flashvars = script.split()[1]
script = 'playerObjList={};' + script
context = js2py.EvalJs()
context.execute(script)
return context.eval(flashvars).to_dict()

View File

@@ -0,0 +1,133 @@
import downloader
import ree as re
import os
from utils import Downloader, urljoin, query_url, Soup, get_max_range, get_print, clean_title
from translator import tr_
try:
from urllib import quote # python2
except:
from urllib.parse import quote # python3
import sys
from timee import sleep
from constants import clean_url
LIMIT = 100
def get_tags(url):
url = clean_url(url)
qs = query_url(url)
if 'page=favorites' in url:
id = qs.get('id', ['N/A'])[0]
id = u'fav_{}'.format(id)
else:
tags = qs.get('tags', [])
tags.sort()
id = u' '.join(tags)
if not id:
id = u'N/A'
return id
@Downloader.register
class Downloader_rule34_xxx(Downloader):
type = 'rule34_xxx'
URLS = ['rule34.xxx']
MAX_CORE = 8
display_name = 'Rule34.xxx'
_name = None
def init(self):
if 'rule34.xxx' in self.url.lower():
self.url = self.url.replace('http://', 'https://')
else:
url = self.url
url = url.replace(' ', '+')
while '++' in url:
url = url.replace('++', '+')
url = quote(url)
url = url.replace('%2B', '+')
self.url = u'https://rule34.xxx/index.php?page=post&s=list&tags={}'.format(url)
@property
def name(self):
if self._name is None:
tags = get_tags(self.url)
self._name = tags
return clean_title(self._name)
def read(self):
self.title = self.name
imgs = get_imgs(self.url, self.name, cw=self.cw)
for img in imgs:
self.urls.append(img.url)
self.filenames[img.url] = img.filename
self.title = self.name
class Image(object):
def __init__(self, id_, url):
self.url = url
ext = os.path.splitext(url)[1]
self.filename = u'{}{}'.format(id_, ext)
def setPage(url, page):
# Always use HTTPS
url = url.replace('http://', 'https://')
# Change the page
if 'pid=' in url:
url = re.sub('pid=[0-9]*', 'pid={}'.format(page), url)
else:
url += '&pid={}'.format(page)
return url
def get_imgs(url, title=None, cw=None):
url = clean_url(url)
if 's=view' in url and 'page=favorites' not in url:
raise NotImplementedError('Not Implemented')
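# Normal listing URLs are converted to the dapi XML API (page=dapi&s=post), which returns
# <post> elements carrying id and file_url attributes.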
if 'page=dapi' not in url.lower():
tags = get_tags(url)
tags = quote(tags, safe='/')
tags = tags.replace('%20', '+')
url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT)
print_ = get_print(cw)
# Range
max_pid = get_max_range(cw)
imgs = []
ids = set()
for p in range(500): #1017
url = setPage(url, p)
print_(url)
html = downloader.read_html(url)
soup = Soup(html)
posts = soup.findAll('post')
if not posts:
break
for post in posts:
id_ = post.attrs['id']
if id_ in ids:
print('duplicate:', id_)
continue
ids.add(id_)
url_img = post.attrs['file_url']
img = Image(id_, url_img)
imgs.append(img)
if len(imgs) >= max_pid:
break
if cw is not None:
if not cw.alive:
break
cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
return imgs

View File

@@ -0,0 +1,180 @@
#coding: utf8
import downloader
import json
from io import BytesIO
from utils import Downloader, LazyUrl, get_print, try_n, lock, clean_title
from error_printer import print_error
import os
from timee import sleep
import ffmpeg
import ytdl
from m3u8_tools import M3u8_stream
CLIENT_ID = None
@lock
def get_cid(force=False):
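# Reuse youtube-dl's SoundcloudIE internals to fetch a fresh client_id when none is cached
# (or when force is set).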
global CLIENT_ID
if CLIENT_ID is None or force:
print('update cid...')
d = ytdl.YoutubeDL()
e = ytdl.extractor.soundcloud.SoundcloudIE(d)
e._update_client_id()
CLIENT_ID = e._CLIENT_ID
return CLIENT_ID
class Audio(object):
_url = None
def __init__(self, info, album_art, cw=None):
self.info = info
self.album_art = album_art
self.cw = cw
self.url = LazyUrl(info['webpage_url'], self.get, self, pp=self.pp)
def get(self, url):
print_ = get_print(self.cw)
if self._url:
return self._url
info = self.info
## ydl = ytdl.YoutubeDL()
## info = ydl.extract_info(url)
formats = info['formats']
print(formats)
formats = sorted(formats, key=lambda x: int(x.get('abr', 0)), reverse=True)
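# Prefer the highest-bitrate progressive HTTP(S) format; otherwise fall back to an HLS download
# via M3u8_stream and skip album-art embedding in that case.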
url_audio = None
for format in formats:
protocol = format['protocol']
print_(u'【{}】 format【{}】 abr【{}】'.format(protocol, format['format'], format.get('abr', 0)))
if not url_audio and protocol in ['http', 'https']:
url_audio = format['url']
if not url_audio:
url_audio = M3u8_stream(formats[0]['url'])
self.album_art = False#
self.username = info['uploader']
self.title = u'{} - {}'.format(self.username, info['title'])
self.filename = u'{}{}'.format(clean_title(self.title, allow_dot=True, n=-4), '.mp3')
thumb = None
for t in info['thumbnails'][::-1]:
width = t.get('width', 1080)
if not 100 <= width <= 500:
continue
url_thumb = t['url']
thumb = BytesIO()
try:
downloader.download(url_thumb, buffer=thumb)
break
except Exception as e:
print(e)
thumb = None
self.thumb = thumb
self._url = url_audio
return self._url
def pp(self, filename):
cw = self.cw
with cw.convert(self):
return self._pp(filename)
def _pp(self, filename):
if self.thumb and self.album_art:
self.thumb.seek(0)#
ffmpeg.add_cover(filename, self.thumb, {'artist':self.username, 'title':self.info['title']}, cw=self.cw)
@Downloader.register
class Downloader_soundcloud(Downloader):
type = 'soundcloud'
single = True
URLS = ['soundcloud.com']
#lock = True
audio = None
display_name = 'SoundCloud'
def init(self):
if 'soundcloud.com' in self.url.lower():
self.url = self.url.replace('http://', 'https://')
else:
self.url = 'https://soundcloud.com/{}'.format(self.url)
def read(self):
album_art = self.ui_setting.albumArt.isChecked()
info = get_audios(self.url, self.cw, album_art)
audios = info['audios']
if not audios:
raise Exception('no audios')
# first audio must be valid
while audios:
audio = audios[0]
try:
audio.url()
break
except Exception as e:
e_ = e
print(e)
audios.remove(audio)
else:
raise e_
if len(audios) > 1:
audio = self.process_playlist(info['title'], audios)
else:
self.urls.append(audio.url)
self.title = audio.title
self.artist = audio.username
self.setIcon(audio.thumb)
@try_n(2)
def get_audios(url, cw, album_art):
print_ = get_print(cw)
url = url.rstrip('/')
if url.count('/') == 3:
url += '/tracks'
info = {
#'extract_flat': True,
}
ydl = ytdl.YoutubeDL()
info = ydl.extract_info(url)
if 'entries' in info:
entries = info['entries']
title = info['title']
for _type in ['All', 'Tracks', 'Albums', 'Sets', 'Reposts', 'Likes', 'Spotlight']:
x = '({})'.format(_type)
if x in title:
title = title.replace(x, '')
kind = _type
break
else:
kind = 'Playlist'
print_(u'kind: {}'.format(kind))
info['title'] = u'[{}] {}'.format(kind.capitalize(), title)
else:
entries = [info]
audios = []
for e in entries:
if '/sets/' in e['webpage_url']:
continue
audio = Audio(e, album_art, cw=cw)
audios.append(audio)
info['audios'] = audios
return info

View File

@ -0,0 +1,250 @@
from __future__ import division, print_function, unicode_literals
import downloader
import ree as re
from utils import urljoin, Soup, LazyUrl, Downloader, try_n, compatstr, get_print, clean_title, Session, get_max_range
import os
import json
import ast
from io import BytesIO
import random
import clf2
from translator import tr_
from timee import sleep
from error_printer import print_error
import devtools
HDR = {'User-Agent': downloader.hdr['User-Agent']}
PATTERN_VID = '/(v|video)/(?P<id>[0-9]+)'
def is_captcha(soup):
return soup.find('div', class_="verify-wrap") is not None
@Downloader.register
class Downloader_tiktok(Downloader):
type = 'tiktok'
single = True
URLS = ['tiktok.com']
display_name = 'TikTok'
def init(self):
cw = self.cw
self.session = Session()
res = clf2.solve(self.url, self.session, cw)
soup = Soup(res['html'])
if is_captcha(soup):
def f(html):
return not is_captcha(Soup(html))
clf2.solve(self.url, self.session, cw, show=True, f=f)
@classmethod
def fix_url(cls, url):
url = url.split('?')[0].split('#')[0].strip('/')
if 'tiktok.com' not in url.lower():
url = 'https://www.tiktok.com/@{}'.format(url)
return url
def read(self):
format = compatstr(self.ui_setting.youtubeFormat.currentText()).lower().strip()
if re.search(PATTERN_VID, self.url) is None:
info = read_channel(self.url, self.session, self.cw)
items = info['items']
videos = [Video('https://www.tiktok.com/@{}/video/{}'.format(info['uid'], item['id']), self.session, format) for item in items]
title = '{} (tiktok_{})'.format(info['nickname'], info['uid'])
video = self.process_playlist(title, videos)
else:
video = Video(self.url, self.session, format)
video.url()
self.urls.append(video.url)
self.title = clean_title(video.title)
self.setIcon(video.thumb)
class Video(object):
_url = None
def __init__(self, url, session, format='title (id)'):
self.url = LazyUrl(url, self.get, self)
self.session = session
self.format = format
@try_n(2)
def get(self, url):
if self._url:
return self._url
m = re.search(PATTERN_VID, url)
id = m.group('id')
ext = '.mp4'
self.title = id#
self.filename = '{}{}'.format(clean_title(self.title, n=-len(ext)), ext)
html = downloader.read_html(url, session=self.session)
soup = Soup(html)
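# The video metadata is embedded in the page's __NEXT_DATA__ JSON blob.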
data = soup.find(id='__NEXT_DATA__')
props = data.contents[0]
data_encode = json.dumps(props)
ast_le = ast.literal_eval(data_encode)
data = json.loads(ast_le)
#info = data['props']['pageProps']['videoData']['itemInfos']
info = data['props']['pageProps']['itemInfo']['itemStruct']
self._url = info['video']['downloadAddr']
self.url_thumb = info['video']['cover']
self.thumb = BytesIO()
downloader.download(self.url_thumb, referer=url, buffer=self.thumb)
return self._url
def read_channel(url, session, cw=None):
print_ = get_print(cw)
info = {}
info['items'] = []
ids = set()
sd = {
'count_empty': 0,
'shown': False,
}
max_pid = get_max_range(cw)
def f(html, browser=None):
soup = Soup(html)
if is_captcha(soup):
print('captcha')
browser.show()
sd['shown'] = True
elif sd['shown']:
browser.hide()
sd['shown'] = False
try:
info['uid'] = soup.find('h2', class_='share-title').text.strip()
info['nickname'] = soup.find('h1', class_='share-sub-title').text.strip()
except Exception as e:
print_(print_error(e)[0])
c = 0
ids_now = set()
for div in soup.findAll('div', class_='video-feed-item'):
a = div.find('a')
if a is None:
continue
href = a['href']
if not href:
continue
m = re.search(PATTERN_VID, href)
if m is None:
continue
id_video = int(m.group('id'))
ids_now.add(id_video)
if id_video in ids:
continue
ids.add(id_video)
info['items'].append({'id': id_video})
c += 1
print_('items: {}'.format(len(info['items'])))
if len(info['items']) >= max_pid:
info['items'] = info['items'][:max_pid]
return True
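# Scroll to the bottom so the page lazy-loads the next batch of videos, then wait for them to render.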
browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
sleep(15, cw)
if c or (ids_now and min(ids_now) > min(ids)):
sd['count_empty'] = 0
else:
print_('empty')
sd['count_empty'] += 1
msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items']))
if cw:
if not cw.alive:
raise Exception('cw dead')
cw.setTitle(msg)
else:
print(msg)
return sd['count_empty'] > 4
res = clf2.solve(url, session, cw, f=f, timeout=1800, show=True)
if not info['items']:
raise Exception('no items')
return info
@try_n(2)
def read_channel_legacy(url, session, cw=None):
print_ = get_print(cw)
html = downloader.read_html(url, session=session, headers=HDR)
uid = re.find('//user/profile/([0-9]+)', html, err='no uid')
secUid = re.find('"secUid" *: *"([^"]+?)"', html, err='no secUid')
verifyFp = ''.join(random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for i in range(16))
maxCursor = 0
info = {}
info['items'] = []
ids = set()
for i in range(100):
url_api = 'https://t.tiktok.com/api/item_list/?count=30&id={uid}&type=1&secUid={secUid}&maxCursor={maxCursor}&minCursor=0&sourceType=8&appId=1180&region=US&language=en&verifyFp={verifyFp}'.format(uid=uid, secUid=secUid, verifyFp=verifyFp, maxCursor=maxCursor)
js = 'window.byted_acrawler.sign({url:"{}"});'.replace('{}', url_api)
print(js)
for try_ in range(4):
try:
sign = devtools.eval_js(url, js, session)['output']
break
except Exception as e:
print(e)
e_ = e
else:
raise e_
url_api += '&_signature=' + sign
print_(url_api)
data_raw = downloader.read_html(url_api, url, session=session, headers=HDR)
data = json.loads(data_raw)
items = []
for item in data.get('items', []):
id_video = item['id']
if id_video in ids:
print('duplicate:', id_video)
continue
ids.add(id_video)
items.append(item)
if not items:
print('no items')
break
info['items'] += items
if i == 0:
info['uid'] = items[0]['author']['uniqueId']
info['nickname'] = items[0]['author']['nickname']
msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info['nickname'], info['uid'], len(info['items']))
if cw:
if not cw.alive:
break
cw.setTitle(msg)
else:
print(msg)
if not data['hasMore']:
break
maxCursor = data['maxCursor']
if not info['items']:
raise Exception('no items')
return info

View File

@ -0,0 +1,100 @@
#coding:utf8
import downloader
from utils import Soup, urljoin, Downloader, cut_pair, LazyUrl, clean_title
from timee import sleep
from translator import tr_
from io import BytesIO
import ree as re
import os
@Downloader.register
class Downloader_tokyomotion(Downloader):
type = 'tokyomotion'
URLS = ['tokyomotion.net']
single = True
_type = None
display_name = 'TOKYO Motion'
def init(self):
html = downloader.read_html(self.url)
self.soup = Soup(html)
if '/album/' in self.url:
self._type = 'album'
else:
self._type = 'video'
@property
def name(self):
title = get_title(self.soup)
return clean_title(title)
def read(self):
if self._type == 'video':
video = get_video(self.url, self.soup)
self.urls.append(video.url)
self.setIcon(video.thumb)
elif self._type == 'album':
imgs = get_imgs(self.url)
for img in imgs:
self.urls.append(img.url)
self.single = False
else:
raise NotImplementedError('Unknown type: {}'.format(self._type))
self.title = self.name
class Video(object):
def __init__(self, url, url_thumb, referer, filename):
self.url = LazyUrl(referer, lambda x: url, self)
self.url_thumb = url_thumb
self.thumb = BytesIO()
downloader.download(url_thumb, referer=referer, buffer=self.thumb)
self.filename = filename
def get_title(soup):
video = soup.find('video', id='vjsplayer')
if video:
title = soup.find('h3').text.strip()
else:
title = soup.find('title').text.split(' Album - ')[0].strip()
return title
def get_video(url, soup=None):
if soup is None:
html = downloader.read_html(url)
soup = Soup(html)
video = soup.find('video', id='vjsplayer').find('source').attrs['src']
url_thumb = soup.find('video', id='vjsplayer').attrs['poster']
title = get_title(soup)
filename = u'{}.mp4'.format(clean_title(title))
video = Video(video, url_thumb, url, filename)
return video
class Image(object):
def __init__(self, url, referer):
self.url = LazyUrl(referer, lambda x: url, self)
self.filename = os.path.basename(url.split('?')[0])
def get_imgs(url):
id = re.find('album/.*?([0-9]+)', url)
print('id:', id)
url = 'https://www.tokyomotion.net/album/slideshow/{}'.format(id)
html = downloader.read_html(url)
soup = Soup(html)
imgs = []
for a in soup.findAll('a', {'data-lightbox': 'slideshow-{}'.format(id)}):
img = a.find('img').attrs['src']
img = img.replace('/tmb/', '/')
img = Image(img, url)
imgs.append(img)
return imgs

View File

@ -1,4 +1,4 @@
from utils import Downloader, speed_text, clean_title
from utils import Downloader, clean_title
import constants, os, downloader
from size import Size
try:
@ -54,9 +54,10 @@ class Downloader_torrent(Downloader):
if not files:
raise Exception('No files')
cw.single = self.single = len(files) == 1
for file in files:
filename = os.path.join(self.dir, file)
cw.imgs.append(filename)
if not cw.imgs:
for file in files:
filename = os.path.join(self.dir, file)
cw.imgs.append(filename)
def start_(self):
cw = self.cw
@ -81,8 +82,11 @@ class Downloader_torrent(Downloader):
if cw.alive:
cw.setSpeed('')
if cw.pause_lock and cw.pbar.value() < cw.pbar.maximum():
cw.pause_data = {'type': self.type, 'url': self.url,
'filesize': self._filesize_prev}
cw.pause_data = {
'type': self.type,
'url': self.url,
'filesize': self._filesize_prev,
}
cw.paused = True
cw.pause_lock = False
self.update_tools_buttons()
@ -110,8 +114,8 @@ class Downloader_torrent(Downloader):
cw.dones.add(file)
file = constants.compact(file).replace('\\', '/')
files = file.split('/')
file = (u' / ').join(files[1:])
msg = (u'Completed: {}').format(file)
file = ' / '.join(files[1:])
msg = 'Completed: {}'.format(file)
self.print_(msg)
if i == 0:
for try_ in range(4):
@ -126,20 +130,20 @@ class Downloader_torrent(Downloader):
downloader.total_download_size += d_size
cw.pbar.setValue(s.progress * MAX_PBAR)
if s.state_str == 'queued':
title_ = (u'Waiting... {}').format(title)
title_ = 'Waiting... {}'.format(title)
elif s.state_str == 'checking files':
title_ = (u'Checking files... {}').format(title)
title_ = 'Checking files... {}'.format(title)
self._filesize_prev = filesize
elif s.state_str == 'downloading':
title_ = (u'{} (p: {}, s: {})').format(title, s.num_peers, s.num_seeds)
title_ = '{} (p: {}, s: {})'.format(title, s.num_peers, s.num_seeds)
cw.setFileSize(filesize)
text = self.size.speed_text()
cw.setSpeed(text)
elif s.state_str == 'seeding':
title_ = (u'{}').format(title)
title_ = '{}'.format(title)
cw.setFileSize(filesize)
else:
title_ = (u'{}... {}').format(s.state_str.capitalize(), title)
title_ = '{}... {}'.format(s.state_str.capitalize(), title)
cw.setTitle(title_, update_filter=False)
else:
return 'abort'

View File

@ -0,0 +1,204 @@
#coding:utf8
import downloader
from translator import tr_
from utils import Soup, Session, query_url, get_max_range, Downloader, clean_title, update_url_query, get_print, get_ext, LazyUrl
import ree as re
import errors
from ratelimit import limits, sleep_and_retry
from error_printer import print_error
class Image(object):
def __init__(self, url, id, p=0, cw=None):
self._url = url
self.id_ = id
self.p = p
self.cw = cw
self.url = LazyUrl(url, self.get, self)
def get(self, _):
print_ = get_print(self.cw)
url = self._url
ext = get_ext(url)
if ext.lower() == '.gif':
print_('get_ext: {}, {}'.format(self.id_, url))
try:
ext = downloader.get_ext(url)
except Exception as e: #3235
print_('Err: {}, {}\n'.format(self.id_, url)+print_error(e)[0])
self.filename = '{}_p{}{}'.format(self.id_, self.p, ext)
return url
@Downloader.register
class Downloader_tumblr(Downloader):
type = 'tumblr'
URLS = ['tumblr.com']
def init(self):
if u'tumblr.com/post/' in self.url:
return self.Invalid(tr_(u'개별 다운로드는 지원하지 않습니다: {}').format(self.url))
self.session = Session()
@classmethod
def fix_url(cls, url):
id = get_id(url)
return 'https://{}.tumblr.com'.format(id)
def read(self):
username = get_id(self.url)
name = get_name(username, self.session)
for img in get_imgs(username, self.session, cw=self.cw):
self.urls.append(img.url)
self.title = clean_title('{} (tumblr_{})'.format(name, username))
class TumblrAPI(object):
_url_base = 'https://www.tumblr.com/api'
_hdr = {
'referer': 'https://www.tumblr.com',
'authorization': 'Bearer aIcXSOoTtqrzR8L8YEIOmBeW94c3FmbSNSWAUbxsny9KKx5VFh',
}
_qs = {
'fields[blogs]': 'name,avatar,title,url,is_adult,?is_member,description_npf,uuid,can_be_followed,?followed,?advertiser_name,is_paywall_on,theme,subscription_plan,?primary,share_likes,share_following,can_subscribe,subscribed,ask,?can_submit,?is_blocked_from_primary,?tweet,?admin,can_message,?analytics_url,?top_tags,paywall_access',
'npf': 'true',
'reblog_info': 'false',
'include_pinned_posts': 'false',
#'page_number': None,
}
def __init__(self, session, cw=None):
self.session = session
self.cw = cw
def print_(self, s):
get_print(self.cw)(s)
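# Throttle API requests to at most one call per second.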
@sleep_and_retry
@limits(1, 1)
def call(self, path, qs, default_qs=True):
if default_qs:
qs_new = qs
qs = self._qs.copy()
qs.update(qs_new)
url = self._url_base + path
url = update_url_query(url, qs)
r = self.session.get(url, headers=self._hdr)
data = r.json()
errs = data.get('errors', [])
if errs:
code = int(errs[0]['code'])
if code == 0:
raise Exception('Not found')
elif code == 4012:
raise errors.LoginRequired(errs[0]['detail'])
r.raise_for_status()
return data['response']
def name(self, username):
path = '/v2/blog/{}/posts'.format(username)
data = self.call(path, {})
return data['blog']['title'] or data['blog']['name']
def posts(self, username):
path = '/v2/blog/{}/posts'.format(username)
qs = {}
ids = set()
default_qs = True
while True:
if self.cw and not self.cw.alive:
break
data = self.call(path, qs, default_qs=default_qs)
for post in data['posts']:
id_ = post['id']
if id_ in ids:
self.print_('duplicate: {}'.format(id_))
continue
ids.add(id_)
yield Post(post, self.cw)
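# Follow the cursor in the API's "next" link; stop once no further page is offered.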
try:
links = data.get('links') or data['_links']
path_next = links['next']['href']
except:
path_next = None
if path_next:
path = path_next
default_qs = False
else:
break
class Post(object):
def __init__(self, data, cw=None):
id_ = data['id']
self.imgs = []
cs = data['content']
for trail in data['trail']:
cs += trail['content']
for c in cs:
if c['type'] in ['image', 'video']:
media = c.get('media')
if not media: #2859
continue
if isinstance(media, list):
media = media[0]
img = media['url']
self.imgs.append(Image(img, id_, len(self.imgs), cw))
elif c['type'] in ['text', 'link', 'audio']:
continue
else:
raise NotImplementedError(id_, c)
def get_name(username, session):
return TumblrAPI(session).name(username)
def get_imgs(username, session, cw=None):
print_ = get_print(cw)
artist = get_name(username, session)
imgs = []
error_count = 0
max_pid = get_max_range(cw)
api = TumblrAPI(session, cw)
for post in api.posts(username):
imgs += post.imgs
s = '{} {} (tumblr_{}) - {}'.format(tr_(u'읽는 중...'), artist, username, len(imgs))
if cw:
if not cw.alive:
return
cw.setTitle(s)
else:
print(s)
if len(imgs) > max_pid:
break
return imgs[:max_pid]
def get_id(url):
if '/dashboard/blog/' in url:
url = re.find('/dashboard/blog/([0-9a-zA-Z_-]+)', url)
if '/login_required/' in url:
url = url.split('/login_required/')[1].split('?')[0].split('/')[0]
if 'tumblr.com/blog/view/' in url:
url = url.split('tumblr.com/blog/view/')[1]
if 'tumblr.com' in url:
if 'www.tumblr.com' in url:
qs = query_url(url)
url = qs.get('url', [url])[0]
url = url.split('.tumblr.com')[0].split('/')[-1]
if url == 'www':
raise Exception('no id')
return url

View File

@ -275,6 +275,7 @@ class TwitterAPI(object):
return
params["cursor"] = cursor
if params.get("cursor") is None: # nothing
print_('no cursor')
break
@ -328,7 +329,8 @@ def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage',
names[id_].append(name)
else:
names[id_] = [name]
max_id = max(ids) if ids else 0
ids_sure = sorted(ids)[:-100]
max_id = max(ids_sure) if ids_sure else 0 #3201
# 2303
imgs_old = []
@ -341,23 +343,23 @@ def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage',
imgs_new = []
enough = False
c_old = 0
for tweet in TwitterAPI(session, cw).timeline_media(username):
id_ = int(tweet['id_str'])
if id_ < max_id:
print_('enough')
enough = True
break
imgs_ = get_imgs_from_tweet(tweet, session, types, format, cw)
if id_ in ids:
print_('duplicate: {}'.format(id_))
c_old += 1
continue
ids.add(id_)
imgs_new += imgs_
if len(imgs_old) + len(imgs_new) >= n:
imgs_new += get_imgs_from_tweet(tweet, session, types, format, cw)
if len(imgs_new) + c_old >= n: #3201
break
msg = '{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs_new))
@ -368,7 +370,7 @@ def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage',
else:
print(msg)
if not enough and not imgs_new:
if not enough and not imgs_new and c_old == 0:
raise Exception('no imgs')
imgs = sorted(imgs_old + imgs_new, key=lambda img: img.id, reverse=True)

View File

@ -0,0 +1,103 @@
#coding:utf8
from __future__ import division, print_function, unicode_literals
import downloader
from utils import Soup, get_ext, LazyUrl, Downloader, try_n, clean_title, get_print
import ree as re
from translator import tr_
from timee import sleep
import errors
def setPage(url, p):
url = url.split('?')[0]
if p > 1:
url += '?page={}'.format(p)
return url
def getPage(url):
p = re.find('page=([0-9]+)', url)
return int(p or 1)
class Image(object):
def __init__(self, url, referer, p):
self.url = LazyUrl(referer, lambda x: url, self)
ext = get_ext(url)
self.filename = '{:04}{}'.format(p, ext)
@Downloader.register
class Downloader_v2ph(Downloader):
type = 'v2ph'
URLS = ['v2ph.com/album/']
MAX_CORE = 4
display_name = 'V2PH'
@classmethod
def fix_url(cls, url):
return url.split('?')[0]
def read(self):
info = get_info(self.url)
for img in get_imgs(self.url, info['title'], self.cw):
self.urls.append(img.url)
self.title = clean_title(info['title'])
@try_n(2)
def get_info(url):
html = downloader.read_html(url)
soup = Soup(html)
info = {}
info['title'] = soup.find('h1').text.strip()
return info
def get_imgs(url, title, cw=None):
print_ = get_print(cw)
imgs = []
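# Walk the numbered album pages; stop once the pagination widget shows no higher page.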
for p in range(1, 1001):
url = setPage(url, p)
print_(url)
for try_ in range(4):
try:
html = downloader.read_html(url, user_agent=downloader.hdr['User-Agent'])
#sleep(1)
break
except Exception as e:
print(e)
else:
raise
soup = Soup(html)
view = soup.find('div', class_='photos-list')
if view is None:
if p == 1:
raise errors.LoginRequired()
else:
break # Guest user
for img in view.findAll('img'):
img = img.attrs['data-src']
img = Image(img, url, len(imgs))
imgs.append(img)
pgn = soup.find('ul', class_='pagination')
ps = [getPage(a.attrs['href']) for a in pgn.findAll('a')]
if p >= max(ps):
print('max p')
break
msg = '{} {} ({} / {})'.format(tr_('읽는 중...'), title, p, max(ps))
if cw:
cw.setTitle(msg)
else:
print(msg)
return imgs

View File

@ -0,0 +1,58 @@
import downloader
import ree as re
from io import BytesIO as IO
from error_printer import print_error
from utils import Downloader, LazyUrl, get_ext, format_filename, try_n
import ytdl
@Downloader.register
class Downloader_vimeo(Downloader):
type = 'vimeo'
URLS = ['vimeo.com']
single = True
def init(self):
if 'vimeo.com' not in self.url.lower():
self.url = u'https://vimeo.com/{}'.format(self.url)
def read(self):
video = Video(self.url)
video.url()#
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = video.title
class Video(object):
_url = None
def __init__(self, url):
self.url = LazyUrl(url, self.get, self)
@try_n(4)
def get(self, url):
if self._url:
return self._url
ydl = ytdl.YoutubeDL()
info = ydl.extract_info(url)
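# Keep only progressive (http/https) formats and pick the widest one.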
fs = [f for f in info['formats'] if f['protocol'] in ['http', 'https']]
fs = sorted(fs, key=lambda f: int(f.get('width', 0)), reverse=True)
if not fs:
raise Exception('No MP4 videos')
f = fs[0]
self._url = f['url']
self.thumb_url = info['thumbnails'][0]['url']
self.thumb = IO()
downloader.download(self.thumb_url, buffer=self.thumb)
self.title = info['title']
ext = get_ext(self._url)
self.filename = format_filename(self.title, info['id'], ext)
return self._url

View File

@ -0,0 +1,76 @@
import downloader
import ytdl
from utils import Downloader, try_n, LazyUrl, get_ext, format_filename, clean_title
from io import BytesIO
import ree as re
from m3u8_tools import M3u8_stream
import os
@Downloader.register
class Downloader_vlive(Downloader):
type = 'vlive'
URLS = ['vlive.tv']
single = True
display_name = 'V LIVE'
def init(self):
if 'channels.vlive.tv' in self.url:
raise NotImplementedError('channel')
def read(self):
video = get_video(self.url)
self.urls.append(video.url)
self.setIcon(video.thumb)
self.enableSegment()
self.title = clean_title(video.title)
@try_n(4)
def get_video(url):
options = {
'noplaylist': True,
}
ydl = ytdl.YoutubeDL(options)
info = ydl.extract_info(url)
fs = []
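# Collect MP4 formats, ranking each by video bitrate or by the resolution parsed from its format name.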
for f in info['formats']:
if f['ext'] != 'mp4':
continue
f['quality'] = f.get('vbr') or re.find('([0-9]+)p', f['format'], re.IGNORECASE)
print(f['format'], f['quality'])
fs.append(f)
if not fs:
raise Exception('No videos')
f = sorted(fs, key=lambda f:f['quality'])[-1]
video = Video(f, info)
return video
class Video(object):
def __init__(self, f, info):
self.title = title = info['title']
self.id = info['id']
self.url = f['url']
self.thumb = BytesIO()
downloader.download(info['thumbnail'], buffer=self.thumb)
ext = get_ext(self.url)
if ext.lower() == '.m3u8':
raise NotImplementedError('stream')#
url = M3u8_stream(self.url, n_thread=4)
else:
url = self.url
self.url = LazyUrl(self.url, lambda x: url, self)
self.filename = format_filename(title, self.id, ext)

View File

@ -0,0 +1,147 @@
import downloader
from utils import Soup, LazyUrl, clean_title, get_ext, get_imgs_already, urljoin, try_n, Downloader
import os
import page_selector
from translator import tr_
import ree as re
@Downloader.register
class Downloader_webtoon(Downloader):
type = 'webtoon'
URLS = ['webtoon.com', 'webtoons.com']
MAX_CORE = 8
MAX_SPEED = 4.0
display_name = 'WEBTOON'
def init(self):
self.url = get_main(self.url)
self.soup = downloader.read_soup(self.url)
@classmethod
def fix_url(cls, url):
return url.replace('webtoon.com', 'webtoons.com')
def read(self):
title = clean_title(self.soup.find('h1').text.strip())
self.title = tr_(u'읽는 중... {}').format(title)
imgs = get_imgs_all(self.url, title, cw=self.cw)
for img in imgs:
if isinstance(img, Image):
self.urls.append(img.url)
else:
self.urls.append(img)
self.title = title
class Page(object):
def __init__(self, url, title):
self.url = url
self.title = title
class Image(object):
def __init__(self, url, page, p):
ext = get_ext(url) or downloader.get_ext(url, referer=page.url)
self.filename = '{}/{:04}{}'.format(clean_title(page.title), p, ext)
self.url = LazyUrl(page.url, lambda _: url, self)
@try_n(2)
def get_imgs(page):
html = downloader.read_html(page.url)
if 'window.__motiontoonViewerState__' in html:
raise NotImplementedError('motiontoon')
soup = Soup(html)
view = soup.find('div', class_='viewer_img')
imgs = []
for img in view.findAll('img'):
src = img.get('data-url') or img['src']
img = Image(urljoin(page.url, src), page, len(imgs))
imgs.append(img)
return imgs
def get_main(url):
if 'episode_no=' in url:
soup = downloader.read_soup(url)
url = urljoin(url, soup.find('div', class_='subj_info').find('a')['href'])
return url
def set_page(url, p):
if '&page=' not in url:
url = url + '&page={}'.format(p)
else:
url = re.sub('&page=[0-9]+', '&page={}'.format(p), url)
if p == 1:
url = url.replace('&page=1', '')
return url
def get_pages(url):
pages = []
urls = set()
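# Walk the paginated episode list, collecting unique episode links until a page adds nothing new.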
for p in range(1, 101):
url_page = set_page(url, p)
print(url_page)
for try_ in range(4):
try:
soup = downloader.read_soup(url_page)
view = soup.find('ul', id='_listUl')
if view is None:
raise Exception('no view')
break
except Exception as e:
e_ = e
print(e)
else:
raise e_
pages_new = []
for li in view.findAll('li', recursive=False):
href = urljoin(url, li.find('a')['href'])
title = li.find('span', class_='subj').text.strip()
if href in urls:
continue
urls.add(href)
no = int(li['data-episode-no'])
title = '{:04} - {}'.format(no, title)
page = Page(href, title)
pages_new.append(page)
if not pages_new:
break
pages += pages_new
return pages[::-1]
@page_selector.register('webtoon')
@try_n(4)
def f(url):
url = get_main(url)
return get_pages(url)
def get_imgs_all(url, title, cw=None):
pages = get_pages(url)
pages = page_selector.filter(pages, cw)
imgs = []
for p, page in enumerate(pages):
imgs_already = get_imgs_already('webtoon', title, page, cw)
if imgs_already:
imgs += imgs_already
continue
imgs += get_imgs(page)
msg = tr_(u'읽는 중... {} / {} ({}/{})').format(title, page.title, p + 1, len(pages))
if cw is not None:
cw.setTitle(msg)
if not cw.alive:
break
else:
print(msg)
return imgs

View File

@ -0,0 +1,180 @@
#coding:utf8
import downloader
import ree as re
from timee import sleep, clock, time
from constants import clean_url
from utils import Downloader, urljoin, try_n, Session, get_print, clean_title, Soup, fix_protocol
import os
from translator import tr_
import json
from datetime import datetime
import constants
import clf2
import errors
@Downloader.register
class Downloader_weibo(Downloader):
type = 'weibo'
URLS = ['weibo.com', 'weibo.cn']
def init(self):
self.session = Session()
@classmethod
def fix_url(cls, url):
url = url.replace('weibo.cn', 'weibo.com').split('?')[0]
if 'weibo.com/p/' in url:
id = re.findall('weibo.com/p/([^/]+)', url)[0]
url = 'https://weibo.com/p/{}'.format(id)
elif 'weibo.com/u/' in url:
id = re.findall('weibo.com/u/([^/]+)', url)[0]
url = 'https://weibo.com/u/{}'.format(id)
elif 'weibo.com/' in url:
id = re.findall('weibo.com/([^/]+)', url)[0]
url = 'https://weibo.com/{}'.format(id)
else:
id = url
url = 'https://weibo.com/u/{}'.format(id)
url = fix_protocol(url)
return url
def read(self):
checkLogin(self.session)
uid, oid, name = get_id(self.url, self.cw)
title = clean_title('{} (weibo_{})'.format(name, uid))
for img in get_imgs(uid, oid, title, self.session, cw=self.cw, d=self, parent=self.mainWindow):
self.urls.append(img.url)
self.filenames[img.url] = img.filename
self.title = title
def checkLogin(session):
c = session.cookies._cookies.get('.weibo.com', {}).get('/',{}).get('SUBP')
if not c or c.is_expired():
raise errors.LoginRequired()
class Album(object):
def __init__(self, id, type):
self.id = id
self.type = type
class Image(object):
def __init__(self, url, filename=None, timestamp=0):
self.url = url
if filename is None:
filename = os.path.basename(url)
self.filename = filename
self.timestamp = timestamp
def _get_page_id(html):
m = re.search("CONFIG\\['page_id'\\]='([0-9]+?)'", html)
return m
def get_id(url, cw=None):
for try_ in range(2):
try:
res = clf2.solve(url, cw=cw, f=_get_page_id)
html = res['html']
soup = Soup(html)
if soup.find('div', class_='gn_login'):
raise errors.LoginRequired()
m = _get_page_id(html)
if not m:
raise Exception('no page_id')
oid = m.groups()[0]
uids = re.findall('uid=([0-9]+)', html)
uid = max(set(uids), key=uids.count)
name = re.findall("CONFIG\\['onick'\\]='(.+?)'", html)[0]
break
except errors.LoginRequired as e:
raise
except Exception as e:
e_ = e
print(e)
else:
raise e_
return uid, oid, name
def get_imgs(uid, oid, title, session, cw=None, d=None, parent=None):
print_ = get_print(cw)
print_('uid: {}, oid:{}'.format(uid, oid))
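# First enumerate all of the user's albums, then page through each album's photo list.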
@try_n(4)
def get_album_imgs(album, page):
url = 'https://photo.weibo.com/photos/get_all?uid={}&album_id={}&count=30&page={}&type={}&__rnd={}'.format(uid, album.id, page, album.type, int(time()*1000))
referer = 'https://photo.weibo.com/{}/talbum/index'.format(uid)
html = downloader.read_html(url, referer, session=session, timeout=30)
j = json.loads(html)
data = j['data']
imgs = []
for photo in data['photo_list']:
host = photo['pic_host']
name = photo['pic_name']
id = photo['photo_id']
timestamp = photo['timestamp']
date = datetime.fromtimestamp(timestamp)
t = '{:02}-{:02}-{:02}'.format(date.year % 100, date.month, date.day)
url = '{}/large/{}'.format(host, name)
ext = os.path.splitext(name)[1]
filename = '[{}] {}{}'.format(t, id, ext)
img = Image(url, filename, timestamp)
imgs.append(img)
return imgs
def get_albums(page):
url = 'https://photo.weibo.com/albums/get_all?uid={}&page={}&count=20&__rnd={}'.format(uid, page, int(time()*1000))
referer = 'https://photo.weibo.com/{}/albums?rd=1'.format(uid)
html = downloader.read_html(url, referer, session=session)
j = json.loads(html)
data = j['data']
albums = []
for album in data['album_list']:
id = album['album_id']
type = album['type']
album = Album(id, type)
albums.append(album)
return albums
albums = []
for p in range(1, 101):
albums_new = get_albums(p)
albums += albums_new
print_('p:{}, albums:{}'.format(p, len(albums)))
if not albums_new:
break
imgs = []
for album in albums:
print('Album:', album.id, album.type)
for p in range(1, 101):
imgs_new = get_album_imgs(album, p)
imgs += imgs_new
s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
if cw:
if not cw.alive:
return []
cw.setTitle(s)
else:
print(s)
if not imgs_new:
break
sleep(1)
imgs = sorted(imgs, key=lambda img: img.timestamp, reverse=True)
return imgs