Hitomi-Downloader/src/extractor/kakaopage_downloader.py

214 lines
5.6 KiB
Python

import downloader
import ree as re
from utils import Session, LazyUrl, Soup, Downloader, try_n, get_print, clean_title, print_error, urljoin, get_imgs_already, check_alive
from time import sleep
from translator import tr_
import page_selector
import json
import clf2
from ratelimit import limits, sleep_and_retry
class Page:
    """A single episode (viewer product) within a KakaoPage series."""

    def __init__(self, id_, title):
        self.id_ = id_
        self.title = title
        self.url = f'https://page.kakao.com/viewer?productId={id_}'
class Image:
    """One image of an episode; the actual URL is resolved lazily."""

    def __init__(self, url, page, p):
        self._url = url
        self.url = LazyUrl('https://page.kakao.com/', self.get, self)
        # Zero-padded index inside a directory named after the episode title.
        self.filename = '{}/{:04}{}'.format(clean_title(page.title), p, '.jpg')

    @sleep_and_retry
    @limits(5, 1)
    def get(self, _):
        # Rate-limited resolver (at most 5 calls per 1-second window).
        return self._url
class Downloader_kakaopage(Downloader):
    """Downloader for KakaoPage series (page.kakao.com)."""
    type = 'kakaopage'
    URLS = ['page.kakao.com/home']
    MAX_CORE = 4
    MAX_SPEED = 4.0
    display_name = 'KakaoPage'
    ACCEPT_COOKIES = [r'(.*\.)?kakao\.com']

    def init(self):
        self.session = Session()

    @classmethod
    def fix_url(cls, url):
        """Normalize a series URL (or a bare numeric id) to the canonical
        'https://page.kakao.com/home?seriesId=...' form."""
        # Renamed from `id` to avoid shadowing the builtin; matches get_id().
        id_ = re.find('/home/.+?/([0-9]+)', url)
        if id_ is not None:
            url = id_
        if url.isdecimal():
            url = 'https://page.kakao.com/home?seriesId={}'.format(url)
        return url

    def read(self):
        info = get_info(self.url, self.session, cw=self.cw)
        for img in info['imgs']:
            # Cached entries may already be plain URLs; fresh ones are Image
            # objects whose LazyUrl is appended instead.
            if isinstance(img, Image):
                img = img.url
            self.urls.append(img)
        self.artist = info['artist']
        self.title = info['title']
def get_id(url):
    """Extract the numeric seriesId from a KakaoPage list URL."""
    return re.find('seriesId=([0-9]+)', url, err='No seriesId')
def get_pages(url, session, cw=None):
    """Enumerate every episode of a series via the singles API, oldest first.

    Returns a list of Page objects; duplicate episode ids are skipped.
    """
    series_id = get_id(url)
    pages = []
    seen = set()
    api = 'https://api2-page.kakao.com/api/v5/store/singles'
    for page_no in range(500):  #2966
        check_alive(cw)
        payload = {
            'seriesid': series_id,
            'page': str(page_no),
            'direction': 'asc',
            'page_size': '20',
            'without_hidden': 'true',
        }
        r = session.post(api, data=payload, headers={'Referer': url})
        print(page_no, r)
        singles = r.json()['singles']
        if not singles:
            print('no singles')
            break
        for single in singles:
            single_id = single['id']
            if single_id in seen:
                print('dup id')
                continue
            seen.add(single_id)
            pages.append(Page(single_id, single['title']))
        sleep(.5)
    return pages
def read_html(url, session):
    """Fetch a page's HTML through clf2 (handles anti-bot challenges)."""
    return clf2.solve(url, session=session)['html']
@try_n(2)
def get_imgs_page(page, session):
    """Return the list of Image objects for a single episode.

    Raises on an API-level error (non-zero result_code).
    """
    html = read_html(page.url, session=session)
    # Device id embedded in the viewer page; required by the download API.
    did = re.find('"did" *: *"(.+?)"', html, err='no did')
    api = 'https://api2-page.kakao.com/api/v1/inven/get_download_data/web'
    payload = {
        'productId': page.id_,
        'device_mgr_uid': 'Windows - Chrome',
        'device_model': 'Windows - Chrome',
        'deviceId': did,
    }
    print(payload)
    r = session.post(api, data=payload, headers={'Referer': page.url})
    resp = r.json()
    if resp['result_code']:
        raise Exception(resp['message'])
    imgs = []
    for i, file in enumerate(resp['downloadData']['members']['files']):
        img_url = 'https://page-edge.kakao.com/sdownload/resource?kid=' + file['secureUrl']  #5176
        imgs.append(Image(img_url, page, i))
    return imgs
def get_info(url, session, cw=None):
    """Collect series metadata and all episode images.

    Returns a dict with keys 'artist', 'title_raw', 'title' and 'imgs'.
    Raises when no pages are selected or no images could be collected.
    """
    print_ = get_print(cw)
    pages = get_pages(url, session, cw)
    pages = page_selector.filter(pages, cw)
    if not pages:
        raise Exception('no pages')
    info = {}
    html = read_html(url, session=session)
    soup = Soup(html)
    __NEXT_DATA__ = soup.find('script', id='__NEXT_DATA__')
    if __NEXT_DATA__:
        # Next.js payload carries a tid; setting the '_kptid' cookie and
        # re-reading appears to yield the fully rendered page — confirm.
        data = json.loads(__NEXT_DATA__.string)
        tid = data['props']['initialState']['common']['constant']['tid']
        print_('tid: {}'.format(tid))
        session.cookies['_kptid'] = tid
        html = read_html(url, session=session)
        soup = Soup(html)
    title = soup.find('h2').text.strip()
    artist = soup.find('meta', {'name': 'author'})['content']
    # Normalize author-name separators to a single ', ' form.
    for x in [' ,', ', ']:
        while x in artist:
            artist = artist.replace(x, ',')
    artist = artist.replace(',', ', ')
    info['artist'] = artist
    info['title_raw'] = title
    info['title'] = clean_title('[{}] {}'.format(artist, title))
    imgs = []
    for i, page in enumerate(pages):
        check_alive(cw)
        if cw is not None:
            cw.setTitle('{} {} / {} ({} / {})'.format(tr_('읽는 중...'), info['title'], page.title, i + 1, len(pages)))
        #3463
        # Reuse previously downloaded images for this episode when available.
        imgs_already = get_imgs_already('kakaopage', info['title'], page, cw)
        if imgs_already:
            imgs += imgs_already
            continue
        # Best-effort per episode: log the error and keep going.
        try:
            _imgs = get_imgs_page(page, session)
            e_msg = None
        except Exception as e:
            _imgs = []
            e_msg = print_error(e)[0]
        print_('{} {}'.format(page.title, len(_imgs)))
        if e_msg:
            print_(e_msg)
        imgs += _imgs
        sleep(.2)
    if not imgs:
        raise Exception('no imgs')
    info['imgs'] = imgs
    return info
@page_selector.register('kakaopage')
@try_n(4)
def f(url):
    """Page-selector hook: list all episodes for a series-list URL."""
    if 'seriesId=' not in url:
        raise Exception(tr_('목록 주소를 입력해주세요'))
    return get_pages(url, Session())