214 lines
5.6 KiB
Python
214 lines
5.6 KiB
Python
import downloader
|
|
import ree as re
|
|
from utils import Session, LazyUrl, Soup, Downloader, try_n, get_print, clean_title, print_error, urljoin, get_imgs_already, check_alive
|
|
from time import sleep
|
|
from translator import tr_
|
|
import page_selector
|
|
import json
|
|
import clf2
|
|
from ratelimit import limits, sleep_and_retry
|
|
|
|
|
|
class Page:
    """One episode of a KakaoPage series.

    Attributes:
        id_: Product id of the episode, as passed in.
        title: Episode title, as passed in.
        url: Viewer URL built from ``id_``.
    """

    def __init__(self, id_, title):
        # The viewer address is fully determined by the product id.
        viewer_url = 'https://page.kakao.com/viewer?productId={}'.format(id_)
        self.id_, self.title, self.url = id_, title, viewer_url
|
|
|
|
|
|
class Image:
    """A single image belonging to an episode.

    Attributes:
        url: ``LazyUrl`` wrapper created with the site root as first
            argument and ``self.get`` as the callback.
        filename: ``<cleaned episode title>/<4-digit index>.jpg``.
    """

    def __init__(self, url, page, p):
        # The real address is kept private and only handed out through get().
        self._url = url
        # NOTE(review): presumably LazyUrl invokes self.get when the file is
        # actually downloaded -- confirm against utils.LazyUrl.
        self.url = LazyUrl('https://page.kakao.com/', self.get, self)
        # One folder per episode; images are numbered with zero padding.
        folder = clean_title(page.title)
        self.filename = '{}/{:04}{}'.format(folder, p, '.jpg')

    @sleep_and_retry
    @limits(5, 1)
    def get(self, _):
        """Return the stored image URL.

        Wrapped by ``limits(5, 1)`` and ``sleep_and_retry`` from the
        ``ratelimit`` package; the positional argument is ignored.
        """
        return self._url
|
|
|
|
|
|
|
|
class Downloader_kakaopage(Downloader):
    """Downloader plugin for series hosted on page.kakao.com."""

    # Plugin metadata read by the Downloader framework.
    type = 'kakaopage'
    URLS = ['page.kakao.com/home']
    MAX_CORE = 4
    MAX_SPEED = 4.0
    display_name = 'KakaoPage'
    ACCEPT_COOKIES = [r'(.*\.)?kakao\.com']

    def init(self):
        """Create the session shared by every request of this task."""
        self.session = Session()

    @classmethod
    def fix_url(cls, url):
        """Normalise *url* to ``https://page.kakao.com/home?seriesId=<id>``.

        Accepted inputs are a ``/home/<anything>/<digits>`` style address
        or a bare numeric series id. Anything else is returned unchanged.
        """
        series_id = re.find('/home/.+?/([0-9]+)', url)
        candidate = url if series_id is None else series_id
        if not candidate.isdecimal():
            return candidate
        return 'https://page.kakao.com/home?seriesId={}'.format(candidate)

    def read(self):
        """Fill ``urls``, ``artist`` and ``title`` from the series info."""
        info = get_info(self.url, self.session, cw=self.cw)

        for item in info['imgs']:
            # Image objects contribute their lazy URL; any other entry is
            # appended as-is.
            self.urls.append(item.url if isinstance(item, Image) else item)

        self.artist = info['artist']
        self.title = info['title']
|
|
|
|
|
|
|
|
def get_id(url):
    """Return the digits following ``seriesId=`` in *url*.

    The search is delegated to ``re.find`` with ``err='No seriesId'``.
    NOTE(review): presumably that helper raises with this message when
    nothing matches -- confirm against the ``ree`` module.
    """
    return re.find('seriesId=([0-9]+)', url, err='No seriesId')
|
|
|
|
|
|
|
|
def get_pages(url, session, cw=None):
    """Collect the episodes of the series at *url* as ``Page`` objects.

    The "singles" API is queried page by page (20 entries each, ascending
    order, at most 500 requests) until it returns an empty list. Episodes
    whose id was already seen are skipped.

    Args:
        url: Series URL containing ``seriesId=``; also sent as Referer.
        session: Session object used for the POST requests.
        cw: Optional handle passed to ``check_alive`` before each request.

    Returns:
        List of ``Page`` objects in the order the API returned them.
    """
    series_id = get_id(url)
    endpoint = 'https://api2-page.kakao.com/api/v5/store/singles'

    pages = []
    seen_ids = set()
    for p in range(500):  #2966
        check_alive(cw)
        payload = {
            'seriesid': series_id,
            'page': str(p),
            'direction': 'asc',
            'page_size': '20',
            'without_hidden': 'true',
        }
        r = session.post(endpoint, data=payload, headers={'Referer': url})
        print(p, r)

        singles = r.json()['singles']
        if not singles:
            # An empty list marks the end of the listing.
            print('no singles')
            break

        for single in singles:
            title_page, id_page = single['title'], single['id']
            if id_page in seen_ids:
                print('dup id')
            else:
                seen_ids.add(id_page)
                pages.append(Page(id_page, title_page))
        # Pause between consecutive API requests.
        sleep(.5)
    return pages
|
|
|
|
|
|
def read_html(url, session):
    """Load *url* via ``clf2.solve`` and return the ``'html'`` entry of its result."""
    return clf2.solve(url, session=session)['html']
|
|
|
|
|
|
@try_n(2)
def get_imgs_page(page, session):
    """Return the ``Image`` objects of one episode.

    The viewer page is loaded to extract the ``did`` value, which is then
    posted together with the product id to the download-data API.

    Args:
        page: ``Page`` whose ``url`` and ``id_`` are used.
        session: Session object used for the requests.

    Returns:
        List of ``Image`` objects, indexed from 0 in API order.

    Raises:
        Exception: With the API's ``message`` when ``result_code`` is truthy.
    """
    viewer_html = read_html(page.url, session=session)
    did = re.find('"did" *: *"(.+?)"', viewer_html, err='no did')

    payload = {
        'productId': page.id_,
        'device_mgr_uid': 'Windows - Chrome',
        'device_model': 'Windows - Chrome',
        'deviceId': did,
    }
    print(payload)
    r = session.post(
        'https://api2-page.kakao.com/api/v1/inven/get_download_data/web',
        data=payload,
        headers={'Referer': page.url},
    )
    data = r.json()
    if data['result_code']:
        raise Exception(data['message'])

    # Each file's secureUrl is appended to this resource endpoint.
    prefix = 'https://page-edge.kakao.com/sdownload/resource?kid='  #5176
    files = data['downloadData']['members']['files']
    return [
        Image(prefix + entry['secureUrl'], page, index)
        for index, entry in enumerate(files)
    ]
|
|
|
|
|
|
def get_info(url, session, cw=None):
    """Gather title, artist and image list for the series at *url*.

    Args:
        url: Series URL containing ``seriesId=``.
        session: Session object used for all requests; its ``_kptid``
            cookie is set when the page exposes a ``tid``.
        cw: Optional handle used for logging, liveness checks, page
            filtering and title updates.

    Returns:
        Dict with keys ``artist``, ``title_raw``, ``title`` and ``imgs``.

    Raises:
        Exception: ``'no pages'`` when no episode remains after filtering,
            ``'no imgs'`` when no image could be collected.
    """
    print_ = get_print(cw)

    pages = page_selector.filter(get_pages(url, session, cw), cw)
    if not pages:
        raise Exception('no pages')

    soup = Soup(read_html(url, session=session))

    next_data = soup.find('script', id='__NEXT_DATA__')
    if next_data:
        state = json.loads(next_data.string)
        tid = state['props']['initialState']['common']['constant']['tid']
        print_('tid: {}'.format(tid))
        # Store the tid as a cookie, then load the page again with it set.
        session.cookies['_kptid'] = tid
        soup = Soup(read_html(url, session=session))

    title = soup.find('h2').text.strip()
    artist = soup.find('meta', {'name': 'author'})['content']
    # Strip spaces on either side of each comma, then re-insert exactly one
    # space after every comma.
    for pattern in (' ,', ', '):
        while pattern in artist:
            artist = artist.replace(pattern, ',')
    artist = artist.replace(',', ', ')

    info = {
        'artist': artist,
        'title_raw': title,
        'title': clean_title('[{}] {}'.format(artist, title)),
    }

    imgs = []
    total = len(pages)
    for number, page in enumerate(pages, 1):
        check_alive(cw)
        if cw is not None:
            cw.setTitle('{} {} / {} ({} / {})'.format(tr_('읽는 중...'), info['title'], page.title, number, total))

        #3463
        cached = get_imgs_already('kakaopage', info['title'], page, cw)
        if cached:
            # Use what get_imgs_already returned instead of fetching.
            imgs += cached
            continue

        # A failing episode is logged and skipped rather than aborting.
        e_msg = None
        try:
            page_imgs = get_imgs_page(page, session)
        except Exception as e:
            page_imgs = []
            e_msg = print_error(e)[0]
        print_('{} {}'.format(page.title, len(page_imgs)))
        if e_msg:
            print_(e_msg)

        imgs += page_imgs
        sleep(.2)

    if not imgs:
        raise Exception('no imgs')

    info['imgs'] = imgs
    return info
|
|
|
|
|
|
@page_selector.register('kakaopage')
@try_n(4)
def f(url):
    """Return the episode list for the page selector.

    Registered under the ``'kakaopage'`` key. A fresh ``Session`` is used
    for the listing requests.

    Raises:
        Exception: When *url* has no ``seriesId=`` part; the translated
            message asks the user to enter a series list address.
    """
    if 'seriesId=' in url:
        return get_pages(url, Session())
    raise Exception(tr_('목록 주소를 입력해주세요'))
|