Hitomi-Downloader/src/extractor/asmhentai_downloader.py

#coding: utf8
import downloader
import ree as re
from utils import Soup, urljoin, Downloader, join, LazyUrl, Session, get_print
import os
from timee import sleep
from translator import tr_


def get_id(url):
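    '''Return the numeric gallery id.

    Accepts a bare id, a /g/<id>/ URL, or a /gallery/<dir>/<id>/ URL
    (forms inferred from the regexes below).
    '''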
try:
return int(url)
    except ValueError:
if '/gallery/' in url:
return int(re.find('/gallery/[0-9]+/([0-9]+)', url))
else:
return int(re.find('/g/([0-9]+)', url))


class Downloader_asmhentai(Downloader):
type = 'asmhentai'
URLS = ['asmhentai.com']
MAX_CORE = 8
display_name = 'AsmHentai'
def init(self):
self.session = Session()
@classmethod
def fix_url(cls, url):
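        # Normalize any accepted input to the canonical gallery URL.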
id_ = get_id(url)
return 'https://asmhentai.com/g/{}/'.format(id_)
def read(self):
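        # Fetch metadata once, then derive the display title and queue images.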
info = get_info(self.url, self.session, self.cw)
# 1225
artist = join(info['artists'])
self.artist = artist
        group = join(info['groups']) if info['groups'] else 'NA'
        lang = info['language'][0] if info['language'] else 'NA'
        series = info['parodies'][0] if info['parodies'] else 'NA'
title = self.format_title(info['category'][0], info['id'], info['title'], artist, group, series, lang)
self.urls += [img.url for img in info['imgs']]
self.title = title


class Image:
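    '''A single gallery image; the URL is wrapped in LazyUrl so it is
    resolved with the gallery page as referer only when downloaded.'''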
def __init__(self, url, referer):
        self.url = LazyUrl(referer, lambda _: url, self)
self.filename = os.path.basename(url)


def get_info(url, session, cw):
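    '''Scrape the gallery page into an info dict (id, title, tags, imgs).

    Thumbnails beyond the first embedded batch are paged in via the
    thumbs_loader endpoint below.
    '''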
print_ = get_print(cw)
html = downloader.read_html(url, session=session)
soup = Soup(html)
info = {}
info['id'] = get_id(url)
title = soup.find('h1').text.strip()
info['title'] = title
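    # Tag links look like .../<key>/<value>, e.g. /artists/<name>;
    # collect them into lists keyed by section.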
for tag in soup.findAll('span', class_='tag'):
href = tag.parent.attrs['href']
href = urljoin(url, href).strip('/')
key = href.split('/')[3]
value = href.split('/')[-1]
if key == 'language' and value == 'translated':
continue
if key in info:
info[key].append(value)
else:
info[key] = [value]
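    # Guarantee list-valued keys exist even when the gallery has no such tags.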
for key in ['artists', 'groups', 'parodies', 'tags', 'characters']:
if key not in info:
info[key] = []
info['imgs'] = []
def read_imgs(soup):
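        # 'data-src' carries the real URL for lazy-loaded thumbnails; fall
        # back to 'src'. Thumbnail names end in 't.jpg'; dropping the 't'
        # yields the full-size image URL.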
c = 0
for img in soup.findAll('div', class_='preview_thumb'):
img = img.find('img').attrs.get('data-src') or img.find('img').attrs.get('src')
img = urljoin(url, img).replace('t.jpg', '.jpg')
img = Image(img, url)
info['imgs'].append(img)
c += 1
if not c:
raise Exception('no imgs')
read_imgs(soup)
csrf = soup.find('meta', {'name':'csrf-token'})['content']
print_(f'csrf: {csrf}')
t_pages = int(soup.find('input', type='hidden', id='t_pages')['value'])
print_(f't_pages: {t_pages}')
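    # The gallery page embeds only the first batch of thumbnails; keep
    # requesting more from the AJAX loader until all t_pages are collected.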
while len(info['imgs']) < t_pages: #4971
print_('imgs: {}'.format(len(info['imgs'])))
sleep(1, cw)
        cw.setTitle('{} {} - {} / {}'.format(tr_('읽는 중...'), info['title'], len(info['imgs']), t_pages))  # '읽는 중...' = 'Reading...'
data = {
'_token': csrf,
'id': str(info['id']),
'dir': soup.find('input', type='hidden', id='dir')['value'],
            'v_pages': len(info['imgs']),  # thumbnails loaded so far
't_pages': str(t_pages),
'type': '1',
}
r = session.post('https://asmhentai.com/inc/thumbs_loader.php', data=data)
soup_more = Soup(r.text)
read_imgs(soup_more)
return info