Hitomi-Downloader/src/extractor/asmhentai_downloader.py

#coding: utf8
import downloader
import ree as re
from utils import Soup, urljoin, Downloader, join, LazyUrl, Session, get_print
import os
from timee import sleep
from translator import tr_


def get_id(url):
    try:
        return int(url)
    except:
        if '/gallery/' in url:
            return int(re.find('/gallery/[0-9]+/([0-9]+)', url))
        else:
            return int(re.find('/g/([0-9]+)', url))
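
# Illustrative calls (the URL shapes below are assumed for the example, not
# taken from the original file):
#   get_id('123456')                                  -> 123456
#   get_id('https://asmhentai.com/g/123456/')         -> 123456
#   get_id('https://asmhentai.com/gallery/1/123456/') -> 123456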


class Downloader_asmhentai(Downloader):
    type = 'asmhentai'
    URLS = ['asmhentai.com']
    MAX_CORE = 8
    display_name = 'AsmHentai'

    def init(self):
        self.session = Session()

    @classmethod
    def fix_url(cls, url):
        id_ = get_id(url)
        return 'https://asmhentai.com/g/{}/'.format(id_)

    def read(self):
        info = get_info(self.url, self.session, self.cw)

        # 1225
        artist = join(info['artists'])
        self.artist = artist
        group = join(info['groups']) if info['groups'] else u'NA'
        lang = info['language'][0] if info['language'] else u'NA'
        series = info['parodies'][0] if info['parodies'] else u'NA'
        title = self.format_title(info['category'][0], info['id'], info['title'], artist, group, series, lang)

        self.urls += [img.url for img in info['imgs']]

        self.title = title


class Image:
    def __init__(self, url, referer):
        # defer the request: LazyUrl carries the gallery page as referer and
        # resolves to the direct image URL when the file is downloaded
        self.url = LazyUrl(referer, lambda _: url, self)
        self.filename = os.path.basename(url)


def get_info(url, session, cw):
    print_ = get_print(cw)
    html = downloader.read_html(url, session=session)
    soup = Soup(html)

    info = {}
    info['id'] = get_id(url)

    title = soup.find('h1').text.strip()
    info['title'] = title

    # each tag links to a URL whose first path segment is the category
    # (artists, groups, parodies, ...) and whose last segment is the value
    for tag in soup.findAll('span', class_='tag'):
        href = tag.parent.attrs['href']
        href = urljoin(url, href).strip('/')

        key = href.split('/')[3]
        value = href.split('/')[-1]

        if key == 'language' and value == 'translated':
            continue

        if key in info:
            info[key].append(value)
        else:
            info[key] = [value]

    for key in ['artists', 'groups', 'parodies', 'tags', 'characters']:
        if key not in info:
            info[key] = []

    info['imgs'] = []

    def read_imgs(soup):
        c = 0
        for img in soup.findAll('div', class_='preview_thumb'):
            # prefer the lazy-load 'data-src' attribute, then turn the
            # thumbnail filename into the full-size one ('...t.jpg' -> '....jpg')
            img = img.find('img').attrs.get('data-src') or img.find('img').attrs.get('src')
            img = urljoin(url, img).replace('t.jpg', '.jpg')
            img = Image(img, url)
            info['imgs'].append(img)
            c += 1
        if not c:
            raise Exception('no imgs')

    read_imgs(soup)

    csrf = soup.find('meta', {'name': 'csrf-token'})['content']
    print_(f'csrf: {csrf}')

    t_pages = int(soup.find('input', type='hidden', id='t_pages')['value'])
    print_(f't_pages: {t_pages}')

    # the gallery page embeds only the first batch of thumbnails; keep asking
    # thumbs_loader.php for more until all t_pages images have been collected
    while len(info['imgs']) < t_pages: #4971
        print_('imgs: {}'.format(len(info['imgs'])))
        sleep(1, cw)
        cw.setTitle('{} {} - {} / {}'.format(tr_('읽는 중...'), info['title'], len(info['imgs']), t_pages))  # '읽는 중...' = 'Reading...'
        data = {
            '_token': csrf,
            'id': str(info['id']),
            'dir': soup.find('input', type='hidden', id='dir')['value'],
            'v_pages': len(info['imgs']),
            't_pages': str(t_pages),
            'type': '1',
        }
        r = session.post('https://asmhentai.com/inc/thumbs_loader.php', data=data)
        soup_more = Soup(r.text)
        read_imgs(soup_more)

    return info
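
For reference, here is a minimal standalone sketch of the thumbnail-pagination request that the loop in get_info() performs, assuming the third-party requests library. The endpoint and form-field names come from the code above; the function name fetch_more_thumbs and the placeholder argument values are illustrative only and not part of the original module.

import requests

def fetch_more_thumbs(csrf, gallery_id, dir_, loaded, total, session=None):
    # Mirrors the payload built in get_info(); returns an HTML fragment
    # containing additional 'preview_thumb' divs.
    session = session or requests.Session()
    data = {
        '_token': csrf,
        'id': str(gallery_id),
        'dir': dir_,
        'v_pages': loaded,      # number of thumbnails already loaded
        't_pages': str(total),  # total page count reported by the gallery
        'type': '1',
    }
    r = session.post('https://asmhentai.com/inc/thumbs_loader.php', data=data)
    r.raise_for_status()
    return r.text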