# Hitomi-Downloader/src/extractor/lhscan_downloader.py
#coding:utf8
import downloader
from utils import Soup, urljoin, LazyUrl, Downloader, try_n, Session, clean_title, get_print
import os
from translator import tr_
import page_selector
import clf2
import utils
import base64
import ree as re
import errors
##from image_reader import QPixmap
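

# A single image within a chapter. LazyUrl defers the request; here it
# mainly carries the chapter URL as the referer, since the image URL
# itself is already known at construction time.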
class Image(object):
    def __init__(self, url, page, p):
        self._url = url
        self.url = LazyUrl(page.url, self.get, self)#, pp=self.pp)
        ext = os.path.splitext(url)[1]
        if ext.lower()[1:] not in ['jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp']:
            ext = '.jpg'
        self.filename = u'{}/{:04}{}'.format(page.title, p, ext)

    def get(self, _):
        return self._url

##    def pp(self, filename):
##        pixmap = QPixmap(filename)
##        pixmap.save(filename)
##        return filename
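

# A chapter entry: cleaned display title plus its absolute URL.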
class Page(object):
    def __init__(self, title, url):
        self.title = clean_title(title)
        self.url = url
def get_soup_session(url, cw=None):
    print_ = get_print(cw)
    session = Session()
    res = clf2.solve(url, session=session, cw=cw)
    print_('{} -> {}'.format(url, res['url']))
    if res['url'].rstrip('/') == 'https://welovemanga.one':
        # redirected to the front page; the gallery likely requires login
        raise errors.LoginRequired()
    return Soup(res['html']), session
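

# Main extractor; URLS entries are matched against the input URL.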
class Downloader_lhscan(Downloader):
    type = 'lhscan'
    URLS = [
        #'lhscan.net', 'loveheaven.net',
        'lovehug.net', 'welovemanga.',
        ]
    MAX_CORE = 16
    display_name = 'LHScan'
    _soup = None

    def init(self):
        self._soup, self.session = get_soup_session(self.url, self.cw)

        if not self.soup.find('ul', class_='manga-info'):
            raise errors.Invalid(u'{}: {}'.format(tr_(u'목록 주소를 입력해주세요'), self.url)) # 'Please enter a gallery list URL'

    @classmethod
    def fix_url(cls, url):
        url = url.replace('lovehug.net', 'welovemanga.one')
        url = url.replace('welovemanga.net', 'welovemanga.one') #4298
        return url
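
    # Lazily fetch and cache the gallery page, retrying a few times before
    # giving up and re-raising the last error.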
    @property
    def soup(self):
        if self._soup is None:
            for try_ in range(8):
                try:
                    html = downloader.read_html(self.url, session=self.session)
                    break
                except Exception as e:
                    e_ = e
                    print(e)
            else:
                raise e_
            self._soup = Soup(html)
        return self._soup

    @property
    def name(self):
        title = self.soup.find('ul', class_='manga-info').find('h3').text
        return clean_title(title)

    def read(self):
        self.title = tr_(u'읽는 중... {}').format(self.name) # 'Reading... {}'

        imgs = get_imgs(self.url, self.name, self.session, self.soup, self.cw)

        for img in imgs:
            self.urls.append(img.url)

        self.title = self.name
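

# Collect the images of a single chapter. The site hides real image URLs
# behind rotating lazy-load attributes and mixes in credit/watermark and
# placeholder images, which are filtered out below.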
@try_n(8)
def get_imgs_page(page, referer, session, cw=None):
    print_ = get_print(cw)
    print_(page.title)

    html = downloader.read_html(page.url, referer, session=session)
    if clf2._is_captcha(Soup(html)): #4124
        html = clf2.solve(page.url, session, cw)['html']
    if not html:
        raise Exception('empty html')
    # the lazy-load attribute name is obfuscated; recover it from the inline
    # script ($(this).attr('...')) and normalize it to data-src
    html = html.replace('{}='.format(re.find(r"\$\(this\)\.attr\('(.+?)'", html, err='no cn')), 'data-src=')
    soup = Soup(html)

    view = soup.find('div', class_='chapter-content')
    if not view:
        raise Exception('no chapter-content')

    imgs = []
    for img in soup.findAll('img', class_='chapter-img'):
        # the URL may sit in any of several lazy-load attributes
        src = img.get('data-pagespeed-lazy-src') or img.get('data-src') or img.get('data-srcset') or img.get('data-aload') or img['src']
        try:
            # some chapters serve the URL base64-encoded
            src = base64.b64decode(src).strip().decode('utf8')
        except Exception:
            pass
        src0 = src
        src = src.replace('welovemanga.one', '1')#
        src = urljoin(page.url, src).strip()
        # skip credit/watermark and lazy-loading placeholder images
        if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
            continue
        if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
            continue
        if 'LoveHug_600cfd96e98ff.jpg' in src:
            continue
        if 'image_5f0ecf23aed2e.png' in src:
            continue
        if '/uploads/lazy_loading.gif' in src:
            continue
        if not imgs:
            print_(src0)
        img = Image(src, page, len(imgs))
        imgs.append(img)

    return imgs
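

# Scrape the chapter list from a gallery page.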
def get_pages(url, session, soup=None, cw=None):
    if soup is None:
        html = downloader.read_html(url, session=session)
        soup = Soup(html)
    tab = soup.find('ul', class_='list-chapters')

    pages = []
    for li in tab.findAll('li'):
        text = li.find('div', class_='chapter-name').text.strip()
        href = li.parent['href']
        href = urljoin(url, href)
        page = Page(text, href)
        pages.append(page)

    if not pages:
        raise Exception('no pages')

    return pages[::-1] # reverse the order in which chapters are listed on the page
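

# Hook for the chapter-selection dialog.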
@page_selector.register('lhscan')
@try_n(4)
def f(url):
    soup, session = get_soup_session(url)
    pages = get_pages(url, session, soup=soup)
    return pages
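

# Walk every selected chapter and accumulate its images, updating the
# progress title as it goes; returns early if the task was cancelled.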
@try_n(2)
def get_imgs(url, title, session, soup=None, cw=None):
    if soup is None:
        html = downloader.read_html(url, session=session)
        soup = Soup(html)
    pages = get_pages(url, session, soup, cw)
    pages = page_selector.filter(pages, cw)

    imgs = []
    for i, page in enumerate(pages):
        imgs += get_imgs_page(page, url, session, cw)
        s = u'{} {} / {} ({} / {})'.format(tr_(u'읽는 중...'), title, page.title, i+1, len(pages)) # '읽는 중...' == 'Reading...'
        if cw is not None:
            if not cw.alive:
                return
            cw.setTitle(s)
        else:
            print(s)

    return imgs