#coding:utf8
import downloader
import ree as re
from timee import sleep, clock, time
from constants import clean_url
from utils import Downloader, urljoin, try_n, Session, get_print, clean_title, Soup, fix_protocol
import os
from translator import tr_
import json
from collections import Counter
from datetime import datetime
import constants
import clf2
import errors


# Upper bound on the number of pages requested from each paginated endpoint
# (album list and per-album photo list).
_MAX_PAGES = 100


@Downloader.register
class Downloader_weibo(Downloader):
    """Downloader plugin that collects the photo-album images of a Weibo user."""

    # NOTE(review): `type` and `URLS` are presumably read by the `Downloader`
    # base class / `Downloader.register` to route URLs -- confirm in utils.
    type = 'weibo'
    URLS = ['weibo.com', 'weibo.cn']

    def init(self):
        """Create the HTTP session that every request of this job shares."""
        self.session = Session()

    @classmethod
    def fix_url(cls, url):
        """Normalize user input into a canonical ``weibo.com`` profile URL.

        ``weibo.cn`` is rewritten to ``weibo.com`` and the query string is
        dropped. The result has one of these shapes, depending on the input:

        * ``https://weibo.com/p/<id>``
        * ``https://weibo.com/u/<id>``
        * ``https://weibo.com/<id>``

        Input that does not contain ``weibo.com/`` at all is treated as a bare
        user id and becomes ``https://weibo.com/u/<input>``.

        Raises:
            IndexError: if the URL contains a ``weibo.com/`` prefix but no
                id segment follows it (for example ``https://weibo.com/``).
        """
        url = url.replace('weibo.cn', 'weibo.com').split('?')[0]
        if 'weibo.com/p/' in url:
            ident = re.findall('weibo.com/p/([^/]+)', url)[0]
            url = 'https://weibo.com/p/{}'.format(ident)
        elif 'weibo.com/u/' in url:
            ident = re.findall('weibo.com/u/([^/]+)', url)[0]
            url = 'https://weibo.com/u/{}'.format(ident)
        elif 'weibo.com/' in url:
            ident = re.findall('weibo.com/([^/]+)', url)[0]
            url = 'https://weibo.com/{}'.format(ident)
        else:
            # No host present: the whole input is taken as the id.
            url = 'https://weibo.com/u/{}'.format(url)
        return fix_protocol(url)

    def read(self):
        """Resolve the profile and register every image URL for download.

        Fills ``self.urls`` and ``self.filenames`` (keyed by image URL) and
        sets ``self.title``.

        Raises:
            errors.LoginRequired: if the session is not logged in.
        """
        checkLogin(self.session)

        uid, oid, name = get_id(self.url, self.cw)
        title = clean_title('{} (weibo_{})'.format(name, uid))

        for img in get_imgs(uid, oid, title, self.session, cw=self.cw, d=self, parent=self.mainWindow):
            self.urls.append(img.url)
            self.filenames[img.url] = img.filename

        self.title = title


def checkLogin(session):
    """Raise ``errors.LoginRequired`` unless a valid ``SUBP`` cookie exists.

    The cookie is looked up for domain ``.weibo.com`` and path ``/``; a
    missing or expired cookie counts as "not logged in".
    """
    c = session.cookies._cookies.get('.weibo.com', {}).get('/', {}).get('SUBP')
    if not c or c.is_expired():
        raise errors.LoginRequired()


class Album(object):
    """Identifier and type of one photo album, as returned by the album list."""

    def __init__(self, id, type):
        # Parameter names shadow builtins but are part of the public
        # signature, so they are left unchanged.
        self.id = id
        self.type = type


class Image(object):
    """One downloadable image: source URL, target filename and timestamp."""

    def __init__(self, url, filename=None, timestamp=0):
        self.url = url
        if filename is None:
            # Fall back to the last path component of the URL.
            filename = os.path.basename(url)
        self.filename = filename
        self.timestamp = timestamp


def _get_page_id(html):
    """Return the regex match for ``CONFIG['page_id']`` in *html*, or None.

    Group 1 of the match is the numeric page id.
    """
    return re.search(r"CONFIG\['page_id'\]='([0-9]+?)'", html)


def get_id(url, cw=None):
    """Load the profile page and extract ``(uid, oid, name)`` from it.

    * ``oid`` is the value of ``CONFIG['page_id']``.
    * ``uid`` is the most frequent ``uid=<digits>`` value in the page; ties
      are resolved in favour of the value that appears first.
    * ``name`` is the value of ``CONFIG['onick']``.

    The page is fetched at most twice; a failed attempt is printed and
    retried.

    Raises:
        errors.LoginRequired: immediately, if the page contains a
            ``div.gn_login`` element.
        Exception: the error of the last attempt, if both attempts fail.
    """
    last_error = None
    for _ in range(2):
        try:
            # NOTE(review): `f` looks like a "page is ready" predicate for
            # clf2.solve -- confirm against clf2.
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            if Soup(html).find('div', class_='gn_login'):
                raise errors.LoginRequired()

            m = _get_page_id(html)
            if not m:
                raise Exception('no page_id')
            oid = m.groups()[0]

            uids = re.findall('uid=([0-9]+)', html)
            if not uids:
                raise Exception('no uid')
            # Counter gives a linear-time count, and (on Python 3.7+) a
            # reproducible winner on ties, unlike iterating a set of strings.
            uid = Counter(uids).most_common(1)[0][0]

            name = re.findall(r"CONFIG\['onick'\]='(.+?)'", html)[0]
            return uid, oid, name
        except errors.LoginRequired:
            # Retrying cannot help; let the caller ask for a login.
            raise
        except Exception as e:
            last_error = e
            print(e)
    raise last_error


def get_imgs(uid, oid, title, session, cw=None, d=None, parent=None):
    """Collect every image of every album of user *uid*.

    Args:
        uid: user id used in the photo.weibo.com API URLs.
        oid: page id; only logged here.
        title: job title, shown in the progress text.
        session: session used for all HTTP requests.
        cw: optional progress widget; when given, its title is updated and
            its ``alive`` flag is polled so that the job can be cancelled.
        d, parent: accepted for interface compatibility; unused here.

    Returns:
        A list of ``Image`` objects sorted newest first, or an empty list if
        the job was cancelled through ``cw``.
    """
    print_ = get_print(cw)
    print_('uid: {}, oid:{}'.format(uid, oid))

    @try_n(4)
    def get_album_imgs(album, page):
        """Return the images on one page of *album*."""
        url = 'https://photo.weibo.com/photos/get_all?uid={}&album_id={}&count=30&page={}&type={}&__rnd={}'.format(uid, album.id, page, album.type, int(time()*1000))
        referer = 'https://photo.weibo.com/{}/talbum/index'.format(uid)
        html = downloader.read_html(url, referer, session=session, timeout=30)
        data = json.loads(html)['data']
        imgs = []
        for photo in data['photo_list']:
            host = photo['pic_host']
            pic_name = photo['pic_name']
            photo_id = photo['photo_id']
            timestamp = photo['timestamp']
            date = datetime.fromtimestamp(timestamp)
            t = '{:02}-{:02}-{:02}'.format(date.year % 100, date.month, date.day)
            img_url = '{}/large/{}'.format(host, pic_name)
            ext = os.path.splitext(pic_name)[1]
            filename = '[{}] {}{}'.format(t, photo_id, ext)
            imgs.append(Image(img_url, filename, timestamp))
        return imgs

    # Retried and time-limited like get_album_imgs, so that one transient
    # network error while listing albums does not abort the whole job.
    @try_n(4)
    def get_albums(page):
        """Return the albums on one page of the user's album list."""
        url = 'https://photo.weibo.com/albums/get_all?uid={}&page={}&count=20&__rnd={}'.format(uid, page, int(time()*1000))
        referer = 'https://photo.weibo.com/{}/albums?rd=1'.format(uid)
        html = downloader.read_html(url, referer, session=session, timeout=30)
        data = json.loads(html)['data']
        return [Album(entry['album_id'], entry['type']) for entry in data['album_list']]

    # Page through the album list until an empty page is returned.
    albums = []
    for p in range(1, _MAX_PAGES + 1):
        albums_new = get_albums(p)
        albums += albums_new
        print_('p:{}, albums:{}'.format(p, len(albums)))
        if not albums_new:
            break

    imgs = []
    for album in albums:
        print('Album:', album.id, album.type)
        for p in range(1, _MAX_PAGES + 1):
            imgs_new = get_album_imgs(album, p)
            imgs += imgs_new
            # The Korean literal is the translation key for "Reading...".
            s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
            if cw:
                if not cw.alive:
                    # Job was cancelled: discard everything collected so far.
                    return []
                cw.setTitle(s)
            else:
                print(s)
            if not imgs_new:
                break
            # Pause between page requests.
            sleep(1)

    # Newest first; sorted() is stable, so equal timestamps keep fetch order.
    imgs = sorted(imgs, key=lambda img: img.timestamp, reverse=True)
    return imgs