"""Rebuild the ``repos`` table in ``disrepos.db`` from saved listing pages.

Reads ``disroot_repos_1.html`` .. ``disroot_repos_244.html`` from the current
directory, extracts one row per repository entry (owner, name, stars, forks,
description, language) and inserts it into a freshly created ``repos`` table.
"""
import contextlib
import json
import os
import sqlite3
import time
from typing import Dict

import requests
from bs4 import BeautifulSoup

DB_PATH = 'disrepos.db'
LAST_PAGE = 244   # number of saved listing pages
PAGE_SIZE = 20    # entries on every page except (possibly) the last

# CSS selectors, all relative to one `.flex-item-main` element.
TITLE_LINK_SEL = ':scope > .flex-item-header > .flex-item-title > a'
BODY_SEL = ':scope > .flex-item-body'
LANG_SEL = ':scope > .flex-item-header > .flex-item-trailing > a.muted[href*="&language="]'
STARS_SEL = ':scope > .flex-item-header > .flex-item-trailing > a.text.grey.flex-text-inline[href$="/stars"]'
FORKS_SEL = ':scope > .flex-item-header > .flex-item-trailing > a.text.grey.flex-text-inline[href$="/forks"]'

CREATE_REPOS_SQL = '''
    CREATE TABLE `repos` (
        `owner` TEXT NOT NULL CHECK(`owner`<>''),
        `name` TEXT NOT NULL CHECK(`name`<>''),
        `stars` INTEGER NOT NULL CHECK(`stars`>=0),
        `forks` INTEGER NOT NULL CHECK(`forks`>=0),
        `desc` TEXT CHECK(`desc`<>''),
        `lang` TEXT CHECK(`lang`<>''),
        `flag` TEXT CHECK(`flag`<>'')
    ) STRICT
'''

INSERT_REPO_SQL = '''
    INSERT INTO `repos` (`owner`, `name`, `stars`, `forks`, `desc`, `lang`)
    VALUES (?, ?, ?, ?, ?, ?)
'''


def _require(condition, message):
    """Raise ``ValueError(message)`` unless *condition* is truthy.

    Used instead of ``assert`` so the page-structure checks still run when
    Python is started with ``-O``.
    """
    if not condition:
        raise ValueError(message)


def _page_items(page):
    """Return the ``.flex-item`` entries of saved listing page number *page*.

    Raises:
        ValueError: if the page does not contain exactly one ``.flex-list``,
            or the number of entries is not ``PAGE_SIZE`` (for the last page:
            between 1 and ``PAGE_SIZE``).
    """
    path = f'disroot_repos_{page}.html'
    # Explicit encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes the saved pages are UTF-8 -- confirm against the
    # script that downloaded them.
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    flex_lists = soup.select('.flex-list')
    _require(
        len(flex_lists) == 1,
        f'{path}: expected exactly one .flex-list, found {len(flex_lists)}',
    )
    items = flex_lists[0].select(':scope > .flex-item')
    if page == LAST_PAGE:
        _require(
            1 <= len(items) <= PAGE_SIZE,
            f'{path}: expected 1..{PAGE_SIZE} entries, found {len(items)}',
        )
    else:
        _require(
            len(items) == PAGE_SIZE,
            f'{path}: expected {PAGE_SIZE} entries, found {len(items)}',
        )
    return items


def _parse_repo(flex_item):
    """Extract ``(owner, name, stars, forks, desc, lang)`` from one entry.

    ``desc`` and ``lang`` are ``None`` when absent or blank, because the
    table's CHECK constraints reject empty strings.

    Raises:
        ValueError: if the entry does not have exactly one
            ``.flex-item-main`` child.
    """
    mains = flex_item.select(':scope > .flex-item-main')
    _require(
        len(mains) == 1,
        f'expected exactly one .flex-item-main, found {len(mains)}',
    )
    row = mains[0]

    url = row.select(TITLE_LINK_SEL)[0]['href']

    # A description is only read when the entry has exactly two
    # `.flex-item-body` children; presumably the other one is metadata and is
    # the only body present when there is no description -- TODO confirm.
    bodies = row.select(BODY_SEL)
    desc = None
    if len(bodies) == 2:
        desc = bodies[0].decode_contents().strip() or None

    lang_links = row.select(LANG_SEL)
    # `or None`: same "no empty strings in the db" rule as for `desc`.
    lang = (lang_links[0].text.strip() or None) if lang_links else None

    stars = int(row.select(STARS_SEL)[0].text)
    forks = int(row.select(FORKS_SEL)[0].text)

    # href is split on '/' and the leading (empty) component is dropped,
    # i.e. it is expected to look like "/<owner>/<name>".
    url_parts = url.split('/')[1:]
    owner, name = url_parts[0], url_parts[1]

    return owner, name, stars, forks, desc, lang


# isolation_level=None puts the connection in autocommit mode, so every
# statement below is committed on its own.
# NOTE(review): wrapping the inserts in an explicit BEGIN/COMMIT would be
# faster and atomic, but would change what is left in the db after a failure.
with (
    contextlib.closing(sqlite3.connect(DB_PATH, isolation_level=None)) as db,
    contextlib.closing(db.cursor()) as cursor,
):
    # IF EXISTS tolerates a missing table without hiding real errors
    # (the previous bare `except: pass` swallowed everything).
    cursor.execute('DROP TABLE IF EXISTS `repos`')
    cursor.execute(CREATE_REPOS_SQL)

    for page in range(1, LAST_PAGE + 1):
        for flex_item in _page_items(page):
            repo = _parse_repo(flex_item)
            print(*repo)
            cursor.execute(INSERT_REPO_SQL, repo)