disspam/disimport.py

77 lines
2.6 KiB
Python

import os
import json
import time
import sqlite3
import requests
import contextlib
from bs4 import BeautifulSoup
from typing import Dict
# Rebuild the `repos` table of disrepos.db from locally saved listing pages
# (disroot_repos_<page>.html in the current directory), one row per repository.
PAGE_COUNT = 244  # number of saved listing pages
PAGE_SIZE = 20  # items expected on every page except possibly the last

with (
    contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
    contextlib.closing(db.cursor()) as cursor,
):
    # isolation_level=None leaves the connection in autocommit mode, so every
    # statement would otherwise be committed on its own.  Run the whole rebuild
    # inside one explicit transaction: it avoids a commit per inserted row, and
    # if any page fails a sanity check the connection is closed without COMMIT,
    # which discards the partial import and keeps the previous table.
    cursor.execute('BEGIN')

    # IF EXISTS covers the "first run, no table yet" case without hiding
    # unrelated errors the way a bare `except: pass` does.
    cursor.execute('DROP TABLE IF EXISTS `repos`')
    cursor.execute('''
        CREATE TABLE `repos` (
            `owner` TEXT NOT NULL CHECK(`owner`<>''),
            `name` TEXT NOT NULL CHECK(`name`<>''),
            `stars` INTEGER NOT NULL CHECK(`stars`>=0),
            `forks` INTEGER NOT NULL CHECK(`forks`>=0),
            `desc` TEXT CHECK(`desc`<>''),
            `lang` TEXT CHECK(`lang`<>''),
            `flag` TEXT CHECK(`flag`<>'')
        ) STRICT
    ''')

    for page in range(1, PAGE_COUNT + 1):
        last = (page == PAGE_COUNT)

        # NOTE(review): no explicit encoding, so the platform default is used;
        # confirm it matches how the pages were saved.
        with open(f'disroot_repos_{page}.html', 'r') as f:
            doc_text = f.read()

        bs = BeautifulSoup(doc_text, 'html.parser')

        # Exactly one listing container is expected per page.
        flex_lists = bs.select('.flex-list')
        assert len(flex_lists) == 1

        flex_items = flex_lists[0].select(':scope > .flex-item')
        if last:
            # The final page may be short, but must not be empty.
            assert flex_items and len(flex_items) <= PAGE_SIZE
        else:
            assert len(flex_items) == PAGE_SIZE

        for flex_item in flex_items:
            flex_item_mains = flex_item.select(':scope > .flex-item-main')
            assert len(flex_item_mains) == 1
            row = flex_item_mains[0]

            # The title link's href is split on '/' below; its 2nd and 3rd
            # components are taken as owner and repository name.
            url = row.select(
                ':scope > .flex-item-header > .flex-item-title > a'
            )[0]['href']

            # A description is only read when the row has exactly two
            # `.flex-item-body` children, in which case the first one is used
            # (presumably the other body is non-description metadata -- verify
            # against the saved markup).  The inner HTML is stored, not plain
            # text, and an empty result becomes NULL because the schema
            # forbids empty strings.
            desc_el = row.select(':scope > .flex-item-body')
            if len(desc_el) == 2:
                desc = desc_el[0].decode_contents().strip() or None
            else:
                desc = None

            # The language link is optional; rows without one store NULL.
            lang_el = row.select(
                ':scope > .flex-item-header > .flex-item-trailing'
                ' > a.muted[href*="&language="]'
            )
            lang = lang_el[0].text.strip() if lang_el else None

            # Star and fork counters are the links ending in /stars and /forks.
            stars = int(row.select(
                ':scope > .flex-item-header > .flex-item-trailing'
                ' > a.text.grey.flex-text-inline[href$="/stars"]'
            )[0].text)
            forks = int(row.select(
                ':scope > .flex-item-header > .flex-item-trailing'
                ' > a.text.grey.flex-text-inline[href$="/forks"]'
            )[0].text)

            url_parts = url.split('/')[1:]
            owner = url_parts[0]
            name = url_parts[1]

            print(owner, name, stars, forks, desc, lang)

            # `flag` is not set here and therefore stays NULL.
            cursor.execute('''
                INSERT INTO `repos` (`owner`, `name`, `stars`, `forks`, `desc`, `lang`)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (owner, name, stars, forks, desc, lang))

    # Every page parsed and inserted cleanly: make the new table visible.
    cursor.execute('COMMIT')