77 lines
2.6 KiB
Python
77 lines
2.6 KiB
Python
|
|
import os
|
|
import json
|
|
import time
|
|
import sqlite3
|
|
import requests
|
|
import contextlib
|
|
from bs4 import BeautifulSoup
|
|
from typing import Dict
|
|
|
|
# Import the saved repository listing pages (`disroot_repos_<page>.html`) into
# a freshly created `repos` table in `disrepos.db`.

# Number of saved listing pages; only the last one may be partially filled.
LAST_PAGE = 244
# Number of repositories on every full listing page.
PAGE_SIZE = 20

# Selector prefixes, relative to a single `.flex-item-main` row.
HEADER = ':scope > .flex-item-header'
TRAILING = f'{HEADER} > .flex-item-trailing'

with (
    # isolation_level=None puts the connection in autocommit mode, so every
    # statement below is committed as soon as it is executed.
    contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
    contextlib.closing(db.cursor()) as cursor,
):
    # Start from a clean table on every run. IF EXISTS makes the first run
    # (no table yet) succeed without hiding genuine database errors.
    cursor.execute('DROP TABLE IF EXISTS `repos`')
    cursor.execute('''
        CREATE TABLE `repos` (
            `owner` TEXT NOT NULL CHECK(`owner`<>''),
            `name` TEXT NOT NULL CHECK(`name`<>''),
            `stars` INTEGER NOT NULL CHECK(`stars`>=0),
            `forks` INTEGER NOT NULL CHECK(`forks`>=0),
            `desc` TEXT CHECK(`desc`<>''),
            `lang` TEXT CHECK(`lang`<>''),
            `flag` TEXT CHECK(`flag`<>'')
        ) STRICT
    ''')

    for page in range(1, LAST_PAGE + 1):
        # Explicit encoding so parsing does not depend on the platform locale.
        # NOTE(review): assumes the saved pages are UTF-8 - confirm against
        # whatever wrote these files.
        with open(f'disroot_repos_{page}.html', 'r', encoding='utf-8') as f:
            doc_text = f.read()

        bs = BeautifulSoup(doc_text, 'html.parser')

        # Each page is expected to contain exactly one repository list.
        flex_lists = bs.select('.flex-list')
        assert len(flex_lists) == 1, f'page {page}: expected exactly one .flex-list'

        flex_items = flex_lists[0].select(':scope > .flex-item')
        if page == LAST_PAGE:
            # The final page may be short, but must not be empty.
            assert 1 <= len(flex_items) <= PAGE_SIZE, (
                f'page {page}: expected 1..{PAGE_SIZE} items, got {len(flex_items)}'
            )
        else:
            assert len(flex_items) == PAGE_SIZE, (
                f'page {page}: expected {PAGE_SIZE} items, got {len(flex_items)}'
            )

        for flex_item in flex_items:
            flex_item_mains = flex_item.select(':scope > .flex-item-main')
            assert len(flex_item_mains) == 1, (
                f'page {page}: expected exactly one .flex-item-main per item'
            )
            row = flex_item_mains[0]

            url = row.select(f'{HEADER} > .flex-item-title > a')[0]['href']

            # The description is taken from the first `.flex-item-body`, and
            # only when the row has exactly two of them.
            # NOTE(review): presumably rows without a description carry a
            # single, non-description body - confirm against the page markup.
            desc_el = row.select(':scope > .flex-item-body')
            desc = desc_el[0].decode_contents().strip() if len(desc_el) == 2 else None
            if not desc:  # no empty strings in the db
                desc = None

            # The language link is optional; rows without it store NULL.
            lang_el = row.select(f'{TRAILING} > a.muted[href*="&language="]')
            lang = lang_el[0].text.strip() if lang_el else None

            stars = int(row.select(f'{TRAILING} > a.text.grey.flex-text-inline[href$="/stars"]')[0].text)
            forks = int(row.select(f'{TRAILING} > a.text.grey.flex-text-inline[href$="/forks"]')[0].text)

            # The href is split on '/' and the leading (empty) segment is
            # dropped; the next two segments are the owner and the repo name.
            owner, name = url.split('/')[1:3]

            print(owner, name, stars, forks, desc, lang)
            # `flag` is not populated here and is left NULL.
            cursor.execute('''
                INSERT INTO `repos` (`owner`, `name`, `stars`, `forks`, `desc`, `lang`)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (owner, name, stars, forks, desc, lang))