commit 9c0cc51be236fff1c91e77ccda502782a6746522 Author: Ag Date: Tue Jan 16 16:23:31 2024 +0000 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e1bc5d4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.db +*.html +*.json diff --git a/README.md b/README.md new file mode 100644 index 0000000..fb91aff --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# disspam + +## Scripts + +### discrawl.py + +Crawls the explore tab to list public repos. + +### disimport.py + +Parses the HTML downloaded by discrawl.py and saves it in an SQLite database. + +### disserve.py + +Serves a web interface for tagging repos as good or bad. This uses the +`disrepos.db` database file created by disimport.py. + +Run: `flask --app disserve --debug run` (tested with Flask 2.2.2) + +Display: + +Repos that are marked good or bad get a green or red background color in +their first column. + +Repo owners with at least one bad repo are shown with a red background color on +all of their repos. Note that this isn't applied immediately after marking one +of their repos; the table has to be updated first. + +The currently selected row has a yellow background color. The selection is used +for keyboard shortcuts that operate on a row. + +The row currently being hovered with the mouse has a light pink background color +to make the table easier to read. + +Controls: + +- W: Open repo owner in new tab +- E: Open repo itself in new tab +- A: Mark as bad +- S: Remove mark +- D: Mark as good +- Up/Down: Select previous/next row +- Click: Select clicked row + +The "Filter" box takes an expression to filter the table of shown repos. +Expressions can use the columns shown in the table. For example, +`!stars && !forks && desc` would show only repos with no forks or stars and a +non-empty description. + +To match repos marked a specific way, use `flag === "notSpam"` for good repos, +`flag === "spam"` for bad repos and `!flag` for unmarked repos. 
+
+Feature wishlist:
+
+- Multiple selection
+- Hide mirrors
diff --git a/discrawl.py b/discrawl.py
new file mode 100644
index 0000000..a5e8196
--- /dev/null
+++ b/discrawl.py
@@ -0,0 +1,52 @@
+"""Crawl the explore tab of git.disroot.org and save every result page.
+
+For each page the response body is written to disroot_repos_<page>.html and
+the URL, status code and headers to disroot_repos_<page>_head.json. Pages whose
+HTML file already exists are skipped, so an interrupted run can be resumed.
+"""
+
+import os
+import json
+import time
+import requests
+import contextlib
+from typing import Dict
+
+with contextlib.closing(requests.Session()) as session:
+
+    for page in range(1, 245+1):
+
+        file = f'disroot_repos_{page}.html'
+        hfile = f'disroot_repos_{page}_head.json'
+
+        if os.path.exists(file):
+            continue
+
+        print(f'get page {page}')
+        url = f'https://git.disroot.org/explore/repos?page={page}&sort=oldest&q=&topic=false&language=&only_show_relevant=false'
+        response = session.get(url)
+
+        # Written before the status check, so failed requests are recorded too.
+        with open(hfile, 'w', encoding='utf-8') as f:
+            f.write(json.dumps({
+                'url': url,
+                'status': response.status_code,
+                'headers': [(k, v) for k, v in response.headers.items()],
+            }))
+
+        try:
+            response.raise_for_status()
+        except requests.HTTPError:
+            print(f'Error fetching URL "{url}"!')
+            print(f' Status: {response.status_code}')
+            print(f' Headers:')
+            for k, v in response.headers.items():
+                print(f' {k}: {v}')
+            raise
+
+        # Explicit encoding so the file does not depend on the platform default.
+        with open(file, 'w', encoding='utf-8') as f:
+            f.write(response.text)
+
+        # Pause between requests (presumably to go easy on the server).
+        time.sleep(2)
diff --git a/disimport.py b/disimport.py
new file mode 100644
index 0000000..d9b5ead
--- /dev/null
+++ b/disimport.py
@@ -0,0 +1,86 @@
+"""Parse the pages saved by discrawl.py into the `repos` table of disrepos.db.
+
+The table is dropped and recreated on every run, so any existing rows
+(including their `flag` values) are discarded.
+"""
+
+import os
+import json
+import time
+import sqlite3
+import requests
+import contextlib
+from bs4 import BeautifulSoup
+from typing import Dict
+
+with (
+    contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
+    contextlib.closing(db.cursor()) as cursor,
+):
+    # IF EXISTS covers the first run without a bare `except:` that would also
+    # hide real errors such as a locked database.
+    cursor.execute('''
+        DROP TABLE IF EXISTS `repos`
+    ''')
+    cursor.execute('''
+        CREATE TABLE `repos` (
+            `owner` TEXT NOT NULL CHECK(`owner`<>''),
+            `name` TEXT NOT NULL CHECK(`name`<>''),
+            `stars` INTEGER NOT NULL CHECK(`stars`>=0),
+            `forks` INTEGER NOT NULL CHECK(`forks`>=0),
+            `desc` TEXT CHECK(`desc`<>''),
+            `lang` TEXT CHECK(`lang`<>''),
+            `flag` TEXT CHECK(`flag`<>'')
+        ) STRICT
+    ''')
+
+    # NOTE(review): discrawl.py fetches pages 1-245, but only 1-244 are read here.
+    for page in range(1, 244+1):
+
+        last = (page == 244)
+
+        # Same encoding that discrawl.py writes the pages with.
+        with open(f'disroot_repos_{page}.html', 'r', encoding='utf-8') as f:
+            doc_text = f.read()
+
+        bs = BeautifulSoup(doc_text, 'html.parser')
+
+        flex_lists = bs.select('.flex-list')
+        assert len(flex_lists) == 1
+
+        # Every page but the last is expected to hold exactly 20 repos.
+        flex_items = flex_lists[0].select(':scope > .flex-item')
+        if last:
+            assert flex_items and len(flex_items) <= 20
+        else:
+            assert len(flex_items) == 20
+
+        for flex_item in flex_items:
+
+            flex_item_mains = flex_item.select(':scope > .flex-item-main')
+            assert len(flex_item_mains) == 1
+
+            row = flex_item_mains[0]
+
+            url = row.select(':scope > .flex-item-header > .flex-item-title > a')[0]['href']
+            desc_el = row.select(':scope > .flex-item-body')
+            # NOTE(review): the description is only read when exactly two
+            # .flex-item-body elements exist; confirm this matches the page layout.
+            desc = desc_el[0].decode_contents().strip() if len(desc_el) == 2 else None
+            if not desc: # no empty strings in the db
+                desc = None
+            lang_el = row.select(':scope > .flex-item-header > .flex-item-trailing > a.muted[href*="&language="]')
+            lang = lang_el[0].text.strip() if lang_el else None
+            stars = int(row.select(':scope > .flex-item-header > .flex-item-trailing > a.text.grey.flex-text-inline[href$="/stars"]')[0].text)
+            forks = int(row.select(':scope > .flex-item-header > .flex-item-trailing > a.text.grey.flex-text-inline[href$="/forks"]')[0].text)
+
+            # Presumably the href looks like "/<owner>/<name>".
+            url_parts = url.split('/')[1:]
+
+            owner = url_parts[0]
+            name = url_parts[1]
+            print(owner, name, stars, forks, desc, lang)
+            cursor.execute('''
+                INSERT INTO `repos` (`owner`, `name`, `stars`, `forks`, `desc`, `lang`)
+                VALUES (?, ?, ?, ?, ?, ?)
+            ''', (owner, name, stars, forks, desc, lang))
diff --git a/disserve.py b/disserve.py
new file mode 100644
index 0000000..5c99603
--- /dev/null
+++ b/disserve.py
@@ -0,0 +1,120 @@
+"""Flask app for tagging the repos stored in disrepos.db as spam or not spam."""
+
+import json
+import sqlite3
+import contextlib
+from flask import Flask, Response, abort, request
+
+app = Flask(__name__)
+
+# Values the client may send as `flag`; '' removes the mark.
+VALID_FLAGS = ('', 'spam', 'notSpam')
+
+@app.route('/main.js')
+def main_js():
+    """Serve disstuff.js as the script of the page."""
+    with open('disstuff.js', 'r', encoding='utf-8') as f:
+        return Response(
+            f.read(),
+            mimetype='application/javascript',
+        )
+
+@app.route('/repos.json')
+def repos():
+    """Return every row of `repos` as a JSON list of objects, ordered by rowid."""
+    with (
+        contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
+        contextlib.closing(db.cursor()) as cursor,
+    ):
+        cursor.execute('''
+            SELECT `rowid`, `owner`, `name`, `stars`, `forks`, `desc`, `lang`, `flag`
+            FROM `repos`
+            ORDER BY `rowid` ASC
+        ''')
+        keys = 'rowid owner name stars forks desc lang flag'.split(' ')
+        return Response(
+            json.dumps([dict(zip(keys, row)) for row in cursor]),
+            mimetype='application/json',
+        )
+
+@app.route('/badusers.json')
+def badusers():
+    """Return a JSON list of the owners with at least one repo flagged 'spam'."""
+    with (
+        contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
+        contextlib.closing(db.cursor()) as cursor,
+    ):
+        cursor.execute('''
+            SELECT DISTINCT `owner`
+            FROM `repos`
+            WHERE `flag`='spam'
+        ''')
+        return Response(
+            json.dumps([row[0] for row in cursor]),
+            mimetype='application/json',
+        )
+
+@app.route('/flag', methods=['POST'])
+def flag():
+    """Set or clear the flag of one repo.
+
+    Form fields: `rowid` (integer) and `flag` ('spam', 'notSpam' or '' to
+    remove the mark). Responds 400 on malformed input and 404 when no row has
+    the given rowid.
+    """
+    try:
+        rowid = int(request.form['rowid'])
+    except ValueError:
+        abort(400)
+    new_flag = request.form['flag']
+    if new_flag not in VALID_FLAGS:
+        abort(400)
+    with (
+        contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
+        contextlib.closing(db.cursor()) as cursor,
+    ):
+        # '' (remove the mark) is stored as NULL, as before.
+        cursor.execute('''
+            UPDATE `repos`
+            SET `flag`=?
+            WHERE `rowid`=?
+        ''', (new_flag or None, rowid))
+        # An assert would be skipped under `python -O`; report it instead.
+        if cursor.rowcount != 1:
+            abort(404)
+        return ''
+
+
+@app.route('/')
+def index():
+    """Serve the single page of the UI.
+
+    NOTE(review): the markup was missing from the reviewed copy of this file
+    (only the text "Filter:" was left). The page below is reconstructed from
+    the ids, classes and data attributes disstuff.js uses and from the colors
+    the README describes - compare it with the original before relying on it.
+    """
+    return '''
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<title>disspam</title>
+<style>
+tr.table-row:hover { background-color: lightpink; }
+tr.table-row[data-active-row] { background-color: yellow; }
+tr[data-flag="notSpam"] > td:first-child { background-color: green; }
+tr[data-flag="spam"] > td:first-child { background-color: red; }
+td[data-bad-user] { background-color: red; }
+</style>
+</head>
+<body>
+Filter:
+<input id="filter" size="80">
+<button onclick="update()">Update</button>
+<span id="filterstatus"></span>
+<div id="main"></div>
+<script src="main.js"></script>
+</body>
+</html>
+'''
diff --git a/disstuff.js b/disstuff.js
new file mode 100644
index 0000000..9d3fad2
--- /dev/null
+++ b/disstuff.js
@@ -0,0 +1,224 @@
+// Client-side logic of the tagging page served by disserve.py.
+
+// Fetches the bad-owner list and the repo list from the server and rebuilds
+// the table inside #main, applying the filter expression if one is given.
+var update = async function ()
+{
+    filterstatus.textContent = "Working...";
+
+    // A Set, not a plain object: owner names such as "constructor" would
+    // otherwise match properties inherited from Object.prototype.
+    var badUsersResponse = await fetch("badusers.json");
+    var badUsers = new Set(await badUsersResponse.json());
+
+    var response = await fetch("repos.json");
+    var json = await response.json();
+    console.log(json.length);
+
+    main.innerHTML = "";
+
+    var filterFn = null;
+    if (filter.value.trim())
+    {
+        // NOTE(review): the filter box content is run through eval() with the
+        // row's columns in scope, so only trusted expressions belong in there.
+        filterFn = Function("with (arguments[1]) return eval(arguments[0]);").bind(null, filter.value);
+    }
+
+    var table = document.createElement("table");
+    table.setAttribute("border", "");
+    var thead = document.createElement("thead");
+    // NOTE(review): the <th> tags were missing from the reviewed copy (only
+    // the column names were left) and have been restored here.
+    thead.innerHTML = `
+        <th>rowid</th>
+        <th>owner</th>
+        <th>name</th>
+        <th>desc</th>
+        <th>stars</th>
+        <th>forks</th>
+        <th>lang</th>
+    `;
+    table.appendChild(thead);
+    var tbody = document.createElement("tbody");
+    table.appendChild(tbody);
+    var numFilteredOut = 0;
+    for (var t of json)
+    {
+        if (filterFn)
+        {
+            try
+            {
+                if (!filterFn(t))
+                {
+                    numFilteredOut++;
+                    continue;
+                }
+            }
+            catch (e)
+            {
+                // want to catch syntax errors
+                alert(e);
+                break;
+            }
+        }
+
+        var tr = document.createElement("tr");
+
+        tr.setAttribute("class", "table-row");
+        if (t["flag"])
+            tr.setAttribute("data-flag", t["flag"]);
+        tr.setAttribute("data-rowid", t["rowid"]);
+
+        var tdRowid = document.createElement("td");
+        tdRowid.textContent = t["rowid"];
+        tdRowid.setAttribute("align", "right");
+        tr.appendChild(tdRowid);
+
+        var tdOwner = document.createElement("td");
+        if (badUsers.has(t["owner"]))
+            tdOwner.dataset.badUser = true;
+        var ownerLink = document.createElement("a");
+        ownerLink.className = "owner-link";
+        ownerLink.textContent = t["owner"];
+        ownerLink.href = `https://git.disroot.org/${t["owner"]}`;
+        ownerLink.target = "_blank";
+        tdOwner.appendChild(ownerLink);
+        tr.appendChild(tdOwner);
+
+        var tdName = document.createElement("td");
+        var repoLink = document.createElement("a");
+        repoLink.className = "repo-link";
+        repoLink.textContent = t["name"];
+        repoLink.href = `https://git.disroot.org/${t["owner"]}/${t["name"]}`;
+        repoLink.target = "_blank";
+        tdName.appendChild(repoLink);
+        tr.appendChild(tdName);
+
+        var tdDesc = document.createElement("td");
+        // NOTE(review): desc is the HTML scraped by disimport.py and is inserted
+        // unescaped; this relies on the crawled site having sanitized it.
+        tdDesc.innerHTML = t["desc"];
+        tr.appendChild(tdDesc);
+
+        var tdStars = document.createElement("td");
+        tdStars.textContent = t["stars"];
+        tdStars.setAttribute("align", "right");
+        tr.appendChild(tdStars);
+
+        var tdForks = document.createElement("td");
+        tdForks.textContent = t["forks"];
+        tdForks.setAttribute("align", "right");
+        tr.appendChild(tdForks);
+
+        var tdLang = document.createElement("td");
+        tdLang.textContent = t["lang"];
+        tr.appendChild(tdLang);
+
+        tbody.appendChild(tr);
+    }
+    // The table is empty when the filter matches nothing; select nothing then.
+    if (tbody.children.length)
+        tbody.children[0].dataset.activeRow = true;
+    main.appendChild(table);
+
+    // Count the rows actually shown: after a filter error the loop stops early.
+    filterstatus.textContent = `Showing ${tbody.children.length} of ${json.length} results (${numFilteredOut} filtered out)`;
+};
+
+update();
+
+// Stores `flag` ("spam", "notSpam" or "" for no mark) for the active row on
+// the server and, on success, mirrors it in the row's data-flag attribute.
+var setFlag = async function (flag)
+{
+    var ar = document.querySelector("[data-active-row]");
+    if (!ar)
+        return;
+    var rowid = ar.dataset.rowid;
+    try
+    {
+        var response = await fetch("/flag", {
+            method: "POST",
+            body: `rowid=${encodeURIComponent(rowid)}&flag=${encodeURIComponent(flag)}`,
+            headers: {"Content-Type": "application/x-www-form-urlencoded"},
+        });
+        if (response.status !== 200)
+            throw new Error(`Request failed with status ${response.status}`);
+        // Unmarked rows carry no data-flag at all, as in update().
+        if (flag)
+            ar.dataset.flag = flag;
+        else
+            delete ar.dataset.flag;
+    }
+    catch (e)
+    {
+        alert(e);
+    }
+};
+
+// Keyboard shortcuts; see the "Controls" list in the README.
+document.onkeydown = function (event)
+{
+    if (event.ctrlKey || event.altKey || event.shiftKey)
+        return;
+
+    // Keys typed into an input (such as the filter box) are not shortcuts.
+    if (event.target.matches("input"))
+        return;
+
+    var ar = document.querySelector("[data-active-row]");
+    var nextOrPrev = null;
+    switch (event.key)
+    {
+        case "ArrowDown":
+            nextOrPrev = "nextElementSibling";
+            // fall through
+        case "ArrowUp":
+            if (!nextOrPrev)
+                nextOrPrev = "previousElementSibling";
+            var target = ar && ar[nextOrPrev];
+            if (target)
+            {
+                target.dataset.activeRow = true;
+                delete ar.dataset.activeRow;
+                event.preventDefault();
+                // Scroll to the newly selected row, not the one just left.
+                target.scrollIntoView({block: "center"});
+            }
+            break;
+        case "a":
+            setFlag("spam");
+            break;
+        case "s":
+            setFlag("");
+            break;
+        case "d":
+            setFlag("notSpam");
+            break;
+        case "w":
+            if (ar)
+                ar.querySelector(".owner-link").click();
+            break;
+        case "e":
+            if (ar)
+                ar.querySelector(".repo-link").click();
+            break;
+        default:
+            return;
+    }
+
+};
+
+// Clicking anywhere in a data row makes it the active row.
+document.onclick = function (event)
+{
+    // Only data rows qualify; the header row has no rowid to act on.
+    var newAr = event.target.closest("tr.table-row");
+    if (!newAr)
+        return;
+    var ar = document.querySelector("[data-active-row]");
+    if (ar)
+        delete ar.dataset.activeRow;
+    newAr.dataset.activeRow = true;
+};