forked from Ag/disspam
1
0
Fork 0

initial commit

This commit is contained in:
Ag 2024-01-16 16:23:31 +00:00
commit 9c0cc51be2
6 changed files with 493 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
*.db
*.html
*.json

56
README.md Normal file
View File

@ -0,0 +1,56 @@
# disspam
## Scripts
### discrawl.py
Crawls the explore tab to list public repos.
### disimport.py
Parses the HTML downloaded by discrawl.py and saves it in an SQLite database.
### disserve.py
Serves a web interface for tagging repos as good or bad. This uses the
`disrepos.db` database file created by disimport.py.
Run: `flask --app disserve --debug run` (tested with Flask 2.2.2)
Display:
Repos that are marked good or bad have their first column change the background
color to green or red.
Repo owners with at least one bad repo are shown with a red background color on
all of their repos. Note that this isn't applied immediately after marking one
of their repos; the table has to be updated (with the "Update" button) first.
The currently selected row has a yellow background color. The selection is used
for keyboard shortcuts that operate on a row.
The row currently being hovered with the mouse has a light pink background color
to make the table easier to read.
Controls:
- W: Open repo owner in new tab
- E: Open repo itself in new tab
- A: Mark as bad
- S: Remove mark
- D: Mark as good
- Up/Down: Select previous/next row
- Click: Select clicked row
The "Filter" box takes an expression to filter the table of shown repos.
Expressions can use the columns shown in the table. For example,
`!stars && !forks && desc` would show only repos with no forks or stars and a
non-empty description.
To match repos marked a specific way, use `flag === "notSpam"` for good repos,
`flag === "spam"` for bad repos and `!flag` for unmarked repos.
Feature wishlist:
- Multiple selection
- Hide mirrors

43
discrawl.py Normal file
View File

@ -0,0 +1,43 @@
import os
import json
import time
import requests
import contextlib
from typing import Dict
# Download the explore listing page by page. Each page is stored as
# `disroot_repos_<page>.html` next to a `disroot_repos_<page>_head.json` file
# holding the request URL, status code and response headers. Pages whose HTML
# file already exists are skipped, so an interrupted run can simply be
# restarted.
with contextlib.closing(requests.Session()) as session:
    for page in range(1, 245+1):
        html_path = f'disroot_repos_{page}.html'
        head_path = f'disroot_repos_{page}_head.json'
        if os.path.exists(html_path):
            continue
        print(f'get page {page}')
        url = f'https://git.disroot.org/explore/repos?page={page}&sort=oldest&q=&topic=false&language=&only_show_relevant=false'
        # Without a timeout, requests waits forever on a stalled connection
        # and the whole crawl would hang.
        response = session.get(url, timeout=60)
        # The metadata is written before the status check so that failed
        # requests leave a record on disk as well.
        with open(head_path, 'w') as f:
            json.dump({
                'url': url,
                'status': response.status_code,
                'headers': list(response.headers.items()),
            }, f)
        try:
            response.raise_for_status()
        except requests.HTTPError:
            # Print the details, then abort the crawl by re-raising.
            print(f'Error fetching URL "{url}"!')
            print(f' Status: {response.status_code}')
            print(' Headers:')
            for k, v in response.headers.items():
                print(f' {k}: {v}')
            raise
        with open(html_path, 'w') as f:
            f.write(response.text)
        # Pause between requests to avoid hammering the server.
        time.sleep(2)

76
disimport.py Normal file
View File

@ -0,0 +1,76 @@
import os
import json
import time
import sqlite3
import requests
import contextlib
from bs4 import BeautifulSoup
from typing import Dict
# Pages 1..LAST_PAGE are imported. Every page except the last must hold exactly
# REPOS_PER_PAGE entries; the last one may hold fewer (but not zero).
# NOTE(review): discrawl.py fetches pages up to 245 while only 244 are imported
# here -- confirm which bound is intended.
LAST_PAGE = 244
REPOS_PER_PAGE = 20

with (
    contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
    contextlib.closing(db.cursor()) as cursor,
):
    # The table is rebuilt from scratch on every run, which also discards any
    # `flag` values stored in it. IF EXISTS lets the very first run (no table
    # yet) succeed without silencing unrelated database errors.
    cursor.execute('''
        DROP TABLE IF EXISTS `repos`
    ''')
    cursor.execute('''
        CREATE TABLE `repos` (
            `owner` TEXT NOT NULL CHECK(`owner`<>''),
            `name` TEXT NOT NULL CHECK(`name`<>''),
            `stars` INTEGER NOT NULL CHECK(`stars`>=0),
            `forks` INTEGER NOT NULL CHECK(`forks`>=0),
            `desc` TEXT CHECK(`desc`<>''),
            `lang` TEXT CHECK(`lang`<>''),
            `flag` TEXT CHECK(`flag`<>'')
        ) STRICT
    ''')
    for page in range(1, LAST_PAGE+1):
        with open(f'disroot_repos_{page}.html', 'r') as f:
            doc_text = f.read()
        bs = BeautifulSoup(doc_text, 'html.parser')

        # Sanity-check the page layout: one list with the expected item count.
        flex_lists = bs.select('.flex-list')
        assert len(flex_lists) == 1
        flex_items = flex_lists[0].select(':scope > .flex-item')
        if page == LAST_PAGE:
            assert flex_items and len(flex_items) <= REPOS_PER_PAGE
        else:
            assert len(flex_items) == REPOS_PER_PAGE

        for flex_item in flex_items:
            flex_item_mains = flex_item.select(':scope > .flex-item-main')
            assert len(flex_item_mains) == 1
            row = flex_item_mains[0]

            url = row.select(':scope > .flex-item-header > .flex-item-title > a')[0]['href']

            # NOTE(review): a description is only taken when there are exactly
            # two `.flex-item-body` children -- presumably the other one is a
            # non-description line that is always present; confirm against the
            # page markup. The description is stored as inner HTML.
            body_els = row.select(':scope > .flex-item-body')
            desc = body_els[0].decode_contents().strip() if len(body_els) == 2 else None
            desc = desc or None  # no empty strings in the db

            lang_el = row.select(':scope > .flex-item-header > .flex-item-trailing > a.muted[href*="&language="]')
            lang = lang_el[0].text.strip() if lang_el else None

            stars = int(row.select(':scope > .flex-item-header > .flex-item-trailing > a.text.grey.flex-text-inline[href$="/stars"]')[0].text)
            forks = int(row.select(':scope > .flex-item-header > .flex-item-trailing > a.text.grey.flex-text-inline[href$="/forks"]')[0].text)

            # The link looks like "/<owner>/<name>": drop the empty piece in
            # front of the leading slash and keep the next two.
            url_parts = url.split('/')[1:]
            owner = url_parts[0]
            name = url_parts[1]

            print(owner, name, stars, forks, desc, lang)
            # `flag` is not set here, so it starts out as NULL (unmarked).
            cursor.execute('''
                INSERT INTO `repos` (`owner`, `name`, `stars`, `forks`, `desc`, `lang`)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (owner, name, stars, forks, desc, lang))

108
disserve.py Normal file
View File

@ -0,0 +1,108 @@
import json
import sqlite3
import contextlib
from flask import Flask, Response, request
# Flask application object; the @app.route decorators below register the
# request handlers on it.
app = Flask(__name__)
@app.route('/main.js')
def main_js():
    """Serve the contents of `disstuff.js` as the page's JavaScript."""
    with open('disstuff.js', 'r') as script_file:
        script_source = script_file.read()
    return Response(script_source, mimetype='application/javascript')
@app.route('/repos.json')
def repos():
    """Return all rows of the `repos` table as a JSON array of objects.

    Each object has the keys `rowid`, `owner`, `name`, `stars`, `forks`,
    `desc`, `lang` and `flag`; rows are ordered by ascending `rowid`.
    """
    # Must list the same columns, in the same order, as the SELECT below.
    keys = ('rowid', 'owner', 'name', 'stars', 'forks', 'desc', 'lang', 'flag')
    with (
        contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
        contextlib.closing(db.cursor()) as cursor,
    ):
        cursor.execute('''
            SELECT `rowid`, `owner`, `name`, `stars`, `forks`, `desc`, `lang`, `flag`
            FROM `repos`
            ORDER BY `rowid` ASC
        ''')
        # Serialise while the cursor is still open; it is consumed lazily.
        body = json.dumps([dict(zip(keys, row)) for row in cursor])
    # application/json is the registered media type for JSON.
    return Response(body, mimetype='application/json')
@app.route('/badusers.json')
def badusers():
    """Return a JSON array of owners having at least one repo flagged 'spam'."""
    with (
        contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
        contextlib.closing(db.cursor()) as cursor,
    ):
        # Selecting the distinct owners of spam rows directly yields the same
        # set of names as filtering all rows through an IN (subquery).
        cursor.execute('''
            SELECT DISTINCT `owner`
            FROM `repos`
            WHERE `flag`='spam'
        ''')
        # Serialise while the cursor is still open; it is consumed lazily.
        body = json.dumps([row[0] for row in cursor])
    # application/json is the registered media type for JSON.
    return Response(body, mimetype='application/json')
@app.route('/flag', methods=['POST'])
def flag():
    """Set or clear the `flag` column of a single repo.

    Form fields:
        rowid: integer rowid of the repo to update.
        flag:  new flag value; an empty string clears the flag (stored as
               NULL, since the table rejects empty strings).

    Returns an empty 200 response on success, 400 when `rowid` is not an
    integer and 404 when no row has that rowid.
    """
    try:
        rowid = int(request.form['rowid'])
    except ValueError:
        return Response('rowid must be an integer', status=400, mimetype='text/plain')
    new_flag = request.form['flag'] or None

    with (
        contextlib.closing(sqlite3.connect('disrepos.db', isolation_level=None)) as db,
        contextlib.closing(db.cursor()) as cursor,
    ):
        cursor.execute('''
            UPDATE `repos`
            SET `flag`=?
            WHERE `rowid`=?
        ''', (new_flag, rowid))
        updated = cursor.rowcount

    # An explicit check instead of `assert`: assertions are stripped under -O
    # and would otherwise surface as an opaque 500.
    if updated != 1:
        return Response(f'no repo with rowid {rowid}', status=404, mimetype='text/plain')
    return ''
@app.route('/')
def index():
    """Return the single-page UI: styles, the filter form and the script tag.

    The table itself is built client-side by `/main.js` inside `#main`.
    """
    # Backslashes meant for the JavaScript regex in the datalist are written
    # as `\\` so the literal has no invalid escape sequences (`\.` triggers a
    # SyntaxWarning on recent Python versions). The emitted HTML is unchanged.
    return '''
<!doctype html>
<style>
html {
font-size: smaller;
background: gainsboro;
}
form {
display: inline-block;
}
th {
position: sticky;
top: 0;
background: gainsboro; /* sorry dark theme users */
}
tr:hover { background-color: mistyrose; }
tr[data-active-row] {
background-color: yellow;
}
[data-flag=spam] td:first-of-type { background: red; }
td[data-bad-user] { background: red; }
[data-flag=notSpam] td:first-of-type { background: green; }
</style>
Filter: <form onsubmit="update(); return false"><input list="filters" type="text" id="filter" style="width: 400px;"> <input type="submit" value="Update"></form> <span id="filterstatus"></span>
<datalist id="filters">
<option value="!stars&&!forks&&!lang&&desc"></option>
<option value="desc&&!lang"></option>
<option value="desc&&/viagra/i.test(desc)"></option>
<option value="desc&&/&lt;a href=&quot;(?![^&quot;]*(github\\.com|\\bgit\\.))/i.test(desc)"></option>
<option value="desc&&/buy|cashapp|erectile|escort|essay|support|viagra|wholesale/i.test(desc)"></option>
</datalist>
<hr>
<div id="main"></div>
<script src="/main.js"></script>
'''

207
disstuff.js Normal file
View File

@ -0,0 +1,207 @@
// Rebuilds the repo table inside #main.
//
// Fetches the list of bad owners and the full repo list, applies the
// expression from the #filter box (if any) to every repo, renders the matching
// rows, selects the first one and reports the counts in #filterstatus.
var update = async function ()
{
    filterstatus.textContent = "Working...";

    var badUsersResponse = await fetch("badusers.json");
    // A Set rather than a plain object: looking up an owner called e.g.
    // "constructor" in an object would hit Object.prototype and be truthy.
    var badUsers = new Set(await badUsersResponse.json());

    var reposResponse = await fetch("repos.json");
    var repos = await reposResponse.json();
    console.log(repos.length);

    main.innerHTML = "";

    // The filter is a JavaScript expression evaluated with the repo's fields
    // (owner, name, stars, ...) in scope via `with`.
    var filterFn = null;
    if (filter.value.trim())
    {
        filterFn = Function("with (arguments[1]) return eval(arguments[0]);").bind(null, filter.value);
    }

    var table = document.createElement("table");
    table.setAttribute("border", "");
    var thead = document.createElement("thead");
    thead.innerHTML = `
<th>rowid</th>
<th>owner</th>
<th>name</th>
<th>desc</th>
<th>stars</th>
<th>forks</th>
<th>lang</th>
`;
    table.appendChild(thead);
    var tbody = document.createElement("tbody");
    table.appendChild(tbody);

    var numFilteredOut = 0;
    var filterError = null;
    for (var t of repos)
    {
        if (filterFn)
        {
            try
            {
                if (!filterFn(t))
                {
                    numFilteredOut++;
                    continue;
                }
            }
            catch (e)
            {
                // want to catch syntax errors
                alert(e);
                filterError = e;
                break;
            }
        }

        var tr = document.createElement("tr");
        tr.setAttribute("class", "table-row");
        if (t["flag"])
            tr.setAttribute("data-flag", t["flag"]);
        tr.setAttribute("data-rowid", t["rowid"]);

        var tdRowid = document.createElement("td");
        tdRowid.textContent = t["rowid"];
        tdRowid.setAttribute("align", "right");
        tr.appendChild(tdRowid);

        var tdOwner = document.createElement("td");
        if (badUsers.has(t["owner"]))
            tdOwner.dataset.badUser = true;
        var ownerLink = document.createElement("a");
        ownerLink.className = "owner-link";
        ownerLink.textContent = t["owner"];
        ownerLink.href = `https://git.disroot.org/${t["owner"]}`;
        ownerLink.target = "_blank";
        tdOwner.appendChild(ownerLink);
        tr.appendChild(tdOwner);

        var tdName = document.createElement("td");
        var repoLink = document.createElement("a");
        repoLink.className = "repo-link";
        repoLink.textContent = t["name"];
        repoLink.href = `https://git.disroot.org/${t["owner"]}/${t["name"]}`;
        repoLink.target = "_blank";
        tdName.appendChild(repoLink);
        tr.appendChild(tdName);

        var tdDesc = document.createElement("td");
        // NOTE(review): the description is inserted as HTML, so markup in it
        // is rendered (and any script-capable markup would run). This assumes
        // the stored descriptions are already sanitized -- confirm, or switch
        // to textContent.
        tdDesc.innerHTML = t["desc"];
        tr.appendChild(tdDesc);

        var tdStars = document.createElement("td");
        tdStars.textContent = t["stars"];
        tdStars.setAttribute("align", "right");
        tr.appendChild(tdStars);

        var tdForks = document.createElement("td");
        tdForks.textContent = t["forks"];
        tdForks.setAttribute("align", "right");
        tr.appendChild(tdForks);

        var tdLang = document.createElement("td");
        tdLang.textContent = t["lang"];
        tr.appendChild(tdLang);

        tbody.appendChild(tr);
    }

    // The filter may leave no rows at all; only select a row if there is one.
    if (tbody.firstElementChild)
        tbody.firstElementChild.dataset.activeRow = true;
    main.appendChild(table);

    if (filterError)
    {
        // The loop stopped early, so the usual counts would be misleading.
        filterstatus.textContent = `Filter error: ${filterError}`;
    }
    else
    {
        filterstatus.textContent = `Showing ${repos.length-numFilteredOut} of ${repos.length} results (${numFilteredOut} filtered out)`;
    }
};
// Build the table once when the script loads.
update();
// Stores `flag` ("spam", "notSpam" or "" to clear) for the currently selected
// row via POST /flag and, on success, mirrors it on the row's data-flag
// attribute. Failures are reported with alert(). Does nothing when no row is
// selected.
var setFlag = async function (flag)
{
    var ar = document.querySelector("[data-active-row]");
    if (!ar)
        return;
    // Declared locally; without `var` this would leak as an implicit global.
    var rowid = ar.dataset.rowid;
    try
    {
        var response = await fetch("/flag", {
            method: "POST",
            body: `rowid=${encodeURIComponent(rowid)}&flag=${encodeURIComponent(flag)}`,
            headers: {"Content-Type": "application/x-www-form-urlencoded"},
        });
        if (response.status !== 200)
            throw new Error(`Request failed with status ${response.status}`);
        // Match how update() renders rows: unmarked rows carry no data-flag
        // attribute at all rather than an empty one.
        if (flag)
            ar.dataset.flag = flag;
        else
            delete ar.dataset.flag;
    }
    catch (e)
    {
        alert(e);
    }
};
// Keyboard shortcuts acting on the selected row:
//   ArrowUp/ArrowDown  move the selection
//   a / s / d          mark as spam / remove mark / mark as not spam
//   w / e              open the owner / the repo link
document.onkeydown = function (event)
{
    // Leave browser and OS shortcuts (Ctrl+A, Cmd+W, ...) alone.
    if (event.ctrlKey || event.altKey || event.shiftKey || event.metaKey)
        return;
    // Don't hijack typing in the filter box.
    if (event.target.matches("input"))
        return;

    var ar = document.querySelector("[data-active-row]");
    switch (event.key)
    {
        case "ArrowDown":
        case "ArrowUp":
            var nextOrPrev = event.key === "ArrowDown"
                ? "nextElementSibling"
                : "previousElementSibling";
            var target = ar && ar[nextOrPrev];
            if (target)
            {
                target.dataset.activeRow = true;
                delete ar.dataset.activeRow;
                // Replace the default page scroll by centering the row that
                // has just been selected.
                event.preventDefault();
                target.scrollIntoView({block: "center"});
            }
            break;
        case "a":
            setFlag("spam");
            break;
        case "s":
            setFlag("");
            break;
        case "d":
            setFlag("notSpam");
            break;
        case "w":
            if (ar)
                ar.querySelector(".owner-link").click();
            break;
        case "e":
            if (ar)
                ar.querySelector(".repo-link").click();
            break;
        default:
            return;
    }
};
// Clicking anywhere inside a table row makes that row the selected one.
document.onclick = function (event)
{
    // closest() yields the row itself or its nearest <tr> ancestor, or null
    // when the click landed outside of any row.
    var clickedRow = event.target.closest("tr");
    if (!clickedRow)
        return;

    var previousRow = document.querySelector("[data-active-row]");
    if (previousRow)
        delete previousRow.dataset.activeRow;
    clickedRow.dataset.activeRow = true;
};