# forked from Ag/disspam
# 1
# 0
# Fork 0
# disspam/discrawl.py
#
# 44 lines
# 1.2 KiB
# Python

import os
import json
import time
import requests
import contextlib
from typing import Dict
# Crawl the paginated "explore repos" listing of git.disroot.org, saving each
# page's HTML plus a small JSON record of the response (URL, status, headers).
# The crawl is resumable: a page whose HTML file already exists is skipped.

LAST_PAGE = 245        # listing pages are fetched for 1..LAST_PAGE inclusive
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection hangs forever
CRAWL_DELAY = 2        # seconds to pause after each fetched page

# requests.Session is itself a context manager, so no contextlib.closing needed.
with requests.Session() as session:
    for page in range(1, LAST_PAGE + 1):
        file = f'disroot_repos_{page}.html'
        hfile = f'disroot_repos_{page}_head.json'

        # The HTML file doubles as the "already fetched" marker.
        if os.path.exists(file):
            continue

        print(f'get page {page}')
        url = f'https://git.disroot.org/explore/repos?page={page}&sort=oldest&q=&topic=false&language=&only_show_relevant=false'
        response = session.get(url, timeout=REQUEST_TIMEOUT)

        # Record the response metadata before checking the status so that
        # failed requests leave a trace on disk as well.
        with open(hfile, 'w', encoding='utf-8') as f:
            json.dump({
                'url': url,
                'status': response.status_code,
                'headers': list(response.headers.items()),
            }, f)

        try:
            response.raise_for_status()
        except requests.HTTPError:
            # Print diagnostics, then abort the crawl by re-raising.
            print(f'Error fetching URL "{url}"!')
            print(f' Status: {response.status_code}')
            print(' Headers:')
            for k, v in response.headers.items():
                print(f' {k}: {v}')
            raise

        # Explicit encoding: the platform default may be unable to encode
        # characters present in the decoded page text.
        with open(file, 'w', encoding='utf-8') as f:
            f.write(response.text)

        # Pause between requests; skipped pages do not reach this line.
        time.sleep(CRAWL_DELAY)