cheese-shop/tools/make-cheeses.py

119 lines
4.8 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
2020-07-04 18:05:06 +02:00
from json import JSONDecodeError
from typing import Any, Awaitable, Dict, Iterable, List
from asks.sessions import Session
2020-07-04 11:04:53 +02:00
from mysql.connector import connect
from trio import Nursery, open_nursery, run
2020-07-04 18:05:06 +02:00
from trove_classifiers import classifiers
2020-07-04 11:04:53 +02:00
# MySQL user name passed to connect().
USER = 'wensleydale'
# MySQL database name passed to connect().
DB = 'cheese_shop'
# Base location handed to the HTTP session; request paths are relative to it.
INDEX = 'https://pypi.org'
# Value passed as the session's ``connections`` argument.
CONNECTIONS = 20
2020-07-05 08:32:46 +02:00
# Project names that main() leaves out when scheduling work.
SKIP = 'botocore', 'ccxt', 'fluidasserts'
2020-07-04 11:04:53 +02:00
class CheeseMaker:
    """Cheese maker for Mr Wensleydale's cheese shop.

    Fetches project metadata as JSON through the given HTTP session
    and stores it in the MySQL database, spawning one child task
    per project and per release in the given nursery.
    """

    def __init__(self, nursery: Nursery, session: Session) -> None:
        """Connect to the database and load the trove classifiers.

        Every known classifier is inserted into the ``troves`` table
        (duplicates are ignored), then the classifier -> id mapping
        is read back into ``self.classifiers``.
        """
        self.nursery, self.session = nursery, session
        self.connection = connect(user=USER, database=DB)
        self.connection.autocommit = True
        self.cursor = self.connection.cursor()
        for classifier in classifiers:
            self.insert('troves', classifier=classifier)
        self.cursor.execute('SELECT classifier, id FROM troves')
        self.classifiers = dict(self.cursor.fetchall())

    def start_soon(self, async_fn: Awaitable, *args: Any) -> None:
        """Create a child task, scheduling ``await async_fn(*args)``.

        ``async_fn`` is the async callable itself, not an awaitable
        obtained by calling it.
        """
        self.nursery.start_soon(async_fn, *args)

    def insert(self, table: str, **kwargs: Any) -> None:
        """Insert one row into the given table, ignoring duplicates.

        Each keyword argument is a column name mapped to its value;
        columns whose value is None are left out of the statement.

        Values are bound as query parameters rather than interpolated
        into the SQL text, because they come from the package index
        and must not be trusted.  The table and column names are still
        formatted into the statement, so they must only ever come from
        this module's own code.
        """
        items = {k: v for k, v in kwargs.items() if v is not None}
        columns = ', '.join(items)
        placeholders = ', '.join(['%s'] * len(items))
        self.cursor.execute(
            f'INSERT IGNORE INTO {table} ({columns}) '
            f'VALUES ({placeholders})',
            tuple(items.values()))

    def insert_info(self, info: Dict[str, Any]) -> int:
        """Insert auxiliary information of the given release.

        Stores the author contact, the release itself, and its
        classifiers, keywords and dependencies.

        Return the release ID.
        """
        self.insert('contacts', name=info['author'],
                    email=info['author_email'])
        self.insert('releases', project=info['name'], version=info['version'],
                    summary=info['summary'], homepage=info['home_page'],
                    email=info['author_email'])
        # NOTE(review): if the release already existed, INSERT IGNORE
        # adds no row and lastrowid may not identify the existing
        # release -- confirm against the schema.
        release_id = self.cursor.lastrowid

        for classifier in (info['classifiers'] or []):
            trove_id = self.classifiers.get(classifier)
            if trove_id is None:
                # A classifier missing from the troves table must not
                # abort the whole run; there is nothing to link it to.
                continue
            self.insert('classifiers', release_id=release_id,
                        trove_id=trove_id)

        for keyword in (info['keywords'] or '').split(','):
            term = keyword.strip()
            # ''.split(',') yields [''], so skip blanks instead of
            # storing an empty term for releases without keywords.
            if term:
                self.insert('keywords', release_id=release_id, term=term)

        for dep in (info['requires_dist'] or []):
            self.insert('dependencies', release_id=release_id, dependency=dep)
        return release_id

    def insert_dist(self, release_id: int,
                    distributions: List[Dict[str, Any]]) -> None:
        """Insert distribution information of the given release."""
        for dist in distributions:
            digests = dist['digests']
            self.insert('distributions', release_id=release_id,
                        filename=dist['filename'], size=dist['size'],
                        url=dist['url'], dist_type=dist['packagetype'],
                        python_version=dist['python_version'],
                        requires_python=dist['requires_python'],
                        sha256=digests['sha256'],
                        md5=digests['md5'])

    async def json(self, path: str) -> Any:
        """Return the JSON response to the given GET request.

        Whatever the response's JSON decoding raises (JSONDecodeError
        for a non-JSON body) propagates to the caller.
        """
        response = await self.session.get(
            path=path, headers={'Accept': 'application/json'})
        return response.json()

    async def culture(self) -> Iterable[str]:
        """Return the project names under ``top_packages`` in /stats."""
        stats = await self.json('/stats')
        return stats['top_packages'].keys()

    async def drain(self, project_name: str, version: str) -> None:
        """Fetch and store metadata of the given release.

        A release whose response is not valid JSON is skipped.
        """
        try:
            content = await self.json(f'/pypi/{project_name}/{version}/json')
        except JSONDecodeError:
            return
        print('Processing', project_name, version)
        release_id = self.insert_info(content['info'])
        self.insert_dist(release_id, content['urls'])

    @staticmethod
    def _is_final_release(version: str) -> bool:
        """Tell if every dot-separated part of version parses as int."""
        for part in version.split('.'):
            try:
                int(part)
            except ValueError:
                return False
        return True

    async def coagulate(self, project_name: str) -> None:
        """Fetch project's available versions and metadata.

        One ``drain`` task is scheduled per version that passes the
        prerelease filter.  A project whose response is not valid JSON
        is skipped, the same way ``drain`` skips such a release, so one
        bad project does not take down every other task.
        """
        try:
            content = await self.json(f'/pypi/{project_name}/json')
        except JSONDecodeError:
            return
        print('Fetching', project_name)
        for version in content['releases']:
            # Recklessly filter out prereleases
            if self._is_final_release(version):
                self.start_soon(self.drain, project_name, version)
2020-07-04 18:05:06 +02:00
async def main():
    """Make cheeses.

    Schedule one ``coagulate`` task per project returned by
    ``culture()``, except those listed in SKIP, and wait for all of
    them (and the tasks they spawn) to finish.
    """
    maker = None
    try:
        async with open_nursery() as nursery:
            maker = CheeseMaker(
                nursery, Session(INDEX, connections=CONNECTIONS))
            for project_name in await maker.culture():
                if project_name not in SKIP:
                    maker.start_soon(maker.coagulate, project_name)
    finally:
        # The nursery block only exits once every child task is done,
        # so the database connection is closed here, after it, rather
        # than being left open until interpreter exit.
        if maker is not None:
            maker.connection.close()
2020-07-04 18:05:06 +02:00
# When executed as a script, hand the async entry point to trio's run().
if __name__ == '__main__': run(main)