Initial commit

This commit is contained in:
Christopher Lane 2023-12-09 20:15:22 -05:00
parent d6adf41905
commit f62ac21333
8 changed files with 267 additions and 0 deletions

71
README.md Normal file
View File

@ -0,0 +1,71 @@
# csschooser
## Video Demo: <URL HERE>
## Description:
An interactive CLI tool for choosing CSS selectors for a web page. Designed for use as a library with BeautifulSoup and Scrapy.
This project uses the [`BeautifulSoup`](https://pypi.org/project/beautifulsoup4/) and [`rich`](https://rich.readthedocs.io/en/stable/index.html) libraries to create an interactive element-selecting experience. It can be run as a program or used as a library.
Created as a final project for the CS50P course.
## Prerequisites
This project was made using Python `3.10.12` and pip `22.0.2`. See `requirements.txt` for module information.
## Installation
### Using Git:
```bash
git clone https://github.com/Makaze/csschooser.git
cd csschooser
pip install -r requirements.txt
```
## Usage
### On the Command Line:
```bash
$ python3 csschooser.py
```
### As A Library:
Example using the `BeautifulSoup` library to print the text from all matching elements:
```py
import csschooser
soup = csschooser.get_soup("http://github.com/Makaze/csschooser") # Example URL
selector = csschooser.interactive_select(soup)
for tag in soup.select(selector):
    print(tag.get_text().strip())
```
## API / Documentation
#### `get_soup(name)`:
> Takes in a string `name` and returns a [`BeautifulSoup`](https://pypi.org/project/beautifulsoup4/) instance based on the contents of the file or URL named `name`. Raises a `FileNotFoundError` if `name` is neither a valid URL nor a valid file name.
#### `get_regex(s)`:
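> Takes in a value `s`, converts it to a string, and returns a Regular Expression pattern as a string for matching the outermost element in `s` (from its first tag through its last tag). If `s` contains no tags, the regex-escaped string is returned; if its first and last tags are identical (for example a single self-closing tag), the string is returned unchanged.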
#### `interactive_select(soup)`:
> Takes in `soup` as a [`BeautifulSoup`](https://pypi.org/project/beautifulsoup4/) instance and prompts the user to enter a CSS selector. Matching elements are highlighted in an auto-scrolling output window. Clears the terminal screen and returns the last chosen selector when the user follows the prompt to exit.
#### `clear(lines)`:
> Takes in an int `lines` (default `1`) and an optional bool `out` (default `True`). If `lines` is `>= 1`, builds a sequence that moves the cursor up one line and erases that line, repeated `lines` times; the sequence is printed when `out` is true and is always returned as a string. Otherwise calls the system's clear terminal command, clearing the terminal screen, then returns `False`.
#### `paginate(console, pretty)`:
> Takes in `console` as a [`rich.Console`](https://rich.readthedocs.io/en/stable/console.html) instance and `pretty` as a string, then passes pretty to the console and sends the rich string to the system's pager utility (`less` for Linux systems).

0
__init__.py Normal file
View File

137
csschooser.py Normal file
View File

@ -0,0 +1,137 @@
from bs4 import BeautifulSoup
import os
import re
import requests
from rich.console import Console
from rich.highlighter import RegexHighlighter
from rich.theme import Theme
import sys
import validators
# Fetch the file or URL as prettyprinted HTML
# Selector editor
# Highlight matching elements
# Tune the selector
# Return selector as output

# Group number that get_regex writes as the backreference of the next pattern
# it builds; get_regex increments it after each pattern and interactive_select
# resets it to 3 after every prompt. It starts at 3 because ClassHighlighter
# wraps the joined patterns in two groups, which makes the first pattern's own
# (\s*) group number 3 in the final regex.
regex_count = 3
def get_soup(name):
    """Build a BeautifulSoup tree from a URL or from a local file.

    Args:
        name: A URL or the path of a local HTML file. ``validators.url``
            decides which of the two it is treated as.

    Returns:
        A ``BeautifulSoup`` instance created with the ``html.parser`` backend.
        For a URL, line endings are normalised, surrounding whitespace is
        stripped and ``<script>`` / ``<style>`` elements are removed before
        parsing. A local file is only stripped of surrounding whitespace.

    Raises:
        FileNotFoundError: If fetching, reading or parsing fails. The
            underlying error is attached as ``__cause__``.
    """
    is_url = validators.url(name)
    try:
        if is_url:
            text = requests.get(name).text
            text = text.replace("\r\n", "\n").replace("\n\r", "\n").strip()
            # Drop script/style elements so they do not clutter the output.
            text = re.sub(r"\<(script|style)[\s\S]*?\<\/\1\>", "", text)
        else:
            with open(name, "r") as f:
                text = f.read().strip()
        return BeautifulSoup(text, "html.parser")
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being reported as a missing file.
        raise FileNotFoundError(
            f"Cannot load {name!r} as a URL or a file"
        ) from err
def clear(lines=1, out=True):
    """Erase terminal output, either line by line or the whole screen.

    Args:
        lines: Number of lines to erase. Any value below 1 clears the whole
            screen with the system command instead.
        out: When true, the escape sequence is also printed; when false it is
            only returned.

    Returns:
        ``False`` after clearing the whole screen, otherwise the escape
        sequence repeated ``lines`` times as a string.
    """
    if lines < 1:
        command = 'cls' if os.name == 'nt' else 'clear'
        os.system(command)
        return False

    # Cursor up one line, then erase that line.
    sequence = '\033[1A' + '\x1b[2K'
    if out:
        for _ in range(lines):
            print(sequence, end="")
    return sequence * lines
def get_regex(s):
    """Return a regex source string that matches the outermost element in ``s``.

    ``s`` is converted with ``str`` first. When its first and last tags
    differ, the result matches optional leading whitespace, the first tag,
    any content, a backreference numbered by the module-level
    ``regex_count`` and the last tag; ``regex_count`` is then incremented.
    When ``s`` has no tags, the regex-escaped text is returned. When the
    first and last tags are identical, the text is returned as is.
    """
    global regex_count
    text = str(s)
    tags = re.findall(r"(<[^\>\<]*?>)", text)
    if tags:
        first_tag = re.escape(tags[0])
        last_tag = re.escape(tags[-1])
    else:
        text = re.escape(text)
        first_tag = last_tag = text

    if first_tag == last_tag:
        return text

    # NOTE(review): Python's re reads a three-octal-digit escape as a
    # character code rather than a group reference, so this backreference may
    # stop working once regex_count reaches 100 -- confirm.
    pattern = r"(\s*){}[\s\S]*?\{}{}".format(first_tag, regex_count, last_tag)
    regex_count += 1
    return pattern
def paginate(console, pretty):
    """Print ``pretty`` through ``console`` while its pager is active.

    Args:
        console: Object providing ``pager(styles=...)`` and ``print``; the
            README describes it as a ``rich`` Console.
        pretty: The text to display.
    """
    pager = console.pager(styles=True)
    with pager:
        console.print(pretty)
def interactive_select(soup):
    """Prompt repeatedly for a CSS selector, redrawing the page each time.

    Args:
        soup: Object providing ``prettify()`` and ``select()``; the README
            describes it as a BeautifulSoup instance.

    Returns:
        The last non-empty selector entered, or an empty string when the
        first answer was already empty.
    """
    global regex_count
    first = True
    full = ""  # most recent answer; an empty answer ends the loop
    finalize = ""
    sel = "null"  # selector handed to the highlighter on the first pass
    theme = Theme({'selector.elements': 'blue', 'code': 'none', 'reverse': 'none'})
    print()
    while first or full:
        # A new Console is built on every pass so that its highlighter is
        # built from the selector that was just entered.
        console = Console(highlighter=ClassHighlighter(soup, sel=sel), theme=theme)
        old_log = "" if first else pretty
        pretty = soup.prettify()
        if len(pretty.split("\n")) > os.get_terminal_size()[1]:
            # More lines than the terminal has rows: show the page in the
            # pager, then clear the whole screen.
            paginate(console, pretty)
            clear(-1)
        elif not first:
            # Erase as many lines as the previous rendering had, plus two.
            clear(len(old_log.split("\n")) + 2)
        # NOTE(review): the nesting of this call could not be confirmed from
        # the flattened source; it is assumed to run on every pass.
        console.print(pretty)
        first = False
        finalize = " [Leave empty to exit]"
        old = full  # remember the selector shown before the new answer
        full = input(f"\nSelector ({full}){finalize}: ")
        sel = full
        # Restart get_regex's numbering before the next highlighter is built.
        regex_count = 3
    clear(-1)
    return old
def main():
    """Ask for a file name or URL, run the selector prompt and print the result.

    Exits with the message "Invalid filename or URL!" when ``get_soup``
    raises ``FileNotFoundError``.
    """
    name = input("Filename or URL: ")
    # Only loading can mean "invalid filename or URL". Errors raised later
    # (for example by the selector prompt) and Ctrl-C are no longer reported
    # under that message.
    try:
        soup = get_soup(name)
    except FileNotFoundError:
        sys.exit("Invalid filename or URL!")
    old = interactive_select(soup)
    print()
    print(f"You chose: {old}")
class ClassHighlighter(RegexHighlighter):
    """Highlight the elements of a soup that match a CSS selector.

    One regex is built per matched element with ``get_regex``. The regexes
    are joined as alternatives inside a named group ``elements``, which
    together with ``base_style`` presumably maps to the ``selector.elements``
    theme entry used by ``interactive_select``.
    """

    base_style = "selector."

    def __init__(self, soup, sel="null"):
        """Build the highlight pattern for the elements ``sel`` matches in ``soup``.

        Args:
            soup: Object providing ``select()``.
            sel: CSS selector string passed to ``soup.select``.
        """
        global regex_count
        # The two groups opened below come before every pattern from
        # get_regex, so the first pattern's own group must be number 3.
        # Resetting here keeps the backreferences correct even if get_regex
        # was called since the last reset.
        regex_count = 3
        matched = soup.select(sel)
        alternatives = "|".join(map(get_regex, matched))
        self.highlights = [r"(?P<elements>(" + alternatives + "))"]
# Start the interactive prompt only when this file is run as a script.
if __name__ == "__main__":
    main()

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 MiB

8
example.py Normal file
View File

@ -0,0 +1,8 @@
# Example: choose a selector interactively, then print the text of every
# element it matches.
import csschooser

soup = csschooser.get_soup("https://google.com")  # Example URL
selector = csschooser.interactive_select(soup)
# interactive_select returns an empty string when the prompt is left empty
# straight away; skip the lookup then instead of passing an empty selector.
if selector:
    for tag in soup.select(selector):
        print(tag.get_text().strip())

5
requirements.txt Normal file
View File

@ -0,0 +1,5 @@
beautifulsoup4==4.10.0
pytest==7.4.0
Requests==2.31.0
rich==13.7.0
validators==0.22.0

2
test.html Normal file
View File

@ -0,0 +1,2 @@
<html><body><p class="hi">Hi!!</p><p>Hello</p><div class="hi">Hello there!<p id="hi"><a href="#hi">Hi!</a></p></div></body></html>

44
test_csschooser.py Normal file
View File

@ -0,0 +1,44 @@
import pytest
from csschooser.csschooser import clear, get_soup, interactive_select, get_regex
# Disable console output
def paginate(c, p):
    """Accept two arguments and do nothing."""
    return None
class Console():
    """Silent console stand-in whose ``print`` discards its argument."""

    def print(self, s):
        """Ignore ``s``."""
        return None
def test_get_regex(monkeypatch):
    """get_regex escapes plain text and numbers its backreferences in order."""
    # get_regex numbers its patterns from a module-level counter. Pin it so
    # the expected 3 and 4 below do not depend on which tests ran earlier.
    monkeypatch.setattr("csschooser.csschooser.regex_count", 3)
    assert get_regex('') == ''
    assert get_regex(None) == 'None'
    assert get_regex('<p>Hi!</p>') == r"(\s*)<p>[\s\S]*?\{}</p>".format(3)
    assert get_regex('<p>Hi! Two</p>') == r"(\s*)<p>[\s\S]*?\{}</p>".format(4)
    assert get_regex('<meta />') == '<meta />'
def test_get_soup():
    """get_soup parses test.html unchanged and rejects a missing file."""
    expected = '<html><body><p class="hi">Hi!!</p><p>Hello</p><div class="hi">Hello there!<p id="hi"><a href="#hi">Hi!</a></p></div></body></html>'
    parsed = get_soup("test.html")
    assert str(parsed) == expected
    with pytest.raises(FileNotFoundError):
        str(get_soup("fake.html"))
def test_interactive_select(monkeypatch):
    """interactive_select returns the last selector entered before the empty one."""
    soup = get_soup("test.html")
    inputs = iter(['p', '', '#hi', ''])
    monkeypatch.setattr('builtins.input', lambda _="": next(inputs))
    monkeypatch.setattr('os.get_terminal_size', lambda _="": (10, 10))
    # Keep the run quiet: replace the pager with a no-op and stop clear()
    # from running the real terminal-clear command.
    monkeypatch.setattr('csschooser.csschooser.paginate', lambda console, pretty: None)
    monkeypatch.setattr('os.system', lambda _="": 0)
    selection = interactive_select(soup)
    assert selection == "p"
    selection = interactive_select(soup)
    assert selection == "#hi"
def test_clear(monkeypatch):
    """clear returns False for a full clear and the escape sequence otherwise."""
    # Prevent the real terminal-clear command from running.
    monkeypatch.setattr('os.system', lambda _="": 0)
    # Identity check: a falsy value such as 0 or "" must not pass.
    assert clear(-1) is False
    assert clear(10) == "\033[1A\x1b[2K" * 10