Initial commit
This commit is contained in:
parent
d6adf41905
commit
f62ac21333
|
@ -0,0 +1,71 @@
|
|||
# csschooser
|
||||
|
||||
## Video Demo: <URL HERE>
|
||||
|
||||
## Description:
|
||||
|
||||
An interactive CLI tool for choosing CSS selectors for a web page. Designed for use as a library with BeautifulSoup and Scrapy.
|
||||
|
||||
This project uses the [`BeautifulSoup`](https://pypi.org/project/beautifulsoup4/) and [`rich`](https://rich.readthedocs.io/en/stable/index.html) libraries to create an interactive element-selecting experience. It can be run as a program or used as a library.
|
||||
|
||||
Created as a final project for the CS50P course.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
This project was made using Python `3.10.12` and pip `22.0.2`. See `requirements.txt` for module information.
|
||||
|
||||
## Installation
|
||||
|
||||
### Using Git:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/Makaze/csschooser.git
|
||||
cd csschooser
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### On the Command Line:
|
||||
|
||||
```bash
|
||||
$ python3 csschooser.py
|
||||
```
|
||||
|
||||
### As A Library:
|
||||
|
||||
Example using the `BeautifulSoup` library to print the text from all matching elements:
|
||||
|
||||
```py
|
||||
import csschooser
|
||||
|
||||
soup = csschooser.get_soup("http://github.com/Makaze/csschooser") # Example URL
|
||||
|
||||
selector = csschooser.interactive_select(soup)
|
||||
|
||||
for tag in soup.select(selector):
|
||||
print(tag.get_text().strip())
|
||||
```
|
||||
|
||||
## API / Documentation
|
||||
|
||||
#### `get_soup(name)`:
|
||||
|
||||
> Takes in a string `name` and returns a [`BeautifulSoup`](https://pypi.org/project/beautifulsoup4/) instance based on the contents of the file or URL named `name`. Raises a `FileNotFoundError` if `name` is neither a valid URL nor a valid file name.
|
||||
|
||||
#### `get_regex(s)`:
|
||||
|
||||
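> Takes in a value `s`, converts it to a string, and returns a Regular Expression pattern as a string for matching the outermost element in `s`. If `s` contains only a single tag, it is returned unchanged; if it contains no tags at all, it is returned with regex special characters escaped.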
|
||||
|
||||
#### `interactive_select(soup)`:
|
||||
|
||||
> Takes in `soup` as a [`BeautifulSoup`](https://pypi.org/project/beautifulsoup4/) instance and prompts the user to enter a CSS selector. Matching elements are highlighted in an auto-scrolling output window. Clears the terminal screen and returns the last chosen selector when the user follows the prompt to exit.
|
||||
|
||||
#### `clear(lines)`:
|
||||
|
||||
> Takes in an int `lines`. If `lines` is `>= 1`, builds the escape sequence that moves the cursor up one line and erases that line, repeated `lines` times, prints it unless the optional `out` argument is false, and returns the sequence as a string. Otherwise calls the system's clear terminal command, clearing the terminal screen, then returns `False`.
|
||||
|
||||
|
||||
#### `paginate(console, pretty)`:
|
||||
|
||||
> Takes in `console` as a [`rich.Console`](https://rich.readthedocs.io/en/stable/console.html) instance and `pretty` as a string, then passes `pretty` to the console and sends the rich string to the system's pager utility (`less` for Linux systems).
|
|
@ -0,0 +1,137 @@
|
|||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
from rich.console import Console
|
||||
from rich.highlighter import RegexHighlighter
|
||||
from rich.theme import Theme
|
||||
import sys
|
||||
import validators
|
||||
|
||||
# Fetch the file or URL as prettyprinted HTML
|
||||
# Selector editor
|
||||
# Highlight matching elements
|
||||
# Tune the selector
|
||||
# Return selector as output
|
||||
|
||||
# Number of the next capture group that get_regex() will reference. It starts
# at 3 because ClassHighlighter wraps the joined patterns in two enclosing
# groups (the named "elements" group and a plain one), which come first.
regex_count = 3
|
||||
|
||||
|
||||
def get_soup(name):
    """Parse the HTML behind ``name`` into a ``BeautifulSoup`` tree.

    ``name`` is treated as a URL when ``validators.url`` accepts it and as a
    local file path otherwise. For URLs the page is downloaded, line endings
    are normalised, surrounding whitespace is stripped and ``<script>`` /
    ``<style>`` elements are removed before parsing. Local files are parsed
    as read, minus surrounding whitespace.

    Args:
        name: URL or path of the HTML document to load.

    Returns:
        A ``BeautifulSoup`` instance built with the ``html.parser`` backend.

    Raises:
        FileNotFoundError: If the document cannot be downloaded, read or
            parsed. The underlying error is attached as ``__cause__``.
    """
    is_url = validators.url(name)

    try:
        if is_url:
            text = requests.get(name).text
            text = text.replace("\r\n", "\n").replace("\n\r", "\n").strip()
            markup = re.sub(r"\<(script|style)[\s\S]*?\<\/\1\>", "", text)
        else:
            with open(name, "r") as f:
                markup = f.read().strip()

        return BeautifulSoup(markup, "html.parser")
    except Exception as err:
        # Callers rely on a single exception type for "could not load", but
        # KeyboardInterrupt/SystemExit must not be converted, and the real
        # cause is kept for debugging.
        raise FileNotFoundError(name) from err
|
||||
|
||||
|
||||
def clear(lines=1, out=True):
    """Erase terminal output: either the last ``lines`` lines or the whole screen.

    Args:
        lines: Number of lines to erase, moving upwards one line at a time.
            Any value below 1 clears the whole screen instead.
        out: When true, each escape pair is also printed as it is produced.

    Returns:
        ``False`` after a full-screen clear; otherwise the string made of
        ``lines`` repetitions of the cursor-up + erase-line escape pair.
    """
    if lines < 1:
        # No line count to walk back over: use the platform's clear command.
        command = 'cls' if os.name == 'nt' else 'clear'
        os.system(command)
        return False

    cursor_up = '\033[1A'
    erase_line = '\x1b[2K'

    if out:
        for _ in range(lines):
            print(cursor_up, end=erase_line)

    return (cursor_up + erase_line) * lines
|
||||
|
||||
|
||||
def get_regex(s):
    """Build a regex source string that matches ``s`` in prettified markup.

    ``s`` is converted with ``str()`` and scanned for ``<...>`` tags. When
    the first and last tags differ (an opening/closing pair), the result
    matches optional leading whitespace, the opening tag, the shortest run
    of any characters, a numbered backreference, and the closing tag. The
    backreference number is the module-level ``regex_count``, which is
    incremented after every such pattern.

    Args:
        s: Object whose string form is inspected.

    Returns:
        The pattern described above for a tag pair; ``str(s)`` unchanged
        when the first and last tags are identical (e.g. a lone tag); the
        ``re.escape``-d text when no tag is present.
    """
    global regex_count

    s = str(s)
    tags = re.findall(r"(<[^\>\<]*?>)", s)

    if not tags:
        # Plain text: nothing to pair up, only escaping is needed.
        return re.escape(s)

    opening = re.escape(tags[0])
    closing = re.escape(tags[-1])
    if opening == closing:
        return s

    pattern = r"(\s*){}[\s\S]*?\{}{}".format(opening, regex_count, closing)
    regex_count += 1
    return pattern
|
||||
|
||||
|
||||
def paginate(console, pretty):
    """Show ``pretty`` through the console's pager with styles enabled.

    Args:
        console: Object providing ``pager()`` and ``print()``.
        pretty: Text to display.
    """
    pager = console.pager(styles=True)
    with pager:
        console.print(pretty)
|
||||
|
||||
|
||||
def interactive_select(soup):
    """Prompt for CSS selectors in a loop, re-rendering ``soup`` each time.

    Args:
        soup: Parsed document; it is prettified for display and handed to
            ``ClassHighlighter`` together with the current selector.

    Returns:
        The selector entered just before the empty input that ended the
        loop, or ``""`` when the very first input was empty.
    """
    global regex_count

    first = True
    full = ""
    finalize = ""
    # Placeholder selector used for the first render, before any user input.
    sel = "null"
    theme = Theme({'selector.elements': 'blue', 'code': 'none', 'reverse': 'none'})

    print()

    # Runs at least once, then repeats until an empty selector is entered.
    while first or full:
        # A fresh highlighter is built for the current selector on every pass.
        console = Console(highlighter=ClassHighlighter(soup, sel=sel), theme=theme)
        # Previous render; its line count tells clear() how much to erase.
        old_log = "" if first else pretty
        pretty = soup.prettify()

        # Output taller than the terminal goes through the pager first.
        if len(pretty.split("\n")) > os.get_terminal_size()[1]:
            paginate(console, pretty)
            clear(-1)
        elif not first:
            # NOTE(review): the +2 presumably covers the blank line and the
            # prompt line written by input() below - confirm.
            clear(len(old_log.split("\n")) + 2)

        console.print(pretty)

        first = False
        finalize = " [Leave empty to exit]"
        # Remember the selector currently shown; it is what gets returned
        # when the next input is empty.
        old = full
        full = input(f"\nSelector ({full}){finalize}: ")
        sel = full
        # Restart group numbering for the patterns built on the next pass.
        regex_count = 3

    clear(-1)
    return old
|
||||
|
||||
|
||||
def main():
    """Run the interactive selector chooser from the command line.

    Prompts for a filename or URL, loads it, lets the user pick a selector
    interactively and prints the final choice.

    Raises:
        SystemExit: With the message ``Invalid filename or URL!`` when the
            document cannot be loaded.
    """
    name = input("Filename or URL: ")

    # Only a failed load is reported as an invalid name; errors raised later
    # in the interactive session are no longer mislabelled.
    try:
        soup = get_soup(name)
    except FileNotFoundError:
        sys.exit("Invalid filename or URL!")

    old = interactive_select(soup)

    print()
    print(f"You chose: {old}")
|
||||
|
||||
class ClassHighlighter(RegexHighlighter):
    """Highlight the markup of every element matching a CSS selector.

    The matching elements are looked up once, at construction time, and
    turned into a single alternation pattern exposed through the named
    group ``elements``.
    """

    # Prefix combined with the group name to form the style "selector.elements".
    base_style = "selector."

    def __init__(self, soup, sel="null"):
        """Build the highlight pattern for ``sel``.

        Args:
            soup: Parsed document providing ``select()``.
            sel: CSS selector whose matches should be highlighted.
        """
        super().__init__()
        matches = soup.select(sel)
        # Two groups wrap the alternatives (the named one below and this
        # plain one); get_regex() numbers its own groups after them.
        regex = "(" + "|".join(map(get_regex, matches)) + ")"

        self.highlights = [r"(?P<elements>" + regex + ")"]
|
||||
|
||||
|
||||
# Start the interactive prompt only when run as a script, not on import.
if __name__ == "__main__":
    main()
|
Binary file not shown.
After Width: | Height: | Size: 1.3 MiB |
|
@ -0,0 +1,8 @@
|
|||
import csschooser

# Fetch and parse the page to inspect.
soup = csschooser.get_soup("https://google.com") # Example URL

# Let the user pick a CSS selector interactively.
selector = csschooser.interactive_select(soup)

# Print the stripped text of every element matching the chosen selector.
for tag in soup.select(selector):
    print(tag.get_text().strip())
|
|
@ -0,0 +1,5 @@
|
|||
beautifulsoup4==4.10.0
|
||||
pytest==7.4.0
|
||||
Requests==2.31.0
|
||||
rich==13.7.0
|
||||
validators==0.22.0
|
|
@ -0,0 +1,2 @@
|
|||
<html><body><p class="hi">Hi!!</p><p>Hello</p><div class="hi">Hello there!<p id="hi"><a href="#hi">Hi!</a></p></div></body></html>
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
import pytest
|
||||
|
||||
from csschooser.csschooser import clear, get_soup, interactive_select, get_regex
|
||||
|
||||
# Disable console output
|
||||
def paginate(c, p):
    """Do-nothing stand-in that accepts two arguments and returns ``None``."""
    return None
|
||||
|
||||
class Console:
    """Minimal console stand-in whose ``print`` discards its argument."""

    def print(self, s):
        """Accept ``s`` and do nothing with it."""
        return None
|
||||
|
||||
def test_get_regex():
    """Check get_regex() on empty, non-tag, paired-tag and single-tag input."""
    assert get_regex('') == ''
    assert get_regex(None) == 'None'
    # Paired tags consume the running group counter: the expectations below
    # assume it is still at its initial value of 3 when this test starts.
    assert get_regex('<p>Hi!</p>') == r"(\s*)<p>[\s\S]*?\{}</p>".format(3)
    assert get_regex('<p>Hi! Two</p>') == r"(\s*)<p>[\s\S]*?\{}</p>".format(4)
    # A lone tag comes back unchanged.
    assert get_regex('<meta />') == '<meta />'
|
||||
|
||||
|
||||
def test_get_soup():
    """Check that a local file round-trips and a missing file raises."""
    # NOTE(review): "test.html" is a relative path, so this presumably has to
    # run from the directory holding the fixture - confirm.
    soup = str(get_soup("test.html"))
    assert soup == '<html><body><p class="hi">Hi!!</p><p>Hello</p><div class="hi">Hello there!<p id="hi"><a href="#hi">Hi!</a></p></div></body></html>'

    with pytest.raises(FileNotFoundError):
        soup = str(get_soup("fake.html"))
|
||||
|
||||
def test_interactive_select(monkeypatch):
    """Check that interactive_select() returns the last non-empty selector."""
    # NOTE(review): this global is never read or assigned in this function.
    global regex_count
    soup = get_soup("test.html")

    # Scripted answers for two sessions: each selector is followed by an
    # empty line, which ends that session.
    inputs = iter(['p', '', '#hi', ''])
    monkeypatch.setattr('builtins.input', lambda _="": next(inputs))
    # Report a fixed (10, 10) terminal size instead of querying a real one.
    monkeypatch.setattr('os.get_terminal_size', lambda _="": (10, 10))
    selection = interactive_select(soup)
    assert selection == "p"

    selection = interactive_select(soup)
    assert selection == "#hi"
|
||||
|
||||
def test_clear(monkeypatch):
    """Check both return modes of clear() without running a shell command."""
    # Stub out os.system so the full-screen branch has no side effect.
    monkeypatch.setattr('os.system', lambda _="": 0)
    # Identity check: the full-screen branch must return the bool False,
    # not merely something equal to it such as 0.
    assert clear(-1) is False
    assert clear(10) == "\033[1A\x1b[2K" * 10
|
Loading…
Reference in New Issue