diff --git a/.github/workflows/automated-updates.yml b/.github/workflows/automated-updates.yml index 16aab69..386fe3e 100644 --- a/.github/workflows/automated-updates.yml +++ b/.github/workflows/automated-updates.yml @@ -17,8 +17,7 @@ jobs: python-version: 3.8 - name: Update categories run: | - cd scripts - python categories.py + python3 convert.py categories - name: Commit categories run: | git config --global user.name 'Nick Spaargaren' @@ -27,16 +26,14 @@ jobs: git push - name: Update AdGuard list run: | - cd scripts - python adguard.py + python3 convert.py adguard - name: Commit AdGuard list run: | git add . && git commit -am "Automated AdGuard list update" git push - name: Update parsed list run: | - cd scripts - python parsed.py + python3 convert.py pihole - name: Commit parsed list run: | git add . && git commit -am "Automated parsed list update" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a09c56d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/.idea diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fd6654b..c1ea0dd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,12 +1,21 @@ ## Submitting a domain -- Pull request will be made to our [develop branch](https://github.com/nickspaargaren/no-google/tree/develop) -- Changes will be only be made in our main [pihole-google.txt](https://github.com/nickspaargaren/no-google/blob/master/pihole-google.txt) file. To submit a new domain, please check if it is not already present. -- The file is divided in categories and each category is alphabetically sorted. 
-- When changes are made to the main file, a script is run to automatically update the following files: - - [Categories](https://github.com/nickspaargaren/no-google/tree/master/categories) - - [AdGuard list](https://github.com/nickspaargaren/no-google/blob/master/pihole-google-adguard.txt) - - [Parsed list](https://github.com/nickspaargaren/no-google/blob/master/google-domains) +- Changes will only be made in the main [pihole-google.txt] source file. + To submit a new domain, please check if it is not already present. +- The file is divided into sections along different categories, and each category is alphabetically sorted. +- When changes are made to the main file, a program is run to automatically update the following files: + - [AdGuard blocklist] + - [Pi-hole blocklist] + - [Unbound blocklist] + - [per-category blocklist files] ## Suggesting a domain -If you are unable to open a pull request, please open an [issue](https://github.com/nickspaargaren/no-google/issues/new/choose) and we will investigate. +If you are unable to open a pull request, please open an [issue] and we will investigate. + + +[issue]: https://github.com/nickspaargaren/no-google/issues/new/choose +[pihole-google.txt]: https://github.com/nickspaargaren/no-google/blob/master/pihole-google.txt +[AdGuard blocklist]: https://github.com/nickspaargaren/no-google/blob/master/pihole-google-adguard.txt +[Pi-hole blocklist]: https://github.com/nickspaargaren/no-google/blob/master/google-domains +[Unbound blocklist]: https://github.com/nickspaargaren/no-google/blob/master/unbound-blocklist-google.conf +[per-category blocklist files]: https://github.com/nickspaargaren/no-google/tree/master/categories diff --git a/README.md b/README.md index 05265ed..3d60ab9 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,42 @@ Note that the main filter is being worked here, so, updates and modifications on Otherwise, if you do prefer to use Gitlab, feel free to use it, and even contribute to our list there instead!

+## The converter program + +### Data conversion +The program `convert.py` will read `pihole-google.txt` as input file and convert its +content into different output formats. It can be invoked like +```shell +python3 convert.py pihole +``` + +In order to produce all different output formats, run +```shell +python3 convert.py all +``` + +### JSON output +It also can output the data in JSON format: +```shell +python3 convert.py json +``` + +With this output, and the `jq` program, the data can be filtered and manipulated easily. +In the following section, you can find corresponding examples. + +```shell +# Produce list of all category names +python3 convert.py json | jq -r 'keys | .[]' + +# Produce list of domains for the `Doubleclick` category only +python3 convert.py json | jq -r '.Doubleclick | .[]' + +# Produce list of domains for the `Analytics` and `Doubleclick` categories +python3 convert.py json | jq -r '(.Analytics,.Doubleclick) | .[]' +``` + + + ## Can I block the other letters of GAFAM ? Of course, here’s some filterlists that should help you accomplish that. diff --git a/convert.py b/convert.py new file mode 100644 index 0000000..68fa5af --- /dev/null +++ b/convert.py @@ -0,0 +1,176 @@ +import json +import sys +from collections import OrderedDict, defaultdict +from datetime import date +from pathlib import Path +from typing import Dict, List + + +class DomainBlocklistConverter: + + INPUT_FILE = "pihole-google.txt" + PIHOLE_FILE = "google-domains" + UNBOUND_FILE = "unbound-blocklist-google.conf" + ADGUARD_FILE = "pihole-google-adguard.txt" + CATEGORIES_PATH = "categories" + + BLOCKLIST_ABOUT = "This blocklist helps to restrict access to Google and its domains. 
Contribute at https://github.com/nickspaargaren/no-google" + + def __init__(self): + self.data: Dict[List] = OrderedDict() + self.timestamp: str = date.today().strftime("%Y-%m-%d") + + def read(self): + """ + Read input file into `self.data`, a dictionary mapping category names to lists of member items. + """ + with open(self.INPUT_FILE, "r") as f: + category = None + for line in f: + line = line.strip() + if line.startswith("#"): + category = line.lstrip("# ") + self.data.setdefault(category, []) + else: + if category is None: + raise ValueError("Unable to store item without category") + self.data[category].append(line) + + def dump(self): + """ + Output data in JSON format on STDOUT. + """ + print(json.dumps(self.data, indent=4)) + + def pihole(self): + """ + Produce blocklist for the Pi-hole. + """ + with open(self.PIHOLE_FILE, "w") as f: + f.write(f"# {self.BLOCKLIST_ABOUT}\n") + f.write(f"# Last updated: {self.timestamp}\n") + for category, entries in self.data.items(): + f.write(f"# {category}\n") + for entry in entries: + f.write(f"0.0.0.0 {entry}\n") + + def unbound(self): + """ + Produce blocklist for the Unbound DNS server. + + https://github.com/nickspaargaren/no-google/issues/67 + """ + with open(self.UNBOUND_FILE, "w") as f: + f.write(f"# {self.BLOCKLIST_ABOUT}\n") + f.write(f"# Last updated: {self.timestamp}\n") + for category, entries in self.data.items(): + f.write(f"\n# Category: {category}\n") + for entry in entries: + f.write(f'local-zone: "{entry}" always_refuse\n') + + def adguard(self): + """ + Produce blocklist for AdGuard. + """ + with open(self.ADGUARD_FILE, "w") as f: + f.write(f"! {self.BLOCKLIST_ABOUT}\n") + f.write(f"! Last updated: {self.timestamp}\n") + for category, entries in self.data.items(): + f.write(f"! {category}\n") + for entry in entries: + f.write(f"||{entry}^\n") + + def categories(self): + """ + Produce individual per-category blocklist files. 
+ """ + + def write_file(path, category, entries, line_prefix=""): + """ + Generic function to write per-category file in both flavours. + """ + with open(path, "w") as f: + f.write(f"# {self.BLOCKLIST_ABOUT}\n") + f.write(f"# Last updated: {self.timestamp}\n") + f.write(f"# {category}\n") + f.write(f"\n") + for entry in entries: + f.write(f"{line_prefix}{entry}\n") + + for category, entries in self.data.items(): + + # Compute file names. + filename = category.replace(" ", "").lower() + filepath = Path(self.CATEGORIES_PATH).joinpath(filename) + text_file = filepath.with_suffix(".txt") + parsed_file = str(filepath) + "parsed" + + # Write two flavours of per-category file. + write_file(text_file, category, entries, line_prefix="0.0.0.0 ") + write_file(parsed_file, category, entries) + + def duplicates(self): + """ + Find duplicates in main source file. + """ + hashes = defaultdict(int) + for category, entries in self.data.items(): + for entry in entries: + hashes[hash(entry)] += 1 + for category, entries in self.data.items(): + for entry in entries: + hashvalue = hash(entry) + if hashvalue in hashes: + count = hashes[hashvalue] + if count > 1: + print( + f"Domain {entry} found {count} times, please remove duplicate domains." + ) + hashes[hashvalue] = 0 + + +def run(action: str): + """ + Invoke different actions on converter engine. + """ + + # Create converter instance and read input file. + converter = DomainBlocklistConverter() + converter.read() + + # Invoke special action "json". + if action == "json": + converter.dump() + sys.exit() + + # Either invoke specific action, or expand to all actions. + if action == "all": + subcommands = action_candidates + else: + subcommands = [action] + + # Invoke all actions subsequently. + for action in subcommands: + print(f"Invoking subcommand '{action}'") + method = getattr(converter, action) + method() + + +if __name__ == "__main__": + + # Read subcommand from command line, with error handling. 
+ action_candidates = ["pihole", "unbound", "adguard", "categories"] + special_candidates = ["all", "duplicates", "json"] + subcommand = None + try: + subcommand = sys.argv[1] + except: + pass + if subcommand not in action_candidates + special_candidates: + print( + f"ERROR: Subcommand not given or invalid, please use one of {action_candidates + special_candidates}" + ) + sys.exit(1) + + # Invoke subcommand. + run(subcommand) diff --git a/pihole-google.txt b/pihole-google.txt index f2cb2e8..1f3e7ec 100644 --- a/pihole-google.txt +++ b/pihole-google.txt @@ -7349,5 +7349,4 @@ www.widgets.ft.nest.com www.widgets.nest.com www.wulfview.nest.com www.wwn-catalog-api.nest.com -zipkin.ft.nest.com -# End of file \ No newline at end of file +zipkin.ft.nest.com \ No newline at end of file diff --git a/scripts/adguard.py b/scripts/adguard.py deleted file mode 100644 index ea0bcfb..0000000 --- a/scripts/adguard.py +++ /dev/null @@ -1,14 +0,0 @@ -from datetime import date -today = date.today() - -newfile = open('../pihole-google-adguard.txt', 'w') -newfile.write('! This blocklist helps Pi-hole\'s admin restrict access to Google and its domains.'+'\n') -newfile.write('! Last updated: ' + today.strftime('%d-%m-%Y') +'\n') - -with open('../pihole-google.txt', 'r') as main: - - for line in main: - if '#' in line: - newfile.write('! 
' + line[2:]) - elif not '#' in line: - newfile.write('||' + line.rstrip("\n") + '^' + '\n') \ No newline at end of file diff --git a/scripts/categories.py b/scripts/categories.py deleted file mode 100644 index 6664eda..0000000 --- a/scripts/categories.py +++ /dev/null @@ -1,42 +0,0 @@ -from datetime import date -today = date.today() - -class temporary: - title = '' - categories = [] - -def Create(title, categories): - file_name = title.strip('#').rstrip('\n').replace(' ', '').lower() - - # txt files - newfile = open('../categories/' + file_name + '.txt', 'w') - newfile.write('# This blocklist helps Pi-hole\'s admin restrict access to Google and its domains.'+'\n') - newfile.write('# Last updated: ' + today.strftime('%d-%m-%Y') +'\n') - newfile.write(title +'\n') - for url in categories: - newfile.write('0.0.0.0 ' + url + '\n') - - # Parsed files - newfile = open('../categories/' + file_name + 'parsed', 'w') - newfile.write('# This blocklist helps Pi-hole\'s admin restrict access to Google and its domains.'+'\n') - newfile.write('# Last updated: ' + today.strftime('%d-%m-%Y') +'\n') - newfile.write(title +'\n') - for url in categories: - newfile.write(url + '\n') - - -with open('../pihole-google.txt', 'r') as main: - - for line in main: - - if '#' in line: - - if temporary.title and temporary.categories: - Create(temporary.title, temporary.categories) - temporary.title = '' - temporary.categories = [] - - temporary.title = line - - elif not '#' in line: - temporary.categories.append(line.rstrip('\n')) \ No newline at end of file diff --git a/scripts/parsed.py b/scripts/parsed.py deleted file mode 100644 index 4df0c5a..0000000 --- a/scripts/parsed.py +++ /dev/null @@ -1,14 +0,0 @@ -from datetime import date -today = date.today() - -newfile = open('../google-domains', 'w') -newfile.write('# This blocklist helps Pi-hole\'s admin restrict access to Google and its domains.'+'\n') -newfile.write('# Last updated: ' + today.strftime('%d-%m-%Y') +'\n') - -with 
open('../pihole-google.txt', 'r') as main: - - for line in main: - if '#' in line: - newfile.write('# ' + line[2:]) - elif not '#' in line: - newfile.write('0.0.0.0 ' + line.rstrip("\n") + '\n') \ No newline at end of file