From d7c61c3466d6208a1c36966122333af0a8a6a4b9 Mon Sep 17 00:00:00 2001 From: soratobuneko Date: Mon, 14 Dec 2020 16:27:12 +0100 Subject: [PATCH] urltitel: get URL title. Display or send titles of URLs from incoming and outcoming messages. Also features an optional URL buffer --- .gitignore | 1 + urltitel.py | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+) create mode 100644 .gitignore create mode 100755 urltitel.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..db9bdf1 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/test.py diff --git a/urltitel.py b/urltitel.py new file mode 100755 index 0000000..46d5c60 --- /dev/null +++ b/urltitel.py @@ -0,0 +1,261 @@ +import html +import re +import weechat +from urllib.error import URLError +from urllib.parse import quote, urlsplit, urlunsplit +from urllib.request import Request, urlopen + +SCRIPT_NAME = "urltitel" +SCRIPT_AUTHOR = "soratobuneko" +SCRIPT_VERSION = "4" +SCRIPT_LICENCE = "WTFPL" +SCRIPT_DESCRIPTION = ( + "Display or send titles of URLs from incoming and outcoming messages. " + + "Also features an optional URL buffer" +) +UA = "Mozilla/5.0 (Python) weechat {}".format(SCRIPT_NAME) + +script_options = { + "timeout": ("3", "Maximum time to wait to fetch URL."), + "retry": ("off", "Retry fetching URL if it fails the first time."), + "maxlength": ("200", "Maximum length of title."), + "maxdownload": ("262144", "Maximum size (Bytes) to fetch from URL."), + "serverchans": ( + "*,*", + '"|" separated list of server,#channel to parse. for instance: "server0,#channel1|server0,#channel2"', + ), + "replyto": ( + "", + '"|" separated list of server,#channel for which instead of displaying localy a message we send it to the channel.', + ), + "sendfromme": ("off", "Alway send titles for URLs sent by ourself."), + "urlbuffer": ("off", "Create a buffer to collect the URLs with their titles."), + "debug": ("off", "Show debug messages"), +} + + +def create_buffer(): + global url_buffer + BUFFER_NAME = "{}".format(SCRIPT_NAME) + url_buffer = weechat.buffer_new(BUFFER_NAME, "", "", "on_buffer_close", "") + weechat.buffer_set( + url_buffer, "title", "URL buffer ({} v{})".format(SCRIPT_NAME, SCRIPT_VERSION) + ) + + +def debug(message): + if script_options["debug"] == "on": + weechat.prnt("", "{}: {}".format(SCRIPT_NAME, message)) + + +def error(message): + weechat.prnt("", "{}{}: {}".format(weechat.prefix("error"), SCRIPT_NAME, message)) + + +def fetch_html(url): + # IRI to URL (unicode to ascii) + url = urlsplit(url) + url = list(url) + url[1] = quote(url[1]) # URL encode domain + url[2] = quote(url[2]) # URL encode path + url = urlunsplit(url) + request = Request(url, data=None, headers={"User-Agent": UA}) + + tries = 2 if script_options["retry"] == "on" else 1 + for i in range(0, tries): + try: + with urlopen(request, timeout=int(script_options["timeout"])) as res: + is_html = bool(re.match(".*/html.*", res.info()["Content-Type"])) + if is_html: + debug( + "Got an HTML document. Reading at most {} bytes.".format( + script_options["maxdownload"] + ), + ) + html_doc_head = res.read(int(script_options["maxdownload"])).decode( + "utf-8", "replace" + ) + return html.unescape(html_doc_head) + else: + debug("Not an HTML document.") + return None + except URLError as err: + error( + "Cannot fetch URL. {}".format(err.reason), + ) + + +_re_url = re.compile(r"https?://[\w0-9@:%._\+~#=()?&/\-]+") + + +def find_urls(message): + # Found URLs with title [["http://perdu.com", "Vous Etes Perdu ?"], ...] + # If URL point to a non HTML document the list element is None. If the + # HTML doc has no the list element is ["https://..", None] + urls = [] + urls_count = 0 + + if re.match(r"^url\|\d+\): ", message): + return (0, ()) + + if re.match(r"https?://[^ ]", message) and not re.match(_re_url, message): + debug("Failling to match URL in message: {}".format(message)) + + for url in re.findall(_re_url, message): + debug("Fetching title for URL: {}".format(url)) + html = fetch_html(url) + if html != None: + title = get_title(html) + if title != None and len(title): + urls_count += 1 + debug("Found title: {}".format(title)) + if len(title) > int(script_options["maxlength"]): + urls.append([url, title[0 : int(script_options["maxlength"])]]) + else: + urls.append([url, title]) + else: + urls.append(None) + + return (urls_count, urls) + + +_re_whitespace = re.compile(r"\s") + + +def get_title(html): + title = re.search(r"(?i)<title ?[^<>]*>([^<>]*)", html) + if title == None: + debug("No found.") + return None + else: + title = title.group(1) + + # many whitespace to one space + stripped_title = "" + for i, char in enumerate(title): + if not re.match(_re_whitespace, char): + stripped_title += char + elif i > 0 and not re.match(_re_whitespace, title[i - 1]): + stripped_title += " " + stripped_title = stripped_title.strip() + + return stripped_title + + +def on_config_change(data, option, value): + key = option.split(".")[-1] + script_options[key] = value + return weechat.WEECHAT_RC_OK + + +def on_buffer_close(data, buffer): + global url_buffer + url_buffer = None + return weechat.WEECHAT_RC_OK + + +def on_privmsg(data, signal, signal_data): + server = signal.split(",")[0] + msg = weechat.info_get_hashtable("irc_message_parse", {"message": signal_data}) + srvchan = "{},{}".format(server, msg["channel"]) + + # Parse only messages from configured server/channels + if not srvchan_in_list(srvchan, script_options["serverchans"].split("|")): + debug("Ignoring message from {}/{}".format(server, msg["channel"])) + return weechat.WEECHAT_RC_OK + + urls_found = find_urls(msg["text"]) + if script_options["urlbuffer"] == "on" and len(urls_found[1]): + nick = msg["nick"] + if not len(nick): + nick = "{}{}{}".format( + weechat.color("*white"), + weechat.info_get("irc_nick", server), + weechat.color("default"), + ) + if not url_buffer: + create_buffer() + weechat.prnt( + url_buffer, + "<{}{}@{}{}/{}>\t{}".format( + nick, + weechat.color("red"), + weechat.color("default"), + server, + msg["channel"], + msg["text"], + ), + ) + if urls_found[0]: + force_send = ( + True + if script_options["sendfromme"] == "on" and not len(msg["nick"]) + else False + ) + show_urls_title(srvchan, urls_found[1], force_send) + + return weechat.WEECHAT_RC_OK + + +def show_urls_title(srvchan, urls, force_send): + ACTION_SEND = "Sending" + buffer = weechat.info_get("irc_buffer", srvchan) + action = ( + (ACTION_SEND, "to") + if force_send or srvchan_in_list(srvchan, script_options["replyto"].split("|")) + else ("Displaying", "on") + ) + if buffer: + for i, url in enumerate(urls): + if url != None: + debug( + "{} title(s) {} {}".format(action[0], action[1], srvchan), + ) + if action[0] == ACTION_SEND: + weechat.command(buffer, "url|{}): {}".format(i + 1, url[1])) + else: # We have already checked script_options["serverchans"] in on_privmsg + weechat.prnt(buffer, "{}:\t{}".format(i + 1, url[1])) + if script_options["urlbuffer"] == "on": + if not url_buffer: + create_buffer() + weechat.prnt(url_buffer, "{}:\t{}".format(i + 1, url[1])) + + +def srvchan_in_list(srvchan, srvchan_list): + srvchan = srvchan.lower().split(",") + for _srvchan in srvchan_list: + _srvchan = _srvchan.lower().split(",") + if (_srvchan[0] == "*" or srvchan[0] == _srvchan[0]) and ( + _srvchan[1] == "*" or srvchan[1] == _srvchan[1] + ): + return True + return False + + +weechat.register( + SCRIPT_NAME, + SCRIPT_AUTHOR, + SCRIPT_VERSION, + SCRIPT_LICENCE, + SCRIPT_DESCRIPTION, + "", + "", +) + +for option, default_value in list(script_options.items()): + if not weechat.config_is_set_plugin(option): + weechat.config_set_plugin(option, default_value[0]) + script_options[option] = default_value[0] + else: + script_options[option] = weechat.config_get_plugin(option) + weechat.config_set_desc_plugin( + option, "{} (default: {})".format(default_value[1], default_value[0]) + ) + +if script_options["urlbuffer"] == "on": + create_buffer() + + +weechat.hook_config("plugins.var.python." + SCRIPT_NAME + ".*", "on_config_change", "") +weechat.hook_signal("*,irc_in2_privmsg", "on_privmsg", "") +weechat.hook_signal("*,irc_out1_privmsg", "on_privmsg", "")