urltitel: get URL title.

Display or send titles of URLs from incoming and outgoing messages.
Also features an optional URL buffer.
This commit is contained in:
soratobuneko 2020-12-14 16:27:12 +01:00
commit d7c61c3466
2 changed files with 262 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/test.py

261
urltitel.py Executable file
View File

@ -0,0 +1,261 @@
import html
import re
import weechat
from urllib.error import URLError
from urllib.parse import quote, urlsplit, urlunsplit
from urllib.request import Request, urlopen
# Script identity; these values are handed to weechat.register() below.
SCRIPT_NAME = "urltitel"
SCRIPT_AUTHOR = "soratobuneko"
SCRIPT_VERSION = "4"
SCRIPT_LICENCE = "WTFPL"
SCRIPT_DESCRIPTION = (
    "Display or send titles of URLs from incoming and outcoming messages. "
    "Also features an optional URL buffer"
)

# Value of the User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (Python) weechat {}".format(SCRIPT_NAME)
# Option name -> (default value, description).
# Once the options have been loaded at start-up each value is replaced by the
# plain string currently configured in WeeChat.
script_options = {
    # Fetching
    "timeout": ("3", "Maximum time to wait to fetch URL."),
    "retry": ("off", "Retry fetching URL if it fails the first time."),
    "maxlength": ("200", "Maximum length of title."),
    "maxdownload": ("262144", "Maximum size (Bytes) to fetch from URL."),
    # Where to look for URLs and where to answer
    "serverchans": (
        "*,*",
        '"|" separated list of server,#channel to parse. '
        'for instance: "server0,#channel1|server0,#channel2"',
    ),
    "replyto": (
        "",
        '"|" separated list of server,#channel for which instead of '
        "displaying localy a message we send it to the channel.",
    ),
    "sendfromme": ("off", "Alway send titles for URLs sent by ourself."),
    # Extras
    "urlbuffer": (
        "off",
        "Create a buffer to collect the URLs with their titles.",
    ),
    "debug": ("off", "Show debug messages"),
}
def create_buffer():
    """Open the URL buffer and keep its handle in the global ``url_buffer``.

    The buffer is named after the script and is wired to ``on_buffer_close``
    so the stored handle can be dropped when the buffer goes away.
    """
    global url_buffer
    url_buffer = weechat.buffer_new(SCRIPT_NAME, "", "", "on_buffer_close", "")
    buffer_title = "URL buffer ({} v{})".format(SCRIPT_NAME, SCRIPT_VERSION)
    weechat.buffer_set(url_buffer, "title", buffer_title)
def debug(message):
    """Print *message* on the core buffer, but only when the ``debug`` option is "on"."""
    if script_options["debug"] != "on":
        return
    line = "{}: {}".format(SCRIPT_NAME, message)
    weechat.prnt("", line)
def error(message):
    """Print *message* on the core buffer, preceded by WeeChat's error prefix."""
    line = "{}{}: {}".format(weechat.prefix("error"), SCRIPT_NAME, message)
    weechat.prnt("", line)
def _iri_to_uri(iri):
    """Return *iri* as an ASCII-only URL that urllib can request.

    Host names are IDNA-encoded; every other component is percent-encoded
    while keeping its delimiters and any existing ``%XX`` escape untouched.
    May raise ValueError when *iri* cannot be split.
    """
    scheme, netloc, path, query, fragment = urlsplit(iri)
    userinfo, at_sign, hostport = netloc.rpartition("@")
    # A bracketed IPv6 literal is already ASCII and its colons are not a
    # port separator, so it is passed through unchanged.
    if not hostport.startswith("["):
        host, colon, port = hostport.partition(":")
        try:
            host.encode("ascii")
        except UnicodeEncodeError:
            try:
                host = host.encode("idna").decode("ascii")
            except UnicodeError:
                # Not a valid internationalised name either: fall back to
                # percent-encoding so the request fails cleanly later on.
                host = quote(host)
        hostport = host + colon + port
    netloc = quote(userinfo, safe="%:!$&'()*+,;=") + at_sign + hostport
    path = quote(path, safe="/%:@!$&'()*+,;=")
    query = quote(query, safe="/?%:@!$&'()*+,;=")
    fragment = quote(fragment, safe="/?%:@!$&'()*+,;=")
    return urlunsplit((scheme, netloc, path, query, fragment))


def fetch_html(url):
    """Download the beginning of the HTML document found at *url*.

    At most ``maxdownload`` bytes are read. They are decoded with the charset
    announced in the Content-Type header (UTF-8 when none is announced or the
    announced one is unknown) and HTML entities are unescaped.

    The request is attempted twice when the ``retry`` option is "on", once
    otherwise. Failures are reported through ``error()``.

    Returns the decoded text, or None when the resource is not served as
    HTML or could not be fetched.
    """
    # Imported locally so this function does not rely on a new file-level import.
    from http.client import HTTPException

    try:
        request = Request(_iri_to_uri(url), data=None, headers={"User-Agent": UA})
    except ValueError as err:
        # Malformed URL: retrying would not help.
        error("Cannot fetch URL. {}".format(err))
        return None

    tries = 2 if script_options["retry"] == "on" else 1
    for _ in range(tries):
        try:
            with urlopen(request, timeout=int(script_options["timeout"])) as res:
                headers = res.info()
                # The header may be absent; media types are case-insensitive.
                content_type = headers.get("Content-Type") or ""
                if "/html" not in content_type.lower():
                    debug("Not an HTML document.")
                    return None
                debug(
                    "Got an HTML document. Reading at most {} bytes.".format(
                        script_options["maxdownload"]
                    ),
                )
                raw = res.read(int(script_options["maxdownload"]))
                charset = headers.get_content_charset() or "utf-8"
                try:
                    html_doc_head = raw.decode(charset, "replace")
                except LookupError:
                    # Unknown or non-text charset announced by the server.
                    html_doc_head = raw.decode("utf-8", "replace")
                return html.unescape(html_doc_head)
        except (OSError, HTTPException, ValueError) as err:
            # OSError covers URLError/HTTPError, socket timeouts while reading
            # and connection resets; HTTPException covers truncated or
            # malformed responses; ValueError covers bad option values.
            error(
                "Cannot fetch URL. {}".format(getattr(err, "reason", err)),
            )
    return None
_re_url = re.compile(r"https?://[\w0-9@:%._\+~#=()?&/\-]+")


def _trim_url(url):
    """Drop sentence punctuation that the URL pattern swallowed at the end."""
    while url:
        last = url[-1]
        if last in ".:?":
            url = url[:-1]
        elif last == ")" and url.count(")") > url.count("("):
            # Closing parenthesis of the surrounding text, not of the URL.
            url = url[:-1]
        else:
            break
    return url


def find_urls(message):
    """Find the URLs in *message* and fetch the title of each one.

    Returns a ``(titled_count, urls)`` pair. ``urls`` holds one entry per URL
    found, in order of appearance:

    * ``[url, title]`` when a non-empty title was found; the title is cut to
      ``maxlength`` characters,
    * ``None`` when no title is available (fetch failed, not an HTML
      document, or no/empty <title>).

    ``titled_count`` is the number of ``[url, title]`` entries.
    Example entry: ``["http://perdu.com", "Vous Etes Perdu ?"]``.
    """
    # Lines produced by this script itself ("url|1): ...") are never parsed
    # again, otherwise two instances could answer each other forever.
    if re.match(r"^url\|\d+\): ", message):
        return (0, ())

    candidates = []
    for raw_url in _re_url.findall(message):
        url = _trim_url(raw_url)
        # Trimming may leave nothing but the scheme (e.g. "http://.").
        if _re_url.fullmatch(url):
            candidates.append(url)

    # Diagnostic: something looks like a URL anywhere in the message but the
    # pattern above could not extract it.
    if not candidates and re.search(r"https?://[^ ]", message):
        debug("Failling to match URL in message: {}".format(message))

    urls = []
    urls_count = 0
    for url in candidates:
        debug("Fetching title for URL: {}".format(url))
        document = fetch_html(url)
        title = get_title(document) if document is not None else None
        if not title:
            # Keep a placeholder so the caller still knows a URL was seen.
            urls.append(None)
            continue
        urls_count += 1
        debug("Found title: {}".format(title))
        urls.append([url, title[: int(script_options["maxlength"])]])
    return (urls_count, urls)
_re_whitespace = re.compile(r"\s")
# The text between <title> and </title> is taken verbatim: the document has
# already been entity-unescaped, so it may legitimately contain "<" or ">".
_re_title = re.compile(
    r"<title\b[^<>]*>(.*?)</title\s*>", re.IGNORECASE | re.DOTALL
)


def get_title(html):
    """Extract the text of the first <title> element of *html*.

    Runs of whitespace (including line breaks) are collapsed into a single
    space and surrounding whitespace is removed.

    Returns the cleaned title, possibly an empty string, or None when the
    document has no <title> element.
    """
    found = _re_title.search(html)
    if found is None:
        debug("No <title> found.")
        return None
    # Splitting on every whitespace character yields empty strings for
    # consecutive ones; dropping them collapses each run to one space.
    words = [word for word in _re_whitespace.split(found.group(1)) if word]
    return " ".join(words)
def on_config_change(data, option, value):
    """Config hook callback: mirror a changed option into ``script_options``.

    *option* is the full option name; only the part after its last "." is
    used as the key.
    """
    _, _, short_name = option.rpartition(".")
    script_options[short_name] = value
    return weechat.WEECHAT_RC_OK
def on_buffer_close(data, buffer):
    """Close callback of the URL buffer: forget the stored buffer handle."""
    global url_buffer
    # A falsy handle makes the next writer create a fresh buffer.
    url_buffer = None
    return weechat.WEECHAT_RC_OK
def on_privmsg(data, signal, signal_data):
    """Signal callback for PRIVMSG: look for URLs and show their titles.

    The server name is the part of *signal* before the first comma and
    *signal_data* is the raw IRC message. Messages whose server/channel is
    not selected by the ``serverchans`` option are ignored. When the
    ``urlbuffer`` option is "on", every message for which ``find_urls``
    returned at least one entry is also copied to the URL buffer.
    """
    server = signal.split(",")[0]
    msg = weechat.info_get_hashtable("irc_message_parse", {"message": signal_data})
    channel = msg["channel"]
    srvchan = "{},{}".format(server, channel)

    # Parse only messages from configured server/channels
    watched = script_options["serverchans"].split("|")
    if not srvchan_in_list(srvchan, watched):
        debug("Ignoring message from {}/{}".format(server, channel))
        return weechat.WEECHAT_RC_OK

    titled_count, urls = find_urls(msg["text"])

    if script_options["urlbuffer"] == "on" and len(urls):
        shown_nick = msg["nick"]
        if not shown_nick:
            # No nick in the parsed message: show our own nick, highlighted.
            shown_nick = "{}{}{}".format(
                weechat.color("*white"),
                weechat.info_get("irc_nick", server),
                weechat.color("default"),
            )
        if not url_buffer:
            create_buffer()
        log_line = "<{}{}@{}{}/{}>\t{}".format(
            shown_nick,
            weechat.color("red"),
            weechat.color("default"),
            server,
            channel,
            msg["text"],
        )
        weechat.prnt(url_buffer, log_line)

    if titled_count:
        # Titles of our own messages are sent to the channel when asked to.
        force_send = script_options["sendfromme"] == "on" and not msg["nick"]
        show_urls_title(srvchan, urls, force_send)

    return weechat.WEECHAT_RC_OK
def show_urls_title(srvchan, urls, force_send):
    """Show, or send to the channel, the title of every entry of *urls*.

    *srvchan* is a "server,channel" string used to look up the IRC buffer;
    nothing happens when that buffer does not exist. Titles are sent as a
    message when *force_send* is true or *srvchan* is selected by the
    ``replyto`` option, and printed locally otherwise. ``None`` entries are
    skipped but still count in the numbering. When the ``urlbuffer`` option
    is "on" each title is also printed in the URL buffer.
    """
    ACTION_SEND = "Sending"
    buffer = weechat.info_get("irc_buffer", srvchan)
    if force_send or srvchan_in_list(srvchan, script_options["replyto"].split("|")):
        verb, preposition = ACTION_SEND, "to"
    else:
        verb, preposition = "Displaying", "on"

    if not buffer:
        return

    for position, entry in enumerate(urls, start=1):
        if entry is None:
            continue
        title = entry[1]
        debug(
            "{} title(s) {} {}".format(verb, preposition, srvchan),
        )
        if verb == ACTION_SEND:
            weechat.command(buffer, "url|{}): {}".format(position, title))
        else:
            # script_options["serverchans"] was already checked in on_privmsg
            weechat.prnt(buffer, "{}:\t{}".format(position, title))
        if script_options["urlbuffer"] == "on":
            if not url_buffer:
                create_buffer()
            weechat.prnt(url_buffer, "{}:\t{}".format(position, title))
def srvchan_in_list(srvchan, srvchan_list):
    """Tell whether a "server,channel" pair is selected by *srvchan_list*.

    *srvchan* is a single "server,channel" string. *srvchan_list* is an
    iterable of "server,channel" patterns in which either part may be "*" to
    match anything. The comparison is case-insensitive and whitespace around
    each part of a pattern is ignored.

    Patterns without a comma (including the empty string) are malformed and
    never match; they are skipped instead of raising IndexError.

    Returns True as soon as one pattern matches, False otherwise.
    """
    target = srvchan.lower().split(",")
    server = target[0]
    channel = target[1] if len(target) > 1 else ""
    for pattern in srvchan_list:
        fields = [field.strip() for field in pattern.lower().split(",")]
        if len(fields) < 2:
            # Skipping is the safe choice: a malformed "replyto" entry must
            # not cause titles to be sent to a channel.
            continue
        if fields[0] in ("*", server) and fields[1] in ("*", channel):
            return True
    return False
# --- Script start-up: registration, option loading and hooks ---------------
weechat.register(
    SCRIPT_NAME,
    SCRIPT_AUTHOR,
    SCRIPT_VERSION,
    SCRIPT_LICENCE,
    SCRIPT_DESCRIPTION,
    "",
    "",
)

# Handle of the URL buffer, or None while no such buffer is open. It must
# exist even when the "urlbuffer" option is off at load time: on_privmsg and
# show_urls_title read it as soon as the option is switched on at runtime.
url_buffer = None

# Load every option: write the default for options that are not set yet,
# then replace the (default, description) pair by the effective string value.
# list() takes a snapshot because the dict values are reassigned in the loop.
for option, default_value in list(script_options.items()):
    if not weechat.config_is_set_plugin(option):
        weechat.config_set_plugin(option, default_value[0])
        script_options[option] = default_value[0]
    else:
        script_options[option] = weechat.config_get_plugin(option)
    weechat.config_set_desc_plugin(
        option, "{} (default: {})".format(default_value[1], default_value[0])
    )

if script_options["urlbuffer"] == "on":
    create_buffer()

# Keep script_options in sync with later changes to the script's options.
weechat.hook_config("plugins.var.python." + SCRIPT_NAME + ".*", "on_config_change", "")
# Watch both received and sent PRIVMSG on every server.
weechat.hook_signal("*,irc_in2_privmsg", "on_privmsg", "")
weechat.hook_signal("*,irc_out1_privmsg", "on_privmsg", "")