diff --git a/NOTICES-3RDPARTY.txt b/NOTICES-3RDPARTY.txt new file mode 100644 index 0000000..a09cbf7 --- /dev/null +++ b/NOTICES-3RDPARTY.txt @@ -0,0 +1,101 @@ +THIRD PARTY FILE LICENSES: + +nyx_bot/wordcloud_font.ttf: +This file was originally named `SourceHanSansSC-Regular.otf` and is available under the following license: + +Copyright 2014-2021 Adobe (http://www.adobe.com/), with Reserved Font +Name 'Source'. Source is a trademark of Adobe in the United States +and/or other countries. + +This Font Software is licensed under the SIL Open Font License, +Version 1.1. + +This license is copied below, and is also available with a FAQ at: +http://scripts.sil.org/OFL + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font +creation efforts of academic and linguistic communities, and to +provide a free and open framework in which fonts may be shared and +improved in partnership with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply to +any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. 
+ +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software +components as distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, +deleting, or substituting -- in part or in whole -- any of the +components of the Original Version, by changing formats or by porting +the Font Software to a new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, +modify, redistribute, and sell modified and unmodified copies of the +Font Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, in +Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the +corresponding Copyright Holder. This restriction only applies to the +primary font name as presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. 
+ +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created using +the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/SETUP.md b/SETUP.md index be5dd7b..6a7a1a5 100644 --- a/SETUP.md +++ b/SETUP.md @@ -106,6 +106,17 @@ pip install -e ".[postgres]" [Sarasa Gothic](https://github.com/be5invis/Sarasa-Gothic) should be installed for best quote image results. It is also recommanded to install the Noto Color Emoji font on the machine running the bot. +## Build word segmenter + +Use Rust to build the segmenter: + +``` +cd cutword +cargo build --release +``` + +Copy `target/release/nyx_bot-cutword` to a directory on the `PATH` available to the bot. + ## Configuration Copy the sample configuration file to a new `config.yaml` file. diff --git a/cutword/.gitignore b/cutword/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/cutword/.gitignore @@ -0,0 +1 @@ +/target diff --git a/cutword/Cargo.lock b/cutword/Cargo.lock new file mode 100644 index 0000000..2e05c85 --- /dev/null +++ b/cutword/Cargo.lock @@ -0,0 +1,158 @@ +# This file is automatically @generated by Cargo. 
+# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +dependencies = [ + "memchr", +] + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "cedarwood" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "jieba-rs" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37228e06c75842d1097432d94d02f37fe3ebfca9791c2e8fef6e9db17ed128c1" +dependencies = [ + "cedarwood", + "fxhash", + "hashbrown", + "lazy_static", + "phf", + "phf_codegen", + "regex", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "nyx_bot-cutword" +version = "0.1.0" +dependencies = [ + "jieba-rs", +] + +[[package]] +name = "phf" 
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
+dependencies = [
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a56ac890c5e3ca598bbdeaa99964edb5b0258a583a9eb6ef4e89fc85d9224770"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf"
+dependencies = [
+ "phf_shared",
+ "rand",
+]
+
+[[package]]
+name = "phf_shared"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
+dependencies = [
+ "siphasher",
+]
+
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+
+[[package]]
+name = "regex"
+version = "1.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
+
+[[package]]
+name = "siphasher"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum =
"7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
+
+[[package]]
+name = "smallvec"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
diff --git a/cutword/Cargo.toml b/cutword/Cargo.toml
new file mode 100644
index 0000000..6397b96
--- /dev/null
+++ b/cutword/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "nyx_bot-cutword"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+jieba-rs = "0.6"
diff --git a/cutword/src/main.rs b/cutword/src/main.rs
new file mode 100644
index 0000000..f88599a
--- /dev/null
+++ b/cutword/src/main.rs
@@ -0,0 +1,68 @@
+//! Word-frequency helper for the nyx_bot word cloud.
+//!
+//! Reads text from stdin, segments it with jieba and prints one
+//! `word<TAB>count` line per distinct lowercased word to stdout.
+
+use jieba_rs::Jieba;
+use std::collections::HashMap;
+use std::io::{BufWriter, Result, Write};
+
+/// Words longer than this many UTF-8 bytes are dropped.
+const MAX_WORD_BYTES: usize = 21;
+
+fn main() -> Result<()> {
+    let jieba = Jieba::new();
+    let mut counts: HashMap<String, u64> = HashMap::new();
+
+    for line in std::io::stdin().lines() {
+        let line = match line {
+            Ok(line) => line,
+            Err(err) => {
+                // Still print what was counted so far, but report why
+                // reading stopped instead of truncating silently.
+                eprintln!("nyx_bot-cutword: stopped reading stdin: {err}");
+                break;
+            }
+        };
+
+        // Skip blank lines and lines starting with '/' (presumably bot
+        // commands rather than conversation).
+        if line.is_empty() || line.starts_with('/') {
+            continue;
+        }
+
+        for tag in jieba.tag(&line, true) {
+            if STOP_FLAGS.contains(&tag.tag) || tag.word.len() > MAX_WORD_BYTES {
+                continue;
+            }
+            *counts.entry(tag.word.to_lowercase()).or_insert(0) += 1;
+        }
+    }
+
+    // Buffer the output so each line is not flushed on its own.
+    let mut out = BufWriter::new(std::io::stdout().lock());
+    for (word, count) in &counts {
+        writeln!(out, "{}\t{}", word, count)?;
+    }
+    out.flush()
+}
+
+/// Part-of-speech tags whose words are excluded from the counts.
+const STOP_FLAGS: &[&str] = &[
+    "d",  // adverb
+    "f",  // locality noun
+    "x",  // punctuation (the docs say `w`, but testing shows `x`)
+    "p",  // preposition
+    "t",  // time
+    "q",  // measure word
+    "m",  // numeral (quantity word)
+    "nr", // personal names; you / me / him
+    "r",  // pronoun
+    "c",  // conjunction
+    "e",  // undocumented; looks like modal particles
+    "xc", // other function words
+    "zg", // undocumented; no pattern found in its words, but none are wanted
+    "y",  // undocumented; looks like modal particles
+    // tags starting with `u` are all particles; the docs give no finer breakdown
+    "uj", "ug", "ul", "ud",
+];
diff --git a/nyx_bot/bot_commands.py b/nyx_bot/bot_commands.py
index 8daaa27..09843aa 100644
--- a/nyx_bot/bot_commands.py
+++
b/nyx_bot/bot_commands.py
@@ -24,6 +24,7 @@ from nyx_bot.config import Config
 from nyx_bot.errors import NyxBotRuntimeError, NyxBotValueError
 from nyx_bot.storage import MatrixMessage, MembershipUpdates, UserTag
 from nyx_bot.utils import make_divergence, parse_matrixdotto_link
+from nyx_bot.wordcloud import send_wordcloud
 
 logger = logging.getLogger(__name__)
 
@@ -106,6 +107,8 @@ class Command:
             await self._last_message()
         elif self.command == "divergence":
             await self._divergence()
+        elif self.command == "wordcloud":
+            await self._wordcloud()
         else:
             await self._unknown_command()
 
@@ -536,3 +539,22 @@ Outside of a reply, send the avatar of the command sender.\
                 raise NyxBotRuntimeError(f"Failed to fetch event: {error}")
             sender = target_event.event.sender
         UserTag.delete_user_tag(self.room.room_id, sender)
+
+    async def _wordcloud(self):
+        """Reply with a word cloud for the sender, or for the replied-to user."""
+        if not self.reply_to:
+            sender = self.event.sender
+        else:
+            target_event = await self.client.room_get_event(
+                self.room.room_id, self.reply_to
+            )
+            if isinstance(target_event, RoomGetEventError):
+                error = target_event.message
+                raise NyxBotRuntimeError(f"Failed to fetch event: {error}")
+            sender = target_event.event.sender
+        await self.client.room_typing(self.room.room_id)
+        try:
+            await send_wordcloud(self.client, self.room, self.event, sender)
+        finally:
+            # Clear the typing notice even if building the word cloud fails.
+            await self.client.room_typing(self.room.room_id, False)
diff --git a/nyx_bot/utils.py b/nyx_bot/utils.py
index bab94e8..a392845 100644
--- a/nyx_bot/utils.py
+++ b/nyx_bot/utils.py
@@ -1,5 +1,6 @@
 from datetime import datetime
-from io import BytesIO
+from html.parser import HTMLParser
+from io import BytesIO, StringIO
 from random import Random
 from typing import Optional
 from urllib.parse import unquote, urlparse
@@ -213,3 +214,29 @@ def make_divergence(room: MatrixRoom):
         result = first_value
 
     return result
+
+
+class MLStripper(HTMLParser):
+    """HTML parser that drops all markup and keeps only the text content."""
+
+    def __init__(self):
+        # convert_charrefs=True: entities such as "&amp;" reach handle_data
+        # already decoded.
+        super().__init__(convert_charrefs=True)
+        self.text = StringIO()
+
+    def handle_data(self, d):
+        self.text.write(d)
+
+    def get_data(self):
+        return self.text.getvalue()
+
+
+def strip_tags(html):
+    """Return *html* with tags removed and character references decoded."""
+    s = MLStripper()
+    s.feed(html)
+    # close() flushes text the parser is still buffering (e.g. a trailing
+    # "AT&T", which feed() alone holds back as a possibly unfinished charref).
+    s.close()
+    return s.get_data()
diff --git a/nyx_bot/wordcloud.py b/nyx_bot/wordcloud.py
new file mode 100644
index 0000000..4dae880
--- /dev/null
+++ b/nyx_bot/wordcloud.py
@@ -0,0 +1,171 @@
+import asyncio
+import logging
+import os
+import re
+from asyncio import create_subprocess_exec
+from asyncio.subprocess import PIPE
+from io import BytesIO, StringIO
+
+from nio import AsyncClient, MatrixRoom, RoomMessageText, UploadResponse
+from wand.image import Image
+from wordcloud import WordCloud
+
+import nyx_bot
+from nyx_bot.errors import NyxBotRuntimeError
+from nyx_bot.storage import MatrixMessage
+from nyx_bot.utils import strip_tags
+
+CUTWORDS_EXE = "nyx_bot-cutword"
+FONT = os.path.join(nyx_bot.__path__[0], "wordcloud_font.ttf")
+# Rich-reply fallback that Matrix clients prepend to formatted_body; it quotes
+# someone else's message and must not be counted as the sender's words.
+REPLY_FALLBACK = re.compile(r"<mx-reply>.*?</mx-reply>", re.DOTALL)
+logger = logging.getLogger(__name__)
+
+
+async def get_word_freqs(text):
+    """Count the words in *text* using the external segmenter.
+
+    *text* is piped to ``CUTWORDS_EXE``, whose ``word<TAB>count`` output lines
+    are parsed into a ``{word: count}`` dict.
+
+    Raises NyxBotRuntimeError if the segmenter exits with a non-zero status.
+    """
+    proc = await create_subprocess_exec(
+        CUTWORDS_EXE,
+        stdin=PIPE,
+        stdout=PIPE,
+    )
+
+    stdout, _ = await proc.communicate(input=text.encode("utf-8"))
+    if proc.returncode != 0:
+        raise NyxBotRuntimeError(
+            f"{CUTWORDS_EXE} exited with status {proc.returncode}"
+        )
+
+    freqs = {}
+    # Split on "\n" only (str.splitlines() also breaks on characters such as
+    # U+2028) and take the count from the right-hand side of the last tab, so
+    # a word that itself contains whitespace cannot break the parse.
+    for line in stdout.decode("utf-8").split("\n"):
+        word, sep, freq = line.rpartition("\t")
+        if not sep or not word:
+            continue
+        freqs[word] = int(freq)
+
+    return freqs
+
+
+def make_image(freqs, bytesio):
+    """Render *freqs* as an 800x400 word cloud PNG written into *bytesio*."""
+    image = (
+        WordCloud(
+            font_path=FONT,
+            width=800,
+            height=400,
+        )
+        .generate_from_frequencies(freqs)
+        .to_image()
+    )
+    image.save(bytesio, "PNG")
+
+
+async def send_wordcloud(
+    client: AsyncClient,
+    room: MatrixRoom,
+    event: RoomMessageText,
+    sender: str,
+):
+    """Reply to *event* in *room* with a word cloud of *sender*'s messages.
+
+    Raises NyxBotRuntimeError if there are no words to draw or the upload fails.
+    """
+    bytesio = BytesIO()
+    texts = gather_messages(room, sender)
+    freqs = await get_word_freqs(texts)
+    if not freqs:
+        # WordCloud raises ValueError when given no words at all.
+        raise NyxBotRuntimeError("No words available to build a word cloud.")
+
+    # Rendering is CPU-bound; keep it off the event loop.
+    loop = asyncio.get_running_loop()
+    await loop.run_in_executor(None, make_image, freqs, bytesio)
+
+    length = bytesio.getbuffer().nbytes
+    bytesio.seek(0)
+
+    image = Image(file=bytesio)
+    (width, height) = (image.width, image.height)
+
+    # Seek again
+    bytesio.seek(0)
+    resp, maybe_keys = await client.upload(
+        bytesio,
+        content_type="image/png",
+        filename="image.png",
+        filesize=length,
+    )
+    if not isinstance(resp, UploadResponse):
+        # Without a content URI there is nothing to send; stop here instead of
+        # failing later with an AttributeError on resp.content_uri.
+        logger.error("Failed to upload image. Failure response: %s", resp)
+        raise NyxBotRuntimeError(f"Failed to upload image: {resp}")
+    logger.info("Image was uploaded successfully to server. ")
+
+    content = {
+        "body": "[Wordcloud]",
+        "info": {
+            "size": length,
+            "mimetype": "image/png",
+            "thumbnail_info": {
+                "mimetype": "image/png",
+                "size": length,
+                "w": width,  # width in pixel
+                "h": height,  # height in pixel
+            },
+            "w": width,  # width in pixel
+            "h": height,  # height in pixel
+            "thumbnail_url": resp.content_uri,
+        },
+        "msgtype": "m.image",
+        "url": resp.content_uri,
+    }
+
+    content["m.relates_to"] = {"m.in_reply_to": {"event_id": event.event_id}}
+
+    # Add custom data for tracking bot message.
+    content["io.github.shadowrz.nyx_bot"] = {
+        "in_reply_to": event.event_id,
+        "type": "image",
+    }
+
+    await client.room_send(room.room_id, message_type="m.room.message", content=content)
+
+
+def gather_messages(
+    room: MatrixRoom,
+    sender: str,
+):
+    """Return the text of every stored message from *sender* in *room*.
+
+    One message per line; HTML bodies have the reply fallback and tags removed.
+    """
+    stringio = StringIO()
+    msg_items = (
+        MatrixMessage.select()
+        .where(
+            (MatrixMessage.room_id == room.room_id) & (MatrixMessage.sender == sender)
+        )
+        .order_by(MatrixMessage.origin_server_ts.desc())
+    )
+    for msg_item in msg_items:
+        if msg_item.formatted_body is not None:
+            string = REPLY_FALLBACK.sub("", msg_item.formatted_body)
+            print(strip_tags(string), file=stringio)
+        elif msg_item.body is not None:
+            print(msg_item.body, file=stringio)
+        else:
+            continue
+
+    ret = stringio.getvalue()
+    return ret
diff --git a/nyx_bot/wordcloud_font.ttf b/nyx_bot/wordcloud_font.ttf
new file mode 100644
index 0000000..73fcdaf
Binary files /dev/null and b/nyx_bot/wordcloud_font.ttf differ