Add word cloud functions

This commit is contained in:
夜坂雅 2022-12-25 21:07:13 +08:00
parent 584d131bfb
commit 2e0f2dc902
10 changed files with 518 additions and 1 deletions

101
NOTICES-3RDPARTY.txt Normal file
View File

@ -0,0 +1,101 @@
THIRD PARTY FILE LICENSES:
nyx_bot/wordcloud_font.ttf:
This file was originally named `SourceHanSansSC-Regular.otf` and is available under the following license:
Copyright 2014-2021 Adobe (http://www.adobe.com/), with Reserved Font
Name 'Source'. Source is a trademark of Adobe in the United States
and/or other countries.
This Font Software is licensed under the SIL Open Font License,
Version 1.1.
This license is copied below, and is also available with a FAQ at:
http://scripts.sil.org/OFL
-----------------------------------------------------------
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
-----------------------------------------------------------
PREAMBLE
The goals of the Open Font License (OFL) are to stimulate worldwide
development of collaborative font projects, to support the font
creation efforts of academic and linguistic communities, and to
provide a free and open framework in which fonts may be shared and
improved in partnership with others.
The OFL allows the licensed fonts to be used, studied, modified and
redistributed freely as long as they are not sold by themselves. The
fonts, including any derivative works, can be bundled, embedded,
redistributed and/or sold with any software provided that any reserved
names are not used by derivative works. The fonts and derivatives,
however, cannot be released under any other type of license. The
requirement for fonts to remain under this license does not apply to
any document created using the fonts or their derivatives.
DEFINITIONS
"Font Software" refers to the set of files released by the Copyright
Holder(s) under this license and clearly marked as such. This may
include source files, build scripts and documentation.
"Reserved Font Name" refers to any names specified as such after the
copyright statement(s).
"Original Version" refers to the collection of Font Software
components as distributed by the Copyright Holder(s).
"Modified Version" refers to any derivative made by adding to,
deleting, or substituting -- in part or in whole -- any of the
components of the Original Version, by changing formats or by porting
the Font Software to a new environment.
"Author" refers to any designer, engineer, programmer, technical
writer or other person who contributed to the Font Software.
PERMISSION & CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Font Software, to use, study, copy, merge, embed,
modify, redistribute, and sell modified and unmodified copies of the
Font Software, subject to the following conditions:
1) Neither the Font Software nor any of its individual components, in
Original or Modified Versions, may be sold by itself.
2) Original or Modified Versions of the Font Software may be bundled,
redistributed and/or sold with any software, provided that each copy
contains the above copyright notice and this license. These can be
included either as stand-alone text files, human-readable headers or
in the appropriate machine-readable metadata fields within text or
binary files as long as those fields can be easily viewed by the user.
3) No Modified Version of the Font Software may use the Reserved Font
Name(s) unless explicit written permission is granted by the
corresponding Copyright Holder. This restriction only applies to the
primary font name as presented to the users.
4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
Software shall not be used to promote, endorse or advertise any
Modified Version, except to acknowledge the contribution(s) of the
Copyright Holder(s) and the Author(s) or with their explicit written
permission.
5) The Font Software, modified or unmodified, in part or in whole,
must be distributed entirely under this license, and must not be
distributed under any other license. The requirement for fonts to
remain under this license does not apply to any document created using
the Font Software.
TERMINATION
This license becomes null and void if any of the above conditions are
not met.
DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
OTHER DEALINGS IN THE FONT SOFTWARE.

View File

@ -106,6 +106,17 @@ pip install -e ".[postgres]"
[Sarasa Gothic](https://github.com/be5invis/Sarasa-Gothic) should be installed for best quote image results. It is also recommended to install the Noto Color Emoji font on the machine running the bot.
## Build word segmenter
Use Rust to build the segmenter:
```
cd cutword
cargo build --release
```
Copy `target/release/nyx_bot-cutword` to a directory on the `PATH` available to the bot.
## Configuration
Copy the sample configuration file to a new `config.yaml` file.

1
cutword/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

158
cutword/Cargo.lock generated Normal file
View File

@ -0,0 +1,158 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "0.7.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
dependencies = [
"memchr",
]
[[package]]
name = "byteorder"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "cedarwood"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
dependencies = [
"smallvec",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "hashbrown"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "jieba-rs"
version = "0.6.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37228e06c75842d1097432d94d02f37fe3ebfca9791c2e8fef6e9db17ed128c1"
dependencies = [
"cedarwood",
"fxhash",
"hashbrown",
"lazy_static",
"phf",
"phf_codegen",
"regex",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "nyx_bot-cutword"
version = "0.1.0"
dependencies = [
"jieba-rs",
]
[[package]]
name = "phf"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a56ac890c5e3ca598bbdeaa99964edb5b0258a583a9eb6ef4e89fc85d9224770"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf"
dependencies = [
"phf_shared",
"rand",
]
[[package]]
name = "phf_shared"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
dependencies = [
"siphasher",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
[[package]]
name = "regex"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
[[package]]
name = "siphasher"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
[[package]]
name = "smallvec"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"

9
cutword/Cargo.toml Normal file
View File

@ -0,0 +1,9 @@
[package]
name = "nyx_bot-cutword"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
jieba-rs = "*"

60
cutword/src/main.rs Normal file
View File

@ -0,0 +1,60 @@
use jieba_rs::Jieba;
use std::collections::HashMap;
use std::io::{Result, Write};
/// Segment the text on stdin with jieba and print word frequencies.
///
/// Each input line is tagged by part of speech. Empty lines and lines that
/// start with `/` are skipped, as are tokens whose tag is listed in
/// `STOP_FLAGS` or whose UTF-8 encoding is longer than 21 bytes. Remaining
/// words are lowercased and counted, and one `word<TAB>count` line per
/// distinct word is written to stdout. Reading stops at the first line that
/// cannot be read; whatever was counted up to that point is still printed.
fn main() -> Result<()> {
    let segmenter = Jieba::new();
    let mut counts = HashMap::new();

    // `map_while` ends the iteration at the first read error.
    for text in std::io::stdin().lines().map_while(|l| l.ok()) {
        if text.is_empty() || text.starts_with('/') {
            continue;
        }
        let wanted = segmenter
            .tag(&text, true)
            .into_iter()
            .filter(|t| !STOP_FLAGS.contains(&t.tag) && t.word.len() <= 21);
        for token in wanted {
            *counts.entry(token.word.to_lowercase()).or_insert(0) += 1;
        }
    }

    let mut out = std::io::stdout().lock();
    for (word, count) in &counts {
        writeln!(out, "{}\t{}", word, count)?;
    }
    Ok(())
}
/// Part-of-speech tags whose tokens `main` leaves out of the word counts.
const STOP_FLAGS: &[&str] = &[
"d", // adverb
"f", // locative noun
"x", // punctuation (the docs say `w`, but testing shows the tag is actually `x`)
"p", // preposition
"t", // time word
"q", // measure word
"m", // numeral / quantity word
"nr", // personal name; also "you", "me", "him"
"r", // pronoun
"c", // conjunction
"e", // undocumented; looks like modal particles / interjections
"xc", // other function words
"zg", // undocumented; no obvious pattern in the words it yields, but none are wanted
"y", // undocumented; looks like modal particles
// Tags starting with `u` are all particles; the docs do not describe the finer subdivisions.
"uj", "ug", "ul", "ud",
];

View File

@ -24,6 +24,7 @@ from nyx_bot.config import Config
from nyx_bot.errors import NyxBotRuntimeError, NyxBotValueError
from nyx_bot.storage import MatrixMessage, MembershipUpdates, UserTag
from nyx_bot.utils import make_divergence, parse_matrixdotto_link
from nyx_bot.wordcloud import send_wordcloud
logger = logging.getLogger(__name__)
@ -106,6 +107,8 @@ class Command:
await self._last_message()
elif self.command == "divergence":
await self._divergence()
elif self.command == "wordcloud":
await self._wordcloud()
else:
await self._unknown_command()
@ -536,3 +539,18 @@ Outside of a reply, send the avatar of the command sender.\
raise NyxBotRuntimeError(f"Failed to fetch event: {error}")
sender = target_event.event.sender
UserTag.delete_user_tag(self.room.room_id, sender)
async def _wordcloud(self):
    """Send a word cloud for one user's messages in the current room.

    Outside of a reply the target user is the command sender; inside a
    reply it is the sender of the replied-to event.

    Raises:
        NyxBotRuntimeError: If the replied-to event cannot be fetched.
    """
    if not self.reply_to:
        sender = self.event.sender
    else:
        target_event = await self.client.room_get_event(
            self.room.room_id, self.reply_to
        )
        if isinstance(target_event, RoomGetEventError):
            error = target_event.message
            raise NyxBotRuntimeError(f"Failed to fetch event: {error}")
        sender = target_event.event.sender
    await self.client.room_typing(self.room.room_id)
    try:
        await send_wordcloud(self.client, self.room, self.event, sender)
    finally:
        # Always clear the typing notification, even when generating or
        # sending the image fails; otherwise it lingers until it times out.
        await self.client.room_typing(self.room.room_id, False)

View File

@ -1,5 +1,6 @@
from datetime import datetime
from io import BytesIO
from html.parser import HTMLParser
from io import BytesIO, StringIO
from random import Random
from typing import Optional
from urllib.parse import unquote, urlparse
@ -213,3 +214,24 @@ def make_divergence(room: MatrixRoom):
result = first_value
return result
class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects only the text content."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is
        # needed. convert_charrefs=True makes the parser decode character
        # references (e.g. ``&amp;``) before handing text to handle_data().
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append one run of text content to the internal buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far."""
        return self.text.getvalue()


def strip_tags(html):
    """Return *html* with all tags removed and character references decoded.

    Args:
        html: The HTML fragment to strip.

    Returns:
        The concatenated text content of the fragment.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text that contains an unterminated "&"
    # (it could be the start of a character reference continued by a later
    # feed). close() flushes that buffered text so nothing is lost.
    s.close()
    return s.get_data()

137
nyx_bot/wordcloud.py Normal file
View File

@ -0,0 +1,137 @@
import asyncio
import logging
import os
import re
from asyncio import create_subprocess_exec
from asyncio.subprocess import PIPE
from io import BytesIO, StringIO
from nio import AsyncClient, MatrixRoom, RoomMessageText, UploadResponse
from wand.image import Image
from wordcloud import WordCloud
import nyx_bot
from nyx_bot.storage import MatrixMessage
from nyx_bot.utils import strip_tags
CUTWORDS_EXE = "nyx_bot-cutword"
FONT = os.path.join(nyx_bot.__path__[0], "wordcloud_font.ttf")
logger = logging.getLogger(__name__)
async def get_word_freqs(text):
    """Count word frequencies in *text* with the external segmenter.

    The text is piped, UTF-8 encoded, to the ``CUTWORDS_EXE`` subprocess and
    its standard output is parsed as one ``word<TAB>count`` record per line.

    Args:
        text: The text to segment, one message per line.

    Returns:
        A dict mapping each word to its integer count. Records that cannot
        be parsed are skipped.
    """
    proc = await create_subprocess_exec(
        CUTWORDS_EXE,
        stdin=PIPE,
        stdout=PIPE,
    )
    stdout, _ = await proc.communicate(input=text.encode("utf-8"))
    if proc.returncode != 0:
        logger.warning("%s exited with status %s", CUTWORDS_EXE, proc.returncode)

    freqs = {}
    # Split on "\n" only: str.splitlines() also breaks on characters such as
    # U+2028, which would cut a record in half if one appeared in a word.
    for line in stdout.decode("utf-8").split("\n"):
        # The count is whatever follows the LAST tab, so a word that itself
        # contains whitespace stays intact.
        word, sep, freq = line.rpartition("\t")
        if not sep or not word:
            continue
        try:
            freqs[word] = int(freq)
        except ValueError:
            logger.warning("Ignoring malformed segmenter output: %r", line)
    return freqs
def make_image(freqs, bytesio):
    """Render *freqs* as an 800x400 word cloud and save it as PNG.

    Args:
        freqs: Mapping of word to frequency, passed to
            ``WordCloud.generate_from_frequencies``.
        bytesio: Writable binary stream that receives the PNG data.
    """
    cloud = WordCloud(font_path=FONT, width=800, height=400)
    populated = cloud.generate_from_frequencies(freqs)
    rendered = populated.to_image()
    rendered.save(bytesio, "PNG")
async def send_wordcloud(
    client: AsyncClient,
    room: MatrixRoom,
    event: RoomMessageText,
    sender: str,
):
    """Build a word cloud of *sender*'s messages and post it as a reply.

    Args:
        client: Client used to upload the image and send the message.
        room: Room whose stored messages are analysed and which receives
            the image.
        event: The event the image is sent in reply to.
        sender: User ID whose messages are gathered.

    Nothing is sent when there are no words to plot or when the upload
    fails; both cases are logged.
    """
    texts = gather_messages(room, sender)
    freqs = await get_word_freqs(texts)
    if not freqs:
        # WordCloud.generate_from_frequencies() raises ValueError when it is
        # given no words, so stop before trying to render.
        logger.warning("No words to plot for %s in %s", sender, room.room_id)
        return

    bytesio = BytesIO()
    loop = asyncio.get_running_loop()
    # Rendering is blocking work, so it runs in the default executor.
    await loop.run_in_executor(None, make_image, freqs, bytesio)

    length = bytesio.getbuffer().nbytes
    bytesio.seek(0)
    # Open the image only to read its dimensions, and release it right away.
    with Image(file=bytesio) as image:
        (width, height) = (image.width, image.height)
    # Seek again so the upload reads the PNG from the start.
    bytesio.seek(0)
    resp, _ = await client.upload(
        bytesio,
        content_type="image/png",
        filename="image.png",
        filesize=length,
    )
    if not isinstance(resp, UploadResponse):
        # An error response has no content URI to reference; give up here
        # instead of failing later with an AttributeError.
        logger.error("Failed to upload image. Failure response: %s", resp)
        return
    logger.info("Image was uploaded successfully to server.")

    content = {
        "body": "[Wordcloud]",
        "info": {
            "size": length,
            "mimetype": "image/png",
            "thumbnail_info": {
                "mimetype": "image/png",
                "size": length,
                "w": width,  # width in pixel
                "h": height,  # height in pixel
            },
            "w": width,  # width in pixel
            "h": height,  # height in pixel
            "thumbnail_url": resp.content_uri,
        },
        "msgtype": "m.image",
        "url": resp.content_uri,
    }
    content["m.relates_to"] = {"m.in_reply_to": {"event_id": event.event_id}}
    # Add custom data for tracking bot message.
    content["io.github.shadowrz.nyx_bot"] = {
        "in_reply_to": event.event_id,
        "type": "image",
    }
    await client.room_send(room.room_id, message_type="m.room.message", content=content)
def gather_messages(
    room: MatrixRoom,
    sender: str,
):
    """Collect the stored message text of *sender* in *room*.

    Messages are selected from ``MatrixMessage`` by room and sender, ordered
    by ``origin_server_ts`` descending. When a message has a formatted body,
    its ``<mx-reply>`` section is removed and the remaining HTML is reduced
    to plain text; otherwise the plain body is used. Messages with neither
    are skipped.

    Args:
        room: Room whose messages are selected.
        sender: User ID whose messages are selected.

    Returns:
        A single string with one message per line.
    """
    # DOTALL lets the quoted reply span several lines; the non-greedy match
    # stops at the first closing tag so the user's own text is not swallowed.
    mx_reply = re.compile(r"<mx-reply>.*?</mx-reply>", re.DOTALL)
    stringio = StringIO()
    msg_items = (
        MatrixMessage.select()
        .where(
            (MatrixMessage.room_id == room.room_id) & (MatrixMessage.sender == sender)
        )
        .order_by(MatrixMessage.origin_server_ts.desc())
    )
    for msg_item in msg_items:
        if msg_item.formatted_body is not None:
            string = mx_reply.sub("", msg_item.formatted_body)
            print(strip_tags(string), file=stringio)
        elif msg_item.body is not None:
            print(msg_item.body, file=stringio)
    return stringio.getvalue()

BIN
nyx_bot/wordcloud_font.ttf Normal file

Binary file not shown.