Update word cutter

This commit is contained in:
夜坂雅 2023-02-09 12:16:33 +08:00
parent b47b92fc01
commit 8c82de7bc1
4 changed files with 78 additions and 7 deletions

11
cutword/Cargo.lock generated
View file

@ -11,6 +11,12 @@ dependencies = [
"memchr",
]
[[package]]
name = "anyhow"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
[[package]]
name = "byteorder"
version = "1.4.3"
@ -72,6 +78,7 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
name = "nyx_bot-cutword"
version = "0.1.0"
dependencies = [
"anyhow",
"jieba-rs",
]
@ -130,9 +137,9 @@ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
[[package]]
name = "regex"
version = "1.7.0"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a"
checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
dependencies = [
"aho-corasick",
"memchr",

View file

@ -6,4 +6,5 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0"
jieba-rs = "*"

View file

@ -1,9 +1,10 @@
#!/usr/bin/env python3
import sys
from jieba import posseg
from collections import defaultdict
import jieba
from jieba import posseg
STOP_FLAGS = [
"d", # 副词
"f", # 方位名词
@ -26,8 +27,22 @@ STOP_FLAGS = [
"ud",
]
try:
jieba.load_userdict("userdict.txt")
except: # noqa: E722
pass
result = defaultdict(int)
stopwords = set()
try:
with open("StopWords-simple.txt") as f:
for line in f:
stopwords.add(line.strip())
except: # noqa: E722
pass
for line in sys.stdin:
if line.startswith("/"):
continue
@ -37,6 +52,8 @@ for line in sys.stdin:
for word, flag in words:
if flag in STOP_FLAGS:
continue
if word.lower() in stopwords:
continue
result[word.lower()] += 1

View file

@ -1,11 +1,23 @@
use anyhow::{anyhow, Result};
use jieba_rs::Jieba;
use std::collections::HashMap;
use std::io::{Result, Write};
use std::collections::HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader, Write};
fn main() -> Result<()> {
let jieba = Jieba::new();
let mut jieba = Jieba::new();
let stdin = std::io::stdin();
let mut result = HashMap::new();
let stderr = std::io::stderr();
let mut stderr = stderr.lock();
if let Err(e) = load_dict(&mut jieba) {
writeln!(stderr, "Reading userdict.txt failed: {:#}", e)?;
}
let stopwords = match load_stopwords() {
Ok(s) => s,
Err(_) => HashSet::new(),
};
for line in stdin.lines() {
match line {
Ok(line) => {
@ -21,8 +33,12 @@ fn main() -> Result<()> {
if STOP_FLAGS.contains(&tag.tag) || tag.word.len() > 21 {
continue;
}
let word = tag.word.to_lowercase();
if stopwords.contains(&word) {
continue;
}
result
.entry(tag.word.to_lowercase())
.entry(word)
.and_modify(|c| *c += 1)
.or_insert(1);
}
@ -40,6 +56,36 @@ fn main() -> Result<()> {
Ok(())
}
/// Loads custom dictionary entries from `userdict.txt` in the working
/// directory into the given `Jieba` instance.
///
/// Each non-blank line must contain at least two whitespace-separated
/// fields: the word followed by its POS tag (frequency is left to
/// jieba's default). Blank lines are skipped.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, a line cannot be
/// read, or a non-blank line has fewer than two fields.
fn load_dict(jieba: &mut Jieba) -> Result<()> {
    let file = BufReader::new(File::open("userdict.txt")?);
    for line in file.lines() {
        // Propagate read failures instead of silently truncating the
        // dictionary (the previous `Err(_) => break` swallowed them).
        let line = line?;
        if line.trim().is_empty() {
            continue;
        }
        let mut it = line.split_whitespace();
        let word = it.next().ok_or_else(|| anyhow!("Bad line: {}", line))?;
        let tag = Some(it.next().ok_or_else(|| anyhow!("Bad line: {}", line))?);
        jieba.add_word(word, None, tag);
    }
    Ok(())
}
/// Reads the stopword list from `StopWords-simple.txt` in the working
/// directory, one word per line, and returns it as a set.
///
/// Lines are trimmed of surrounding whitespace — this drops the
/// trailing `\r` on CRLF files (which would otherwise produce
/// stopwords that never match) and mirrors the Python counterpart's
/// `line.strip()`. Blank lines are ignored.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or a line cannot be
/// read.
fn load_stopwords() -> Result<HashSet<String>> {
    let file = BufReader::new(File::open("StopWords-simple.txt")?);
    let mut result = HashSet::new();
    for line in file.lines() {
        // Propagate read failures instead of silently truncating the
        // stopword list (the previous `Err(_) => break` swallowed them).
        let line = line?;
        let word = line.trim();
        if !word.is_empty() {
            result.insert(word.to_string());
        }
    }
    Ok(result)
}
const STOP_FLAGS: &[&str] = &[
"d", // 副词
"f", // 方位名词