Update word cutter
This commit is contained in:
parent
b47b92fc01
commit
8c82de7bc1
11
cutword/Cargo.lock
generated
11
cutword/Cargo.lock
generated
|
@ -11,6 +11,12 @@ dependencies = [
|
|||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
|
@ -72,6 +78,7 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
|||
name = "nyx_bot-cutword"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"jieba-rs",
|
||||
]
|
||||
|
||||
|
@ -130,9 +137,9 @@ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
|||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.7.0"
|
||||
version = "1.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a"
|
||||
checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
|
|
|
@ -6,4 +6,5 @@ edition = "2021"
|
|||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
jieba-rs = "*"
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
#!/usr/bin/env python3
|
||||
import sys
|
||||
|
||||
from jieba import posseg
|
||||
from collections import defaultdict
|
||||
|
||||
import jieba
|
||||
from jieba import posseg
|
||||
|
||||
STOP_FLAGS = [
|
||||
"d", # 副词
|
||||
"f", # 方位名词
|
||||
|
@ -26,8 +27,22 @@ STOP_FLAGS = [
|
|||
"ud",
|
||||
]
|
||||
|
||||
try:
|
||||
jieba.load_userdict("userdict.txt")
|
||||
except: # noqa: E722
|
||||
pass
|
||||
|
||||
result = defaultdict(int)
|
||||
|
||||
stopwords = set()
|
||||
|
||||
try:
|
||||
with open("StopWords-simple.txt") as f:
|
||||
for line in f:
|
||||
stopwords.add(line.strip())
|
||||
except: # noqa: E722
|
||||
pass
|
||||
|
||||
for line in sys.stdin:
|
||||
if line.startswith("/"):
|
||||
continue
|
||||
|
@ -37,6 +52,8 @@ for line in sys.stdin:
|
|||
for word, flag in words:
|
||||
if flag in STOP_FLAGS:
|
||||
continue
|
||||
if word.lower() in stopwords:
|
||||
continue
|
||||
result[word.lower()] += 1
|
||||
|
||||
|
||||
|
|
|
@ -1,11 +1,23 @@
|
|||
use anyhow::{anyhow, Result};
|
||||
use jieba_rs::Jieba;
|
||||
use std::collections::HashMap;
|
||||
use std::io::{Result, Write};
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Write};
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let jieba = Jieba::new();
|
||||
let mut jieba = Jieba::new();
|
||||
let stdin = std::io::stdin();
|
||||
let mut result = HashMap::new();
|
||||
let stderr = std::io::stderr();
|
||||
let mut stderr = stderr.lock();
|
||||
if let Err(e) = load_dict(&mut jieba) {
|
||||
writeln!(stderr, "Reading userdict.txt failed: {:#}", e)?;
|
||||
}
|
||||
let stopwords = match load_stopwords() {
|
||||
Ok(s) => s,
|
||||
Err(_) => HashSet::new(),
|
||||
};
|
||||
for line in stdin.lines() {
|
||||
match line {
|
||||
Ok(line) => {
|
||||
|
@ -21,8 +33,12 @@ fn main() -> Result<()> {
|
|||
if STOP_FLAGS.contains(&tag.tag) || tag.word.len() > 21 {
|
||||
continue;
|
||||
}
|
||||
let word = tag.word.to_lowercase();
|
||||
if stopwords.contains(&word) {
|
||||
continue;
|
||||
}
|
||||
result
|
||||
.entry(tag.word.to_lowercase())
|
||||
.entry(word)
|
||||
.and_modify(|c| *c += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
|
@ -40,6 +56,36 @@ fn main() -> Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn load_dict(jieba: &mut Jieba) -> Result<()> {
|
||||
let file = BufReader::new(File::open("userdict.txt")?);
|
||||
for line in file.lines() {
|
||||
match line {
|
||||
Ok(line) => {
|
||||
let mut it = line.split_whitespace();
|
||||
let word = it.next().ok_or_else(|| anyhow!("Bad line: {}", line))?;
|
||||
let tag = Some(it.next().ok_or_else(|| anyhow!("Bad line: {}", line))?);
|
||||
jieba.add_word(word, None, tag);
|
||||
}
|
||||
Err(_) => break,
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_stopwords() -> Result<HashSet<String>> {
|
||||
let file = BufReader::new(File::open("StopWords-simple.txt")?);
|
||||
let mut result = HashSet::new();
|
||||
for line in file.lines() {
|
||||
match line {
|
||||
Ok(line) => {
|
||||
result.insert(line);
|
||||
}
|
||||
Err(_) => break,
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
const STOP_FLAGS: &[&str] = &[
|
||||
"d", // 副词
|
||||
"f", // 方位名词
|
||||
|
|
Loading…
Reference in a new issue