Update to 0.9
- add a perl script st-wordbreak
This commit is contained in:
parent 7f74bed877
commit 48e614e23d
5 changed files with 310 additions and 22 deletions
textproc/split-thai/DESCR
@@ -1,7 +1,7 @@
 A collection of utilities to split Thai Unicode UTF-8 text by word
-boundaries, also known as word tokenization. The utilities use emacs,
-swath, and a c++ icu-project program. All use dictionary-based word
-splitting.
+boundaries, also known as word tokenization or word breaking. The
+utilities use emacs, swath, perl, and a c++ icu-project program. All
+use dictionary-based word splitting.
 
 Also included is a merged dictionary file of Thai words and a perl
 script to grep Thai UTF-8 words.
textproc/split-thai/Makefile
@@ -1,7 +1,6 @@
-# $NetBSD: Makefile,v 1.9 2020/08/31 18:12:18 wiz Exp $
+# $NetBSD: Makefile,v 1.10 2020/09/05 18:02:36 scole Exp $
 
-PKGNAME= split-thai-0.8
-PKGREVISION= 1
+PKGNAME= split-thai-0.9
 CATEGORIES= textproc
 MAINTAINER= pkgsrc-users@NetBSD.org
 COMMENT= Utilities to split UTF-8 Thai text into words
@@ -21,7 +20,7 @@ BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie
 DEPENDS+= emacs-[0-9]*:../../editors/emacs
 DEPENDS+= swath-[0-9]*:../../textproc/swath
 
-REPLACE_PERL= tgrep
+REPLACE_PERL= st-wordbreak tgrep
 REPLACE_SH= st-swath
 
 UTF8_ENV= env LC_ALL=C.UTF-8
@@ -43,14 +42,14 @@ SUBST_SED.st-emacs-app= -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
 SUBST_CLASSES+= dictionary-app
 SUBST_STAGE.dictionary-app= pre-configure
 SUBST_MESSAGE.dictionary-app= Fixing dictionary paths.
-SUBST_FILES.dictionary-app= st-emacs st-swath
+SUBST_FILES.dictionary-app= st-emacs st-swath st-wordbreak
 SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
 SUBST_SED.dictionary-app+= -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
 
 pre-extract:
 	mkdir -p ${WRKSRC}
 	cd files && cp README.txt st-emacs st-icu.cc st-swath \
-		tgrep thai-utility.el thaidict.abm ${WRKSRC}
+		st-wordbreak tgrep thai-utility.el thaidict.abm ${WRKSRC}
 
 post-extract:
 	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
@@ -83,7 +82,7 @@ do-build:
 
 do-install:
 	${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
-		${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin
+		${WRKSRC}/st-wordbreak ${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin
 	${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
 .for i in ${ST_SHARE_FILES}
 	${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
textproc/split-thai/PLIST
@@ -1,7 +1,8 @@
-@comment $NetBSD: PLIST,v 1.3 2020/08/28 16:02:42 scole Exp $
+@comment $NetBSD: PLIST,v 1.4 2020/09/05 18:02:36 scole Exp $
 bin/st-emacs
 bin/st-icu
 bin/st-swath
+bin/st-wordbreak
 bin/tgrep
 share/split-thai/README.txt
 share/split-thai/thai-dict.el
textproc/split-thai/files/README.txt
@@ -2,10 +2,11 @@ NAME
 st-emacs
 st-icu
 st-swath
+st-wordbreak
 tgrep
 
 SYNOPSIS
-st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank']
+st-emacs|st-icu|st-swath|st-wordbreak [filename|text1 text2 ...|'blank']
 tgrep [options] FILE ...
 
 DESCRIPTION
@@ -13,14 +14,16 @@ DESCRIPTION
 by spaces (word tokenization). They can separate stdin, files,
 or text as arguments. It includes these utilities:
 
-st-emacs: emacs-script using emacs lisp thai-word library
-https://www.gnu.org/software/emacs/
-st-icu: basic C++ program using the ICU library
-http://site.icu-project.org/
-st-swath: sh script wrapper to simplfy args to the swath program
-https://linux.thai.net/projects/swath
+st-emacs: emacs-script using emacs lisp thai-word library
+https://www.gnu.org/software/emacs/
+st-icu: basic C++ program using the ICU library
+http://site.icu-project.org/
+st-swath: sh script wrapper to simplfy args to the swath program
+https://linux.thai.net/projects/swath
+st-wordbreak: perl script to brute-force separate thai words,
+see "st-wordbreak -h"
 
-tgrep: grep-like utility using perl, see "tgrep -h"
+tgrep: grep-like utility using perl, see "tgrep -h"
 
 EXAMPLES
 split one or more text strings:
@@ -62,9 +65,9 @@ NOTES
 and corresponding .tri file, and emacs lisp .el files for reading
 and dumping out dictionary files.
 
-st-emacs and st-swath are setup to use the combined dictionary
-with words from the emacs 'thai-word library, swath dictionary
-words, and the icu thai library words.
+st-emacs, st-swath, and st-wordbreak are setup to use the
+combined dictionary with words from the emacs 'thai-word library,
+swath dictionary words, and the icu thai library words.
 
 st-icu uses its own built in library. To customise the icu
 dictionary, you apparently would have to modify
@@ -79,3 +82,4 @@ BUGS
 thai text mixed with other languages may not be handled well when
 splitting.
 this file should be converted to proper manpages.
+these utilities need better names.
textproc/split-thai/files/st-wordbreak (new executable file, 284 lines)
#!/bin/perl
#
# split thai text or file into longest (or smallest) possible words
# based on words in a dictionary file.
#
use strict;
use warnings;
use Encode;
use Getopt::Std;

use utf8;
use open qw/:std :utf8/;

our ($opt_a, $opt_d, $opt_h, $opt_l, $opt_s);

# global dictionary of thai words
my %tdict = ();

# tmp results for recursions
my @tmp_combos = ();

#
# read thai dictionary into hash. each line in dictionary should be a
# thai word. returns number of words read.
#
sub read_dictionary {
    # default dictionary
    my $fname = "ST_SHARE_DIR/thaidict";
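    # ST_SHARE_DIR above is a build-time placeholder: the dictionary-app
    # SUBST class in the package Makefile rewrites it to
    # ${PREFIX}/${ST_SHARE_DIR} (presumably the installed share/split-thai
    # directory) before the script is packaged.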
    $fname = $opt_d if defined $opt_d;

    die "error reading dictionary $fname" unless -f $fname && -s $fname;
    open(my $fh, '<', $fname) or die "error opening $fname: $!";
    while ( <$fh> ) {
        chomp;
        $tdict{$_} = 1;
    }
    close $fh or die "error closing $fname : $!";

    # add punctuation and misc so parser won't choke
    my @misc_words =
        ( "~", "`", "!", "@", "#", "\$", "%", "^", "&", "*", "(", ")",
          "-", "_", "=", "+", "\\", "|", "{", "}", "[", "]",
          ";", ":", "'", "\"", "<", ">", ".", ",", "/", "?",
          "ๆ", "ฯาฯ", "ฯ", "฿", "๏", "๚", "๛" );

    foreach ( @misc_words ) {
        $tdict{$_} = 1;
    }

    my $size = keys %tdict;
    return $size;
}

#
# find all possible word breaks for a string,
# returns an array of strings
#
sub word_break {
    my $str = shift @_;

    # return string as is unless it contains thai
    return ( $str ) unless $str =~ m/\p{InThai}/;

    # add numbers to dictionary entries so numbers will be parsed
    my @numbers = ( $str =~ m/([0123456789๑๒๓๔๕๖๗๘๙๐,.]+)/g );
    foreach ( @numbers ) {
        $tdict{$_} = 1;
    }

    # add any english looking words
    my @eng_words = ( $str =~ m/([a-zA-Z]+)/g );
    foreach ( @eng_words ) {
        $tdict{$_} = 1;
    }

    # clear any whitespace
    $str =~ s/\s+//g;

    # clear any previous word breaks
    @tmp_combos = ();
    word_break_util( $str, length($str), "");

    # filter results depending on args
    my @filtered = filter_combos( @tmp_combos );
    return @filtered;
}
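# Example (illustrative): word_break("hello123") comes back unchanged as
# ("hello123") because it contains no Thai characters; for Thai input the
# candidates collected by word_break_util() are passed through
# filter_combos() and whatever survives is returned.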

#
# recursive function to find all possible word combos of string based
# on words in dictionary. adapted from
# https://www.geeksforgeeks.org/word-break-problem-using-backtracking
#
# saves combos to global list
#
sub word_break_util{
    my $str = shift @_;
    my $n = shift @_;
    my $result = shift @_;

    for ( my $i=1; $i <= $n; $i++ ) {
        my $prefix = substr($str, 0, $i);

        if ( exists $tdict{$prefix} ) {
            if ( $i == $n ) {
                $result .= $prefix;
                #print $result, "\n";
                push @tmp_combos, $result;
                return;
            }
            word_break_util( substr($str, $i, $n-$i),
                             $n-$i,$result . $prefix . " ");
        }
    }
}
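# Worked example (illustrative, with a hypothetical dictionary holding only
# "a", "b", and "ab"): word_break_util("aab", 3, "") matches the prefix "a"
# and recurses on "ab", which splits as either "a b" or "ab", so
# @tmp_combos ends up holding ("a a b", "a ab").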

#
# filter results depending on input args
#
sub filter_combos {
    my @swords = @_;

    # do nothing if keeping all or no words given
    if ( ! @swords ||
         $opt_a && ( ! $opt_l && ! $opt_s ) ) {
        return @swords;
    }

    # get count of words for each str, make hash str => word-count
    my %combos = map { $_ => scalar( split / /, $_ ) } @swords;

    # sort by least or most words
    my @scombos = ();
    foreach my $str ( sort
                      { $opt_s ? $combos{$b} <=> $combos{$a} :
                        $combos{$a} <=> $combos{$b} } keys %combos ) {
        push @scombos, $str;
    }

    # only one permutation found, not listing more than one permutation,
    # or only one "best" match
    if ( scalar ( @scombos ) < 2 ||
         ! $opt_a ||
         $combos{$scombos[0]} != $combos{$scombos[1]} ) {
        return ( $scombos[0] );
    }

    # more than one best match and want them listed, so return "best" matches
    my $mark = $combos{$scombos[0]};
    my @best_set = ();
    foreach ( @scombos ) {
        if ( $combos{$_} == $mark ) {
            push @best_set, $_;
        } else {
            last;
        }
    }
    die "could not find best set" unless @best_set;

    return @best_set;
}
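# Example (illustrative): for the candidates ("a a b", "a ab") from the
# comment above, the default -l keeps only "a ab" (fewest words), -s keeps
# only "a a b" (most words), and -a alone returns every candidate; -a
# combined with -l or -s also returns any ties for the best word count.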

#
# word break a file line-by-line, print to stdout,
# filename as arg
#
sub break_file {
    my $fname = shift @_;

    open(my $fh, '<', $fname) or die "error opening $fname: $!";
    while ( my $line = <$fh> ) {
        chomp $line;

        if ( $line !~ m/\p{InThai}/ ) {
            print $line, "\n";
            next;
        }
        my @swords = word_break( $line );

        if ( @swords ) {
            # print out any leading spaces
            print "$1" if $line =~ m/^(\s+)/;
            foreach ( @swords ) {
                print $_, "\n";
            }
        } else {
            print $line, "\n";
        }
    }
    close $fh or die "error closing $fname : $!";
    return 0;
}

#
# usage
#
sub usage {
    print <<"EOF";

NAME
    st-wordbreak - print out possible word breaks for utf-8 Thai
    text, file, or stdin based on words in dictionary file

SYNOPSIS
    st-wordbreak [options] file|[text1 text2 text3]

OPTIONS
    -a  list all combinations possible (per line if breaking a file)

    -d [dictionary-name]
        use dictionary given instead of default

    -h  print out help message

    -l  longest words (or fewest words), lists one "best" match.
        this is the default option. if -a arg also given, include
        all the "best" matches, which could be more than one

    -s  smallest words (or most words), lists one "best" match.
        if -a arg also given, include all the "best" matches,
        which could be more than one

ENVIRONMENT
    You may need to set LC_CTYPE, LC_ALL, or other LC_* to a utf-8
    setting for this program to work

EXIT STATUS
    Return 0 if no errors, non-zero if errors or couldn't split up
    string.

BUGS
    Only utf-8 encodings are supported.
    Uses recursion so may not handle extremely long strings.
    Good splitting results are very dependent on dictionary data. If
    word not in dictionary, may not be able to split at all.

EOF
}

getopts('ad:hls');

if ( $opt_h ) {
    usage();
    exit 0;
}

die "invalid args" if $opt_l && $opt_s;

# default is to split into largest words
$opt_l = 1 if ! $opt_a && ! $opt_l && ! $opt_s;

# read remaining args, convert to utf8
my $argc = scalar( @ARGV );
my @dargv = map { decode('UTF-8', $_ ) } @ARGV;

# read word list
read_dictionary();

# reading stdin or a file?
my $textname;
if ( $argc == 0 || ($argc == 1 && $dargv[0] eq "-") ) {
    $textname = "/dev/stdin";
} elsif ( $argc == 1 && -e $dargv[0] ) {
    $textname = $dargv[0];
}
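# (so running the script with no arguments, with "-", or with a single
# existing file name streams that input through break_file(); any other
# arguments are treated as literal text below)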

# splitting a file
if ( $textname ) {
    break_file( $textname );
} else {
    # splitting text args
    my $str = join(' ', @dargv);
    my @swords = word_break( $str );

    if ( ! @swords ) {
        # could not parse, print original inputs
        print join(' ', @dargv), "\n";
        exit 1;
    } else {
        foreach ( @swords ) {
            print $_, "\n";
        }
        exit 0;
    }
}
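For readers new to the technique, the following is a minimal standalone
sketch (not part of the commit) of the same dictionary-based backtracking
idea that st-wordbreak applies to Thai text, using a toy ASCII dictionary;
like the script's default -l behaviour, it prefers the split with the
fewest words:

#!/usr/bin/perl
# Toy illustration only: a tiny ASCII dictionary stands in for the Thai word list.
use strict;
use warnings;

my %dict;
$dict{$_} = 1 for qw(i ice cream icecream man);

my @splits;

# Collect every segmentation of $str whose pieces are all dictionary words.
sub break_all {
    my ($str, $sofar) = @_;
    if ($str eq '') {
        push @splits, join(' ', @$sofar);
        return;
    }
    for my $i (1 .. length $str) {
        my $prefix = substr($str, 0, $i);
        next unless $dict{$prefix};
        break_all(substr($str, $i), [ @$sofar, $prefix ]);
    }
}

break_all('icecreamman', []);

# Prefer the segmentation with the fewest words, like st-wordbreak's default -l.
my ($best) = sort { scalar(split / /, $a) <=> scalar(split / /, $b) } @splits;
print "$_\n" for @splits;   # "ice cream man" and "icecream man"
print "best: $best\n";      # "best: icecream man"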