Update to 0.9

- add a perl script st-wordbreak
This commit is contained in:
scole 2020-09-05 18:02:36 +00:00
parent 7f74bed877
commit 48e614e23d
5 changed files with 310 additions and 22 deletions

View file

@ -1,7 +1,7 @@
A collection of utilities to split Thai Unicode UTF-8 text by word
boundaries, also known as word tokenization. The utilities use emacs,
swath, and a c++ icu-project program. All use dictionary-based word
splitting.
boundaries, also known as word tokenization or word breaking. The
utilities use emacs, swath, perl, and a c++ icu-project program. All
use dictionary-based word splitting.
Also included is a merged dictionary file of Thai words and a perl
script to grep Thai UTF-8 words.

View file

@ -1,7 +1,6 @@
# $NetBSD: Makefile,v 1.9 2020/08/31 18:12:18 wiz Exp $
# $NetBSD: Makefile,v 1.10 2020/09/05 18:02:36 scole Exp $
PKGNAME= split-thai-0.8
PKGREVISION= 1
PKGNAME= split-thai-0.9
CATEGORIES= textproc
MAINTAINER= pkgsrc-users@NetBSD.org
COMMENT= Utilities to split UTF-8 Thai text into words
@ -21,7 +20,7 @@ BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie
DEPENDS+= emacs-[0-9]*:../../editors/emacs
DEPENDS+= swath-[0-9]*:../../textproc/swath
REPLACE_PERL= tgrep
REPLACE_PERL= st-wordbreak tgrep
REPLACE_SH= st-swath
UTF8_ENV= env LC_ALL=C.UTF-8
@ -43,14 +42,14 @@ SUBST_SED.st-emacs-app= -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
SUBST_CLASSES+= dictionary-app
SUBST_STAGE.dictionary-app= pre-configure
SUBST_MESSAGE.dictionary-app= Fixing dictionary paths.
SUBST_FILES.dictionary-app= st-emacs st-swath
SUBST_FILES.dictionary-app= st-emacs st-swath st-wordbreak
SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
SUBST_SED.dictionary-app+= -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
pre-extract:
mkdir -p ${WRKSRC}
cd files && cp README.txt st-emacs st-icu.cc st-swath \
tgrep thai-utility.el thaidict.abm ${WRKSRC}
st-wordbreak tgrep thai-utility.el thaidict.abm ${WRKSRC}
post-extract:
cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
@ -83,7 +82,7 @@ do-build:
do-install:
${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin
${WRKSRC}/st-wordbreak ${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin
${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
.for i in ${ST_SHARE_FILES}
${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai

View file

@ -1,7 +1,8 @@
@comment $NetBSD: PLIST,v 1.3 2020/08/28 16:02:42 scole Exp $
@comment $NetBSD: PLIST,v 1.4 2020/09/05 18:02:36 scole Exp $
bin/st-emacs
bin/st-icu
bin/st-swath
bin/st-wordbreak
bin/tgrep
share/split-thai/README.txt
share/split-thai/thai-dict.el

View file

@ -2,10 +2,11 @@ NAME
st-emacs
st-icu
st-swath
st-wordbreak
tgrep
SYNOPSIS
st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank']
st-emacs|st-icu|st-swath|st-wordbreak [filename|text1 text2 ...|'blank']
tgrep [options] FILE ...
DESCRIPTION
@ -13,14 +14,16 @@ DESCRIPTION
by spaces (word tokenization). They can separate stdin, files,
or text as arguments. It includes these utilities:
st-emacs: emacs-script using emacs lisp thai-word library
https://www.gnu.org/software/emacs/
st-icu: basic C++ program using the ICU library
http://site.icu-project.org/
st-swath: sh script wrapper to simplfy args to the swath program
https://linux.thai.net/projects/swath
st-emacs: emacs-script using emacs lisp thai-word library
https://www.gnu.org/software/emacs/
st-icu: basic C++ program using the ICU library
http://site.icu-project.org/
st-swath: sh script wrapper to simplify args to the swath program
https://linux.thai.net/projects/swath
st-wordbreak: perl script to brute-force separate thai words,
see "st-wordbreak -h"
tgrep: grep-like utility using perl, see "tgrep -h"
tgrep: grep-like utility using perl, see "tgrep -h"
EXAMPLES
split one or more text strings:
@ -62,9 +65,9 @@ NOTES
and corresponding .tri file, and emacs lisp .el files for reading
and dumping out dictionary files.
st-emacs and st-swath are setup to use the combined dictionary
with words from the emacs 'thai-word library, swath dictionary
words, and the icu thai library words.
st-emacs, st-swath, and st-wordbreak are set up to use the
combined dictionary with words from the emacs 'thai-word library,
swath dictionary words, and the icu thai library words.
st-icu uses its own built in library. To customise the icu
dictionary, you apparently would have to modify
@ -79,3 +82,4 @@ BUGS
thai text mixed with other languages may not be handled well when
splitting.
this file should be converted to proper manpages.
these utilities need better names.

View file

@ -0,0 +1,284 @@
#!/bin/perl
#
# split thai text or file into longest (or smallest) possible words
# based on words in a dictionary file.
#
use strict;
use warnings;
use Encode;
use Getopt::Std;
use utf8;
use open qw/:std :utf8/;
our ($opt_a, $opt_d, $opt_h, $opt_l, $opt_s);
# global dictionary of thai words
my %tdict = ();
# tmp results for recursions
my @tmp_combos = ();
#
# read thai dictionary into hash. each line in dictionary should be a
# thai word. returns number of words read (including the punctuation
# entries added below).
#
sub read_dictionary {
    # default dictionary; ST_SHARE_DIR is substituted at package build time
    my $fname = "ST_SHARE_DIR/thaidict";
    $fname = $opt_d if defined $opt_d;
    die "error reading dictionary $fname" unless -f $fname && -s $fname;
    open(my $fh, '<', $fname) or die "error opening $fname: $!";
    while ( <$fh> ) {
        chomp;
        s/\r\z//;           # tolerate CRLF-encoded dictionary files
        next if $_ eq '';   # skip blank lines rather than keying on ""
        $tdict{$_} = 1;
    }
    close $fh or die "error closing $fname : $!";
    # add punctuation and misc so parser won't choke.
    # "ฯลฯ" is the thai abbreviation sign for "etc."
    # (previously misspelled "ฯาฯ", which is not a valid token)
    my @misc_words =
        ( "~", "`", "!", "@", "#", "\$", "%", "^", "&", "*", "(", ")",
          "-", "_", "=", "+", "\\", "|", "{", "}", "[", "]",
          ";", ":", "'", "\"", "<", ">", ".", ",", "/", "?",
          "ๆ", "ฯลฯ", "ฯ", "฿", "๏", "๚", "๛" );
    foreach ( @misc_words ) {
        $tdict{$_} = 1;
    }
    my $size = keys %tdict;
    return $size;
}
#
# find all possible word breaks for a string,
# returns an array of strings
#
sub word_break {
    my ($text) = @_;

    # pass non-thai input straight through, unsplit
    return ( $text ) unless $text =~ m/\p{InThai}/;

    # teach the dictionary every digit run (arabic or thai numerals,
    # with commas/periods) found in the input so numbers parse
    foreach my $num ( $text =~ m/([0123456789๑๒๓๔๕๖๗๘๙,.]+)/g ) {
        $tdict{$num} = 1;
    }

    # likewise add any latin-alphabet runs ("english looking" words)
    foreach my $word ( $text =~ m/([a-zA-Z]+)/g ) {
        $tdict{$word} = 1;
    }

    # strip all whitespace before segmenting
    my $squeezed = $text;
    $squeezed =~ s/\s+//g;

    # reset the shared accumulator, then collect every segmentation
    @tmp_combos = ();
    word_break_util( $squeezed, length($squeezed), "" );

    # narrow the result set according to the -a/-l/-s options
    return filter_combos( @tmp_combos );
}
#
# recursive backtracking helper: try every dictionary word that is a
# prefix of the string and recurse on the remainder. complete
# segmentations are appended to the global @tmp_combos. adapted from
# https://www.geeksforgeeks.org/word-break-problem-using-backtracking
#
sub word_break_util {
    my ($str, $n, $result) = @_;
    foreach my $len ( 1 .. $n ) {
        my $head = substr($str, 0, $len);
        next unless exists $tdict{$head};
        if ( $len == $n ) {
            # consumed the whole string: record this segmentation
            #print $result . $head, "\n";
            push @tmp_combos, $result . $head;
            return;
        }
        # recurse on the unconsumed tail, extending the partial result
        word_break_util( substr($str, $len), $n - $len,
                         $result . $head . " " );
    }
    return;
}
#
# filter results depending on input args:
#   -a with no -l/-s : keep everything
#   default / -l     : fewest words ("longest" words) first
#   -s               : most words ("smallest" words) first
# returns either a single best match or, with -a plus -l/-s, all
# matches tied for best.
#
sub filter_combos {
    my @candidates = @_;

    # nothing to filter, or caller asked for every combination
    if ( !@candidates || ( $opt_a && !$opt_l && !$opt_s ) ) {
        return @candidates;
    }

    # word count per segmentation (duplicates collapse via hash keys)
    my %count_of;
    foreach my $cand ( @candidates ) {
        $count_of{$cand} = scalar( split / /, $cand );
    }

    # order by most words (-s) or fewest words (default/-l)
    my @ordered = sort {
        $opt_s ? $count_of{$b} <=> $count_of{$a}
               : $count_of{$a} <=> $count_of{$b}
    } keys %count_of;

    # single candidate, not listing all, or an outright winner:
    # return just the best one
    if ( @ordered < 2
         || !$opt_a
         || $count_of{$ordered[0]} != $count_of{$ordered[1]} ) {
        return ( $ordered[0] );
    }

    # several segmentations tie for best and -a was given: return them all
    my $best = $count_of{$ordered[0]};
    my @winners;
    foreach my $cand ( @ordered ) {
        last if $count_of{$cand} != $best;
        push @winners, $cand;
    }
    die "could not find best set" unless @winners;
    return @winners;
}
#
# word break a file line-by-line, print to stdout,
# filename as arg. returns 0.
#
sub break_file {
    my ($fname) = @_;
    open(my $in, '<', $fname) or die "error opening $fname: $!";
  LINE:
    while ( my $text = <$in> ) {
        chomp $text;

        # lines with no thai pass through untouched
        unless ( $text =~ m/\p{InThai}/ ) {
            print $text, "\n";
            next LINE;
        }

        my @broken = word_break( $text );
        unless ( @broken ) {
            # could not segment; echo the original line
            print $text, "\n";
            next LINE;
        }

        # word_break strips whitespace, so restore any leading indent
        print $1 if $text =~ m/^(\s+)/;
        print "$_\n" for @broken;
    }
    close $in or die "error closing $fname : $!";
    return 0;
}
#
# print the help/usage text to stdout
#
sub usage {
    print <<"EOF";
NAME
st-wordbreak - print out possible word breaks for utf-8 Thai
text, file, or stdin based on words in dictionary file
SYNOPSIS
st-wordbreak [options] file|[text1 text2 text3]
OPTIONS
-a list all combinations possible (per line if breaking a file)
-d [dictionary-name]
use dictionary given instead of default
-h print out help message
-l longest words (or fewest words), lists one "best" match.
this is the default option. if -a arg also given, include
all the "best" matches, which could be more than one
-s smallest words (or most words), lists one "best" match.
if -a arg also given, include all the "best" matches,
which could be more than one
ENVIRONMENT
You may need to set LC_CTYPE, LC_ALL, or other LC_* to a utf-8
setting for this program to work
EXIT STATUS
Return 0 if no errors, non-zero if errors or couldn't split up
string.
BUGS
Only utf-8 encodings are supported.
Uses recursion so may not handle extremely long strings.
Good splitting results are very dependent on dictionary data. If
word not in dictionary, may not be able to split at all.
EOF
}
# ---- main ----
getopts('ad:hls');

if ( $opt_h ) {
    usage();
    exit 0;
}

# -l and -s are mutually exclusive
die "invalid args" if $opt_l && $opt_s;

# default is to split into largest words
$opt_l = 1 unless $opt_a || $opt_l || $opt_s;

# decode remaining args from utf-8 octets into perl characters
my @dargv = map { decode('UTF-8', $_) } @ARGV;
my $argc  = scalar @dargv;

# load the word list before doing any splitting
read_dictionary();

# decide whether input comes from stdin, a file, or literal text args
my $textname;
if ( $argc == 0 || ( $argc == 1 && $dargv[0] eq "-" ) ) {
    $textname = "/dev/stdin";
}
elsif ( $argc == 1 && -e $dargv[0] ) {
    $textname = $dargv[0];
}

if ( $textname ) {
    # file (or stdin) mode: split line by line
    break_file( $textname );
}
else {
    # text-argument mode: join the args and split the whole string
    my $str    = join(' ', @dargv);
    my @swords = word_break( $str );
    if ( !@swords ) {
        # could not parse; echo original inputs and signal failure
        print join(' ', @dargv), "\n";
        exit 1;
    }
    print "$_\n" for @swords;
    exit 0;
}