From e899e6021c1a45f4ad336cccb543f4abcbe88ab1 Mon Sep 17 00:00:00 2001 From: wiz Date: Fri, 18 May 2007 06:39:27 +0000 Subject: [PATCH] Initial import of tesseract-1.04b from pkgsrc-wip (packaged by heinz@ and myself): This code is a raw OCR engine. It has NO PAGE LAYOUT ANALYSIS, NO OUTPUT FORMATTING, and NO UI. It can only process an image of a single column and create text from it. It can detect fixed pitch vs proportional text. Having said that, in 1995, this engine was in the top 3 in terms of character accuracy, and it compiles and runs on both Linux and Windows. Another current limitation is that it only recognizes English and its character set is only US-ASCII. Training code IS included in the open source release however, and will be included in a future release. --- graphics/tesseract/DESCR | 9 + graphics/tesseract/Makefile | 32 +++ graphics/tesseract/PLIST | 286 ++++++++++++++++++++++++++ graphics/tesseract/distinfo | 9 + graphics/tesseract/files/tesseract.sh | 2 + graphics/tesseract/patches/patch-ae | 16 ++ graphics/tesseract/patches/patch-ag | 15 ++ graphics/tesseract/patches/patch-ah | 13 ++ graphics/tesseract/patches/patch-ai | 14 ++ 9 files changed, 396 insertions(+) create mode 100644 graphics/tesseract/DESCR create mode 100644 graphics/tesseract/Makefile create mode 100644 graphics/tesseract/PLIST create mode 100644 graphics/tesseract/distinfo create mode 100644 graphics/tesseract/files/tesseract.sh create mode 100644 graphics/tesseract/patches/patch-ae create mode 100644 graphics/tesseract/patches/patch-ag create mode 100644 graphics/tesseract/patches/patch-ah create mode 100644 graphics/tesseract/patches/patch-ai diff --git a/graphics/tesseract/DESCR b/graphics/tesseract/DESCR new file mode 100644 index 000000000000..dcc8fb3daade --- /dev/null +++ b/graphics/tesseract/DESCR @@ -0,0 +1,9 @@ +This code is a raw OCR engine. It has NO PAGE LAYOUT ANALYSIS, NO +OUTPUT FORMATTING, and NO UI. 
It can only process an image of a +single column and create text from it. It can detect fixed pitch +vs proportional text. Having said that, in 1995, this engine was +in the top 3 in terms of character accuracy, and it compiles and +runs on both Linux and Windows. Another current limitation is that +it only recognizes English and its character set is only US-ASCII. +Training code IS included in the open source release however, and +will be included in a future release. diff --git a/graphics/tesseract/Makefile b/graphics/tesseract/Makefile new file mode 100644 index 000000000000..3102c7a1479a --- /dev/null +++ b/graphics/tesseract/Makefile @@ -0,0 +1,32 @@ +# $NetBSD: Makefile,v 1.1.1.1 2007/05/18 06:39:27 wiz Exp $ +# + +DISTNAME= tesseract-1.04b +CATEGORIES= graphics +MASTER_SITES= http://tesseract-ocr.googlecode.com/files/ + +MAINTAINER= pkgsrc-users@NetBSD.org +HOMEPAGE= http://code.google.com/p/tesseract-ocr/ +COMMENT= Commercial quality open source OCR engine + +PKG_DESTDIR_SUPPORT=user-destdir + +GNU_CONFIGURE= yes +USE_LANGUAGES= c c++ +WRKSRC= ${WRKDIR}/tesseract-1.04 + +post-build: + ${SED} -e "s,@PREFIX@,${PREFIX}," ${FILESDIR}/tesseract.sh \ + > ${WRKSRC}/tesseract.sh + +post-install: + ${INSTALL_LIB_DIR} ${DESTDIR}${PREFIX}/libexec + ${MV} ${DESTDIR}${PREFIX}/bin/tesseract ${DESTDIR}${PREFIX}/libexec + ${INSTALL_SCRIPT} ${WRKSRC}/tesseract.sh ${DESTDIR}${PREFIX}/bin/tesseract + ${INSTALL_DATA_DIR} ${DESTDIR}${PREFIX}/share/doc/tesseract + ${INSTALL_DATA} ${WRKSRC}/README ${DESTDIR}${PREFIX}/share/doc/tesseract + ${INSTALL_DATA_DIR} ${DESTDIR}${PREFIX}/share/tesseract + ${INSTALL_DATA} ${WRKSRC}/phototest.tif ${DESTDIR}${PREFIX}/share/tesseract + +.include "../../graphics/tiff/buildlink3.mk" +.include "../../mk/bsd.pkg.mk" diff --git a/graphics/tesseract/PLIST b/graphics/tesseract/PLIST new file mode 100644 index 000000000000..29f995cfb638 --- /dev/null +++ b/graphics/tesseract/PLIST @@ -0,0 +1,286 @@ +@comment $NetBSD: PLIST,v 1.1.1.1 2007/05/18 
06:39:27 wiz Exp $ +bin/cntraining +bin/mftraining +bin/tesseract +include/tesseract/adaptions.h +include/tesseract/adaptive.h +include/tesseract/adaptmatch.h +include/tesseract/applybox.h +include/tesseract/associate.h +include/tesseract/badwords.h +include/tesseract/baseapi.h +include/tesseract/basedir.h +include/tesseract/baseline.h +include/tesseract/bestfirst.h +include/tesseract/bits16.h +include/tesseract/bitstrm.h +include/tesseract/bitvec.h +include/tesseract/blckerr.h +include/tesseract/blkocc.h +include/tesseract/blobbox.h +include/tesseract/blobclass.h +include/tesseract/blobcmp.h +include/tesseract/blobcmpl.h +include/tesseract/blobs.h +include/tesseract/blread.h +include/tesseract/callcpp.h +include/tesseract/callnet.h +include/tesseract/charcut.h +include/tesseract/charsample.h +include/tesseract/chartoname.h +include/tesseract/choicearr.h +include/tesseract/choices.h +include/tesseract/chop.h +include/tesseract/chopper.h +include/tesseract/closed.h +include/tesseract/clst.h +include/tesseract/cluster.h +include/tesseract/clusttool.h +include/tesseract/cmndwin.h +include/tesseract/cnTraining.dsp +include/tesseract/const.h +include/tesseract/context.h +include/tesseract/control.h +include/tesseract/coutln.h +include/tesseract/crakedge.h +include/tesseract/cutil.h +include/tesseract/cutoffs.h +include/tesseract/danerror.h +include/tesseract/dawg.h +include/tesseract/debug.h +include/tesseract/debugwin.h +include/tesseract/djmenus.h +include/tesseract/dlltest.cpp +include/tesseract/dlltest.dsp +include/tesseract/docqual.h +include/tesseract/drawedg.h +include/tesseract/drawfx.h +include/tesseract/drawtord.h +include/tesseract/edgblob.h +include/tesseract/edgloop.h +include/tesseract/efio.h +include/tesseract/elst.h +include/tesseract/elst2.h +include/tesseract/emalloc.h +include/tesseract/errcode.h +include/tesseract/evntlst.h +include/tesseract/evnts.h +include/tesseract/expandblob.h +include/tesseract/extern.h +include/tesseract/extract.h 
+include/tesseract/featdefs.h +include/tesseract/fileerr.h +include/tesseract/findseam.h +include/tesseract/fixspace.h +include/tesseract/fixxht.h +include/tesseract/flexfx.h +include/tesseract/float2int.h +include/tesseract/fpchop.h +include/tesseract/fpoint.h +include/tesseract/freelist.h +include/tesseract/funcdefs.h +include/tesseract/fxdefs.h +include/tesseract/fxid.h +include/tesseract/gap_map.h +include/tesseract/genblob.h +include/tesseract/general.h +include/tesseract/globaloc.h +include/tesseract/globals.h +include/tesseract/gradechop.h +include/tesseract/grphics.h +include/tesseract/grphshm.h +include/tesseract/hashfn.h +include/tesseract/heuristic.h +include/tesseract/hideedge.h +include/tesseract/host.h +include/tesseract/hosthplb.h +include/tesseract/hpddef.h +include/tesseract/hpdsizes.h +include/tesseract/hyphen.h +include/tesseract/img.h +include/tesseract/imgbmp.h +include/tesseract/imgerrs.h +include/tesseract/imgio.h +include/tesseract/imgs.h +include/tesseract/imgscale.h +include/tesseract/imgtiff.h +include/tesseract/imgunpk.h +include/tesseract/intfx.h +include/tesseract/intmatcher.h +include/tesseract/intproto.h +include/tesseract/ipoints.h +include/tesseract/kdtree.h +include/tesseract/labls.h +include/tesseract/linlsq.h +include/tesseract/listio.h +include/tesseract/lmedsq.h +include/tesseract/lsterr.h +include/tesseract/mainblk.h +include/tesseract/makechop.h +include/tesseract/makerow.h +include/tesseract/matchdefs.h +include/tesseract/matchtab.h +include/tesseract/matmatch.h +include/tesseract/matrix.h +include/tesseract/measure.h +include/tesseract/memblk.h +include/tesseract/memry.h +include/tesseract/memryerr.h +include/tesseract/mergenf.h +include/tesseract/metrics.h +include/tesseract/mf.h +include/tesseract/mfTraining.dsp +include/tesseract/mfcpch.cpp +include/tesseract/mfcpch.h +include/tesseract/mfdefs.h +include/tesseract/mfoutline.h +include/tesseract/mfvars.h +include/tesseract/mfx.h +include/tesseract/minmax.h 
+include/tesseract/mod128.h +include/tesseract/msmenus.h +include/tesseract/name2char.h +include/tesseract/ndminx.h +include/tesseract/normalis.h +include/tesseract/normfeat.h +include/tesseract/normmatch.h +include/tesseract/notdll.h +include/tesseract/nwmain.h +include/tesseract/ocrblock.h +include/tesseract/ocrclass.h +include/tesseract/ocrfeatures.h +include/tesseract/ocrrow.h +include/tesseract/ocrshell.h +include/tesseract/oldbasel.h +include/tesseract/oldheap.h +include/tesseract/oldlist.h +include/tesseract/olutil.h +include/tesseract/outfeat.h +include/tesseract/outlines.h +include/tesseract/output.h +include/tesseract/pageblk.h +include/tesseract/pageres.h +include/tesseract/pagewalk.h +include/tesseract/paircmp.h +include/tesseract/pdblock.h +include/tesseract/pdclass.h +include/tesseract/permdawg.h +include/tesseract/permnum.h +include/tesseract/permute.h +include/tesseract/pgedit.h +include/tesseract/pgeditx.h +include/tesseract/picofeat.h +include/tesseract/pieces.h +include/tesseract/pithsync.h +include/tesseract/pitsync1.h +include/tesseract/platform.h +include/tesseract/plotedges.h +include/tesseract/plotseg.h +include/tesseract/points.h +include/tesseract/polyaprx.h +include/tesseract/polyblk.h +include/tesseract/polyblob.h +include/tesseract/polyvert.h +include/tesseract/poutline.h +include/tesseract/protos.h +include/tesseract/quadlsq.h +include/tesseract/quadratc.h +include/tesseract/quspline.h +include/tesseract/ratngs.h +include/tesseract/rect.h +include/tesseract/rejctmap.h +include/tesseract/reject.h +include/tesseract/render.h +include/tesseract/rwpoly.h +include/tesseract/sbdmenu.h +include/tesseract/sbgconst.h +include/tesseract/sbgdefs.h +include/tesseract/sbgtypes.h +include/tesseract/scaleimg.h +include/tesseract/scanedg.h +include/tesseract/scanutils.cpp +include/tesseract/scanutils.h +include/tesseract/seam.h +include/tesseract/secname.h +include/tesseract/serialis.h +include/tesseract/showim.h +include/tesseract/sigmenu.h 
+include/tesseract/sortflts.h +include/tesseract/speckle.h +include/tesseract/split.h +include/tesseract/states.h +include/tesseract/statistc.h +include/tesseract/stderr.h +include/tesseract/stepblob.h +include/tesseract/stopper.h +include/tesseract/strngs.h +include/tesseract/structures.h +include/tesseract/submen.h +include/tesseract/tally.h +include/tesseract/tessarray.h +include/tesseract/tessbox.h +include/tesseract/tessclas.h +include/tesseract/tessedit.h +include/tesseract/tessembedded.h +include/tesseract/tesseractmain.h +include/tesseract/tessinit.h +include/tesseract/tessio.h +include/tesseract/tessopt.h +include/tesseract/tessout.h +include/tesseract/tessvars.h +include/tesseract/tface.h +include/tesseract/tfacep.h +include/tesseract/tfacepp.h +include/tesseract/topitch.h +include/tesseract/tordmain.h +include/tesseract/tordvars.h +include/tesseract/tospace.h +include/tesseract/tovars.h +include/tesseract/tprintf.h +include/tesseract/training.h +include/tesseract/trie.h +include/tesseract/tstruct.h +include/tesseract/txtregn.h +include/tesseract/underlin.h +include/tesseract/unichar.h +include/tesseract/unicharmap.h +include/tesseract/unicharset.h +include/tesseract/varable.h +include/tesseract/varabled.h +include/tesseract/varblmen.h +include/tesseract/varblwin.h +include/tesseract/variables.h +include/tesseract/vecfuncs.h +include/tesseract/werd.h +include/tesseract/werdit.h +include/tesseract/wordclass.h +include/tesseract/wordseg.h +include/tesseract/xform2d.h +lib/libtesseract_ccstruct.a +lib/libtesseract_ccutil.a +lib/libtesseract_classify.a +lib/libtesseract_cutil.a +lib/libtesseract_dict.a +lib/libtesseract_display.a +lib/libtesseract_image.a +lib/libtesseract_main.a +lib/libtesseract_textord.a +lib/libtesseract_training.a +lib/libtesseract_viewer.a +lib/libtesseract_wordrec.a +libexec/tesseract +share/doc/tesseract/README +share/tessdata/confsets +share/tessdata/eng.DangAmbigs +share/tessdata/eng.freq-dawg +share/tessdata/eng.inttemp 
+share/tessdata/eng.normproto +share/tessdata/eng.pffmtable +share/tessdata/eng.unicharset +share/tessdata/eng.user-words +share/tessdata/eng.word-dawg +share/tesseract/phototest.tif +@dirrm share/doc/tesseract +@dirrm share/tesseract +@dirrm share/tessdata +@dirrm include/tesseract diff --git a/graphics/tesseract/distinfo b/graphics/tesseract/distinfo new file mode 100644 index 000000000000..3e8ed942a24a --- /dev/null +++ b/graphics/tesseract/distinfo @@ -0,0 +1,9 @@ +$NetBSD: distinfo,v 1.1.1.1 2007/05/18 06:39:27 wiz Exp $ + +SHA1 (tesseract-1.04b.tar.gz) = 263a65e462ed864c4da115cdcb3f3e78613de485 +RMD160 (tesseract-1.04b.tar.gz) = 5e9c70d4435a59157f0af6503a57b02a4a74350e +Size (tesseract-1.04b.tar.gz) = 2899276 bytes +SHA1 (patch-ae) = c22f254b73fb9bbd02cf8ef7b4ccbea475afd5df +SHA1 (patch-ag) = 581ec7ac0528bb28fddb3fbaa35a87bb1835a82e +SHA1 (patch-ah) = 22987d8523631c5c6e8b2fb5096ff87c5bc13124 +SHA1 (patch-ai) = e219077d2acf0652a9bf6418d3f8ce4e11782ed5 diff --git a/graphics/tesseract/files/tesseract.sh b/graphics/tesseract/files/tesseract.sh new file mode 100644 index 000000000000..3871ab7bdcfc --- /dev/null +++ b/graphics/tesseract/files/tesseract.sh @@ -0,0 +1,2 @@ +#!/bin/sh +exec @PREFIX@/libexec/tesseract "$@" diff --git a/graphics/tesseract/patches/patch-ae b/graphics/tesseract/patches/patch-ae new file mode 100644 index 000000000000..04ad0e37cfb1 --- /dev/null +++ b/graphics/tesseract/patches/patch-ae @@ -0,0 +1,16 @@ +$NetBSD: patch-ae,v 1.1.1.1 2007/05/18 06:39:27 wiz Exp $ + +--- cutil/globals.h.orig 2006-06-17 00:17:07.000000000 +0200 ++++ cutil/globals.h +@@ -43,9 +43,11 @@ extern int acts[MAXPROC]; /*actio + extern int debugs[MAXPROC]; /*debug flags */ + extern int plots[MAXPROC]; /*plot flags */ + extern int corners[4]; /*corners of scan window */ ++extern "C" { + extern int optind; /*option index */ + extern char *optarg; /*option argument */ + /*image file name */ ++} + extern char imagefile[FILENAMESIZE]; + /* main directory */ + extern char 
directory[FILENAMESIZE]; diff --git a/graphics/tesseract/patches/patch-ag b/graphics/tesseract/patches/patch-ag new file mode 100644 index 000000000000..280764380a64 --- /dev/null +++ b/graphics/tesseract/patches/patch-ag @@ -0,0 +1,15 @@ +$NetBSD: patch-ag,v 1.1.1.1 2007/05/18 06:39:27 wiz Exp $ + +--- cutil/tordvars.h.orig 2007-05-16 21:33:53.000000000 +0000 ++++ cutil/tordvars.h +@@ -45,8 +45,8 @@ extern int similarity_enable; + extern int similarity_debug; /* Level of debug output */ + extern int write_raw_output; /* Text before context */ + extern int write_output; /* Text file output */ +-//extern "C" { extern int display_ratings; } /* Show the ratings */ +-extern int display_ratings; /* Show the ratings */ ++extern "C" { extern int display_ratings; } /* Show the ratings */ ++//extern int display_ratings; /* Show the ratings */ + extern int show_bold; /* Use bold text */ + extern int display_text; /* Show word text */ + extern int display_blocks; /* Show word as boxes */ diff --git a/graphics/tesseract/patches/patch-ah b/graphics/tesseract/patches/patch-ah new file mode 100644 index 000000000000..ced738242490 --- /dev/null +++ b/graphics/tesseract/patches/patch-ah @@ -0,0 +1,13 @@ +$NetBSD: patch-ah,v 1.1.1.1 2007/05/18 06:39:27 wiz Exp $ + +--- ccutil/debugwin.cpp.orig 2006-06-16 22:17:04.000000000 +0000 ++++ ccutil/debugwin.cpp +@@ -229,7 +229,7 @@ DEBUG_WIN::DEBUG_WIN( + length += sprintf (command + length, "trap \"\" 1 2 3 13 15\n"); + length += + sprintf (command + length, +- "/usr/bin/X11/xterm -sb -sl " INT32FORMAT " -geometry " ++ "/usr/X11R6/bin/xterm -sb -sl " INT32FORMAT " -geometry " + INT32FORMAT "x" INT32FORMAT "", buflines, xsize / 8, ysize / 16); + if (xpos >= 0) + command[length++] = '+'; diff --git a/graphics/tesseract/patches/patch-ai b/graphics/tesseract/patches/patch-ai new file mode 100644 index 000000000000..45150d3da32a --- /dev/null +++ b/graphics/tesseract/patches/patch-ai @@ -0,0 +1,14 @@ +$NetBSD: patch-ai,v 1.1.1.1 2007/05/18 
06:39:27 wiz Exp $ + +--- configure.orig 2007-02-02 21:37:43.000000000 +0100 ++++ configure +@@ -7083,7 +7083,8 @@ else + if test "$cross_compiling" = yes; then + ac_cv_func_fork_works=cross + else +- cat >conftest.$ac_ext <<_ACEOF ++ cat confdefs.h >conftest.$ac_ext ++ cat >>conftest.$ac_ext <<_ACEOF + /* By Ruediger Kuhlmann. */ + #include <sys/types.h> + #if HAVE_UNISTD_H