diff --git a/NanScan/Generics/InvoiceRecognizer.py b/NanScan/Generics/InvoiceRecognizer.py
new file mode 100644
index 0000000..b6c44e0
--- /dev/null
+++ b/NanScan/Generics/InvoiceRecognizer.py
@@ -0,0 +1,179 @@
+# Copyright (C) 2009 by Albert Cervera i Areny
+# albert@nan-tic.com
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+
+import logging
+
+from NanScan.LevenshteinDistance import *
+from NanScan.Range import *
+from NanScan.TextPatterns import *
+
+logger = logging.getLogger(__name__)
+
+
+## @brief Recognizes the number, date and total amount of an invoice by
+# searching the scanned text for well known labels instead of using a
+# template.
+#
+# For each field the text most similar to one of its labels is located and
+# the value is read from its right or, failing that, from just below it.
+class InvoiceRecognizer:
+    # For each field, 'tag' holds the labels that may precede its value and
+    # 'type' the kind of value expected next to the label.
+    Tags = {
+        'number': {
+            'tag': [
+                u'factura',
+                u'numero factura',
+                u'factura numero',
+                u'num. de factura',
+                u'factura num.'
+            ],
+            'type': 'mostly-numeric'
+        },
+        'date': {
+            'tag': [
+                u'fecha',
+                u'fecha factura',
+                u'fecha emision',
+                u'data:',
+                u'data',
+                u'data factura'
+            ],
+            'type': 'date'
+            # With dates we need to be able to find a date with
+            # the format '1 Sep. 2009'. Also we need to find the
+            # date without a tag. Something like:
+            #
+            # 'fallback': functionName,
+            #
+            # might be appropriate for those cases in which the
+            # tag can't be found.
+        },
+        'amount': {
+            'tag': [
+                u'total',
+                u'total factura',
+                u'total a pagar (euros)'
+            ],
+            'type': 'numeric'
+        }
+    }
+
+    ## @brief Returns a report with one 'Tag: ..., Value: ...' line for each
+    # entry of Tags.
+    #
+    # The lines of text are taken from the 'text' analyzer of the given
+    # recognizer, which is read as recognizer.analyzers['text'].
+    def recognize(self, recognizer):
+        analyzer = recognizer.analyzers['text']
+        self.textLines = analyzer.textLinesWithSpaces()
+        return ''.join( [
+            'Tag: %s, Value: %s\n' % ( tag, self.findTagValue( tag ) )
+            for tag in InvoiceRecognizer.Tags
+        ] )
+
+    ## @brief Returns the text of a line, given as a list of characters.
+    def formatedLine(self, line):
+        return u''.join( [c.character for c in line] )
+
+    ## @brief Returns the range of the document whose text is closest to
+    # textToFind or None if the document contains no text.
+    #
+    # The Levenshtein distance of every candidate is stored in its 'distance'
+    # attribute.
+    def findText(self, textToFind):
+        ranges = Range.extractAllRangesFromDocument( self.textLines, len(textToFind) )
+        if not ranges:
+            return None
+        for ran in ranges:
+            ran.distance = Levenshtein.levenshtein( ran.text(), textToFind )
+        # min() keeps the first of the closest candidates and avoids sorting
+        # every range of the document just to pick one.
+        return min( ranges, key=lambda ran: ran.distance )
+
+    ## @brief Returns the value found for the given tag (a key of Tags) or
+    # None if the document has no text or no valid value could be found.
+    #
+    # 'numeric' and 'date' tags return the converted value. Other tags return
+    # the text found, falling back to whatever is on the right of the label.
+    def findTagValue(self, tag):
+        ran = self._bestRange( tag )
+        if ran is None:
+            return None
+
+        rightValue = self._rightValue( ran )
+        bottomValue = self._bottomValue( ran )
+        logger.debug( 'Tag %s: label=%r, right=%r, bottom=%r', tag, ran.text(), rightValue, bottomValue )
+
+        # Decide which of both values match the given tag type
+        valueType = InvoiceRecognizer.Tags[ tag ][ 'type' ]
+        candidates = ( rightValue, bottomValue )
+        if valueType == 'numeric':
+            for value in candidates:
+                if isFloat( value ):
+                    return textToFloat( value )
+            return None
+        if valueType == 'date':
+            for value in candidates:
+                if isDate( value ):
+                    return textToDate( value )
+            return None
+        if valueType == 'mostly-numeric':
+            for value in candidates:
+                if isMostlyNumeric( value ):
+                    return value
+        return rightValue
+
+    ## @brief Returns the range that best matches any of the labels of the
+    # tag or None if the document has no text.
+    #
+    # Among the ranges with the smallest distance, the last one according to
+    # rangeLengthComparison is preferred.
+    def _bestRange(self, tag):
+        ranges = []
+        for label in InvoiceRecognizer.Tags[tag]['tag']:
+            ran = self.findText( label )
+            if ran is not None:
+                ranges.append( ran )
+        if not ranges:
+            return None
+        distance = min( [x.distance for x in ranges] )
+        sameDistance = [x for x in ranges if x.distance == distance]
+        sameDistance.sort( rangeLengthComparison )
+        return sameDistance[-1]
+
+    ## @brief Returns the first word found on the right of the range.
+    def _rightValue(self, ran):
+        line = self.formatedLine( self.textLines[ ran.line ] )
+        # The character next to the label is skipped too.
+        return line[ran.pos + ran.length + 1:].strip().split(' ')[0]
+
+    ## @brief Returns the text of the next line that lies below the range
+    # or an empty string if there is no such line.
+    def _bottomValue(self, ran):
+        if ran.line >= len(self.textLines) - 1:
+            return u''
+        line = self.textLines[ran.line + 1]
+        if not line:
+            # An empty line has no box to align with.
+            return u''
+        boxBottom = ran.rect()
+        boxBottom.moveTop( line[0].box.y() )
+        return u''.join( [c.character for c in line if c.box.intersects( boxBottom )] )
+
diff --git a/NanScan/Generics/__init__.py b/NanScan/Generics/__init__.py
new file mode 100644
index 0000000..923d4c9
--- /dev/null
+++ b/NanScan/Generics/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2009 by Albert Cervera i Areny
+# albert@nan-tic.com
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ + diff --git a/NanScan/Ocr.py b/NanScan/Ocr.py index a32e315..173fc8a 100755 --- a/NanScan/Ocr.py +++ b/NanScan/Ocr.py @@ -44,7 +44,7 @@ def boxComparison(x, y): else: return 0 -## @breif This class allows using an OCR and provides several convenient functions +## @brief This class allows using an OCR and provides several convenient functions # regarding text and image processing such as deskewing or obtaining formated text. class Ocr(Analyzer): file = "" @@ -132,12 +132,12 @@ class Ocr(Analyzer): ## @brief Returns the text of a given region of the image. # It's the same as calling formatedText(). - def textInRegion(self, region): + def textInRegion(self, region=None): return self.formatedText( region ) ## @brief Returns the bounding rectangle of the text returned by textInRegion for # the given region. - def featureRectInRegion(self, region): + def featureRectInRegion(self, region=None): lines = self.textLinesWithSpaces( region ) rect = QRectF() for line in lines: @@ -242,6 +242,58 @@ class Ocr(Analyzer): line.sort( boxComparison ) return lines + ## @brief This function adds spaces between words of a single line of boxes. + def textLineWithSpaces(self, line): + width = 0 + count = 0 + left = None + spacesToAdd = [] + words = [] + for c in line: + if left: + # If separtion between previous and current char + # is greater than a third of the average character + # width we'll add a space. + if c.box.left() - left > ( width / count ) / 3: + if spacesToAdd: + words.append( line[spacesToAdd[-1]:count] ) + spacesToAdd.append( count ) + + # c.character is already a unicode string + left = c.box.right() + width += c.box.width() + count += 1 + + # Try to find out if they are fixed sized characters + # We've got some problems with fixed size fonts. In some cases the 'I' letter will + # have the width of a pipe but the distance between characters will be fixed. 
In these + # cases it's very probable our algorithm will add incorrect spaces before and/or after + # the 'I' letter. This should be fixed by somehow determining if it's a fixed sized + # font. The commented code below tries to do just that by calculating distances within + # the letters of each word. We need to find out if something like this can work and + # use it. + #for x in words: + #dist = [] + #for c in range( len(x)-1 ): + #dist.append( x[c+1].box.center().x() - x[c].box.center().x() ) + #print 'Paraula: ', (u''.join( [i.character for i in x] )).encode( 'ascii', 'ignore') + #print 'Distancies: ', dist + + + # Reverse so indexes are still valid after insertions + spacesToAdd.reverse() + previousIdx = None + for idx in spacesToAdd: + c = Character() + c.character = u' ' + c.box = QRectF() + c.box.setTop( line[idx - 1].box.top() ) + c.box.setBottom( line[idx - 1].box.bottom() ) + c.box.setLeft( line[idx - 1].box.right() ) + c.box.setRight( line[idx].box.left() ) + line.insert( idx, c ) + + ## @brief This function is similar to textLines() but adds spaces between words. # The result is also a list of lines each line being a list of Character objects. def textLinesWithSpaces(self, region=None): @@ -257,54 +309,7 @@ class Ocr(Analyzer): # which is quite usual. for line in lines: - width = 0 - count = 0 - left = None - spacesToAdd = [] - words = [] - for c in line: - if left: - # If separtion between previous and current char - # is greater than a third of the average character - # width we'll add a space. - if c.box.left() - left > ( width / count ) / 3: - if spacesToAdd: - words.append( line[spacesToAdd[-1]:count] ) - spacesToAdd.append( count ) - - # c.character is already a unicode string - left = c.box.right() - width += c.box.width() - count += 1 - - # Try to find out if they are fixed sized characters - # We've got some problems with fixed size fonts. 
In some cases the 'I' letter will - # have the width of a pipe but the distance between characters will be fixed. In these - # cases it's very probable our algorithm will add incorrect spaces before and/or after - # the 'I' letter. This should be fixed by somehow determining if it's a fixed sized - # font. The commented code below tries to do just that by calculating distances within - # the letters of each word. We need to find out if something like this can work and - # use it. - #for x in words: - #dist = [] - #for c in range( len(x)-1 ): - #dist.append( x[c+1].box.center().x() - x[c].box.center().x() ) - #print 'Paraula: ', (u''.join( [i.character for i in x] )).encode( 'ascii', 'ignore') - #print 'Distancies: ', dist - - - # Reverse so indexes are still valid after insertions - spacesToAdd.reverse() - previousIdx = None - for idx in spacesToAdd: - c = Character() - c.character = u' ' - c.box = QRectF() - c.box.setTop( line[idx - 1].box.top() ) - c.box.setBottom( line[idx - 1].box.bottom() ) - c.box.setLeft( line[idx - 1].box.right() ) - c.box.setRight( line[idx].box.left() ) - line.insert( idx, c ) + self.textLineWithSpaces( line ) return lines diff --git a/NanScan/Recognizer.py b/NanScan/Recognizer.py index c1866aa..91627cf 100644 --- a/NanScan/Recognizer.py +++ b/NanScan/Recognizer.py @@ -29,6 +29,7 @@ from Trigram import * from Hamming import * from LevenshteinDistance import * from Translator import * +from Range import * import tempfile @@ -70,7 +71,7 @@ class Recognizer(QObject): if type in self.analyzers: return self.analyzers[type].boxes else: - return None + return [] def analyzersAvailable(self): return self.analyzers.keys() @@ -148,11 +149,9 @@ class Recognizer(QObject): # 5 (the default) will make the template move 5 millimeter to the right, # 5 to the left, 5 to the top and 5 to the bottom. This means 121 positions # per template. + # # Note that the image must have been scanned (using scan() or startScan()) # before using this function. 
- # - # TODO: Using offsets to find the best template is easy but highly inefficient. - # a smarter solution should be implemented. def findMatchingTemplateByOffset( self, templates, offset = 5 ): max = 0 best = { @@ -200,9 +199,6 @@ class Recognizer(QObject): # # Note that the image must have been scanned (using scan() or startScan()) # before using this function. - # - # TODO: Using offsets to find the best template is easy but highly inefficient. - # a smarter solution should be implemented. def findMatchingTemplateByText( self, templates ): max = 0 best = { @@ -224,7 +220,6 @@ class Recognizer(QObject): # Apply template with offset found currentDocument = self.extractWithTemplate( template, offset.x(), offset.y() ) for documentBox in currentDocument.boxes: - print "Applying..." if documentBox.templateBox.type != 'matcher': continue templateBox = documentBox.templateBox @@ -373,65 +368,3 @@ class TemplateBoxRangeIterator: break return result -def rangeDistanceComparison(x, y): - if x.distance > y.distance: - return 1 - elif x.distance < y.distance: - return -1 - else: - return 0 - -## @brief This class represents a group of characters in a document. 
-class Range: - def __init__(self): - self.line = 0 - self.pos = 0 - self.length = 0 - self.document = None - - ## @brief Returns a unicode string with the text of the current range - def text(self): - line = self.document[self.line] - chars = line[self.pos:self.pos + self.length] - return u''.join( [x.character for x in chars] ) - - ## @brief Returns the bounding rectangle of the text in the range - def rect(self): - line = self.document[self.line] - chars = line[self.pos:self.pos + self.length] - rect = QRectF() - for c in chars: - rect = rect.united( c.box ) - return rect - - ## @brief Returns a list with all possible ranges of size length of the - # given document - @staticmethod - def extractAllRangesFromDocument(lines, length, width=0): - if length <= 0: - return [] - ranges = [] - for line in range(len(lines)): - if length >= len(lines[line]): - ran = Range() - ran.line = line - ran.pos = 0 - ran.length = len(lines[line]) - ran.document = lines - #if width: - # while ran.rect().width() > width: - # ran.length -= 1 - ranges.append( ran ) - continue - for pos in range(len(lines[line]) - length + 1): - ran = Range() - ran.line = line - ran.pos = pos - ran.length = length - ran.document = lines - #if width: - # while ran.rect().width() > width: - # ran.length -= 1 - ranges.append( ran ) - return ranges - diff --git a/NanScan/test-scandialog.py b/NanScan/test-scandialog.py index 63c6cca..975ce1a 100644 --- a/NanScan/test-scandialog.py +++ b/NanScan/test-scandialog.py @@ -1,5 +1,5 @@ from PyQt4.QtGui import * -from scandialog import * +from ScanDialog import * import sys import os @@ -10,7 +10,7 @@ dialog = ScanDialog() if os.name == 'nt': FileSaveThreaded.directory = 'c:\\images' else: - FileSaveThreaded.directory = '/tmp' + FileSaveThreaded.directory = '/tmp/scan' dialog.exec_() diff --git a/Planta/MainWindow.py b/Planta/MainWindow.py index b177af3..fb8544b 100644 --- a/Planta/MainWindow.py +++ b/Planta/MainWindow.py @@ -454,6 +454,7 @@ class 
MainWindow(QMainWindow): self.connect( self.actionUnzoom, SIGNAL('triggered()'), self.unzoom ) self.connect( self.actionFindMatchingTemplateByOffset, SIGNAL('triggered()'), self.findMatchingTemplateByOffset ) self.connect( self.actionFindMatchingTemplateByText, SIGNAL('triggered()'), self.findMatchingTemplateByText ) + self.connect( self.actionRecognizeInvoice, SIGNAL('triggered()'), self.recognizeInvoice ) self.toggleImageBoxes() QTimer.singleShot( 1000, self.setup ) self.updateTitle() @@ -486,6 +487,12 @@ class MainWindow(QMainWindow): def findMatchingTemplateByText(self): self.findMatchingTemplate( 'text' ) + def recognizeInvoice(self): + from NanScan.Generics.InvoiceRecognizer import InvoiceRecognizer + p = InvoiceRecognizer() + result = p.recognize( self.recognizer ) + QMessageBox.information( self, _('Invoice Recognition'), result ) + def findMatchingTemplate(self, type): if type == 'offset': title = _('Template search by offset') diff --git a/Planta/mainwindow.ui b/Planta/mainwindow.ui index d203484..2f15d73 100644 --- a/Planta/mainwindow.ui +++ b/Planta/mainwindow.ui @@ -13,14 +13,6 @@ Planta - - - 0 - 48 - 709 - 439 - - @@ -35,16 +27,7 @@ - - - - 0 - 20 - 70 - 399 - - - + @@ -111,7 +94,7 @@ 0 0 709 - 22 + 25 @@ -150,6 +133,7 @@ + @@ -157,14 +141,6 @@ - - - 0 - 22 - 709 - 26 - - toolBar @@ -297,6 +273,11 @@ Deskew + + + Recognize Invoice + + diff --git a/Planta/planta.sh b/Planta/planta.sh index e846e3d..cdac317 100755 --- a/Planta/planta.sh +++ b/Planta/planta.sh @@ -4,4 +4,5 @@ #export PYTHONPATH=/home/albert/python/lib/python:../../bin:../../.. 
# NanScan export PYTHONPATH=..:/home/albert/d/koo +export LD_LIBRARY_PATH=/usr/lib ./planta.py $1 diff --git a/doc/doxygen/nanscan.doxyfile b/doc/doxygen/nanscan.doxyfile index be6106b..455fbf0 100644 --- a/doc/doxygen/nanscan.doxyfile +++ b/doc/doxygen/nanscan.doxyfile @@ -4,7 +4,7 @@ # Project related configuration options #--------------------------------------------------------------------------- DOXYFILE_ENCODING = UTF-8 -PROJECT_NAME = NaNScaN +PROJECT_NAME = NanScan PROJECT_NUMBER = 1.0 OUTPUT_DIRECTORY = . CREATE_SUBDIRS = NO @@ -87,7 +87,7 @@ WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- -INPUT = ../../NaNScaN +INPUT = ../../NanScan INPUT_ENCODING = UTF-8 FILE_PATTERNS = *.c \ *.cc \