- Fixed doxygen file.

- Added invoice recognition module. Still missing block detection.
2009-03-14 18:10:23 +01:00 · 2009-03-14 18:10:23 +01:00 · 0eb944c512
parent 2cbba682f0
commit 0eb944c512
9 changed files with 251 additions and 152 deletions
--- a/NanScan/Generics/InvoiceRecognizer.py
+++ b/NanScan/Generics/InvoiceRecognizer.py
@ -0,0 +1,153 @@
+#   Copyright (C) 2009 by Albert Cervera i Areny
+#   albert@nan-tic.com
+#
+#   This program is free software; you can redistribute it and/or modify 
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or 
+#   (at your option) any later version. 
+#
+#   This program is distributed in the hope that it will be useful, 
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of 
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License 
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+#   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
+
+
+from NanScan.LevenshteinDistance import *
+from NanScan.Range import *
+from NanScan.TextPatterns import *
+
+## @brief Tries to extract the number, date and total amount of an invoice
+# by searching well-known labels (tags) in the text of a document and reading
+# the value found on their right or just below them.
+class InvoiceRecognizer:
+	# For each field: the labels to search in the document ('tag') and the
+	# kind of value expected next to them ('type').
+	Tags = {
+		'number': {
+			'tag': [
+				u'factura',
+				u'numero factura',
+				u'factura numero',
+				u'num. de factura',
+				u'factura num.'
+			],
+			'type': 'mostly-numeric'
+		},
+		'date': {
+			'tag': [
+				u'fecha',
+				u'fecha factura',
+				u'fecha emision',
+				u'data:',
+				u'data',
+				u'data factura'
+			],
+			'type': 'date'
+			# With dates we need to be able to find a date with
+			# the format '1 Sep. 2009'. Also we need to find the
+			# date without a tag. Something like:
+			#
+			# 'fallback': functionName,
+			#
+			# might be appropriate for those cases in which the
+			# tag can't be found.
+		},
+		'amount': {
+			'tag': [
+				u'total',
+				u'total factura',
+				u'total a pagar (euros)'
+			],
+			'type': 'numeric'
+		}
+	}
+
+	## @brief Searches every entry of Tags in the lines returned by the 'text'
+	# analyzer of the given recognizer.
+	#
+	# Returns a string with one 'Tag: <tag>, Value: <value>' line per tag. The
+	# lines of the document are kept in self.textLines.
+	def recognize(self, recognizer):
+		analyzer = recognizer.analyzers['text']
+		self.textLines = analyzer.textLinesWithSpaces()
+		return ''.join( [
+			'Tag: %s, Value: %s\n' % ( tag, self.findTagValue( tag ) )
+			for tag in InvoiceRecognizer.Tags
+		] )
+
+	## @brief Returns a unicode string with the characters of the given line
+	# (a list of objects with a 'character' attribute).
+	def formatedLine(self, line):
+		return u''.join( [c.character for c in line] )
+
+	## @brief Returns the range of self.textLines whose text has the smallest
+	# Levenshtein distance to textToFind, or None if no range could be extracted.
+	#
+	# The distance is stored in the 'distance' attribute of the returned range.
+	def findText(self, textToFind):
+		ranges = Range.extractAllRangesFromDocument( self.textLines, len(textToFind) )
+		if not ranges:
+			return None
+		for ran in ranges:
+			ran.distance = Levenshtein.levenshtein( ran.text(), textToFind )
+		ranges.sort( rangeDistanceComparison )
+		return ranges[0]
+
+	## @brief Returns the value found in the document for the given tag (a key
+	# of Tags).
+	#
+	# The best match of each label of the tag is searched; among those with the
+	# smallest distance the last one according to rangeLengthComparison is used.
+	# Then the first word on its right and the text right below it are
+	# considered, and the first of them that matches the 'type' of the tag is
+	# returned:
+	#  - 'numeric': a float, or None if neither candidate is a float.
+	#  - 'date': a date, or None if neither candidate is a date.
+	#  - 'mostly-numeric': the matching text, or the text on the right if
+	#    neither candidate matches.
+	#  - any other type: the text on the right.
+	#
+	# None is also returned if no label could be located at all (for example
+	# when the document has no text).
+	def findTagValue(self, tag):
+		tagInfo = InvoiceRecognizer.Tags[ tag ]
+
+		ranges = []
+		for tagData in tagInfo['tag']:
+			ran = self.findText( tagData )
+			if ran:
+				ranges.append( ran )
+		if not ranges:
+			# Without this guard ranges[0] below raises IndexError
+			return None
+
+		ranges.sort( rangeDistanceComparison )
+		distance = ranges[0].distance
+		sameDistance = [x for x in ranges if x.distance == distance]
+		sameDistance.sort( rangeLengthComparison )
+		ran = sameDistance[-1]
+
+		print "RANGE FOR TAG %s: %s" % ( tag, ran.text().encode('ascii','ignore') )
+
+		# Extract text on the right. The character that follows the range
+		# is skipped (+1).
+		line = self.formatedLine( self.textLines[ ran.line ] )
+		rightText = line[ran.pos+ran.length+1:].strip()
+		rightValue = rightText.split(' ')[0]
+		print "R: ", rightText.encode('ascii','ignore')
+		print "rightValue: ", rightValue.encode('ascii','ignore')
+		print "SAME LINE: ", line.encode('ascii','ignore')
+
+		# Extract text on the bottom: characters of the next line that
+		# intersect the rectangle of the range once moved down to that line.
+		bottomValue = u''
+		if ran.line < len(self.textLines)-1:
+			nextLine = self.textLines[ran.line+1]
+			print "NEXT LINE: ", self.formatedLine( nextLine ).encode('ascii','ignore')
+			# An empty line has no first character to take the position from
+			if nextLine:
+				boxBottom = ran.rect()
+				boxBottom.moveTop( nextLine[0].box.y() )
+				bottomValue = u''.join( [c.character for c in nextLine if c.box.intersects( boxBottom )] )
+
+		# Decide which of both values match the given tag type
+		valueType = tagInfo[ 'type' ]
+		if valueType == 'numeric':
+			if isFloat( rightValue ):
+				return textToFloat( rightValue )
+			elif isFloat( bottomValue ):
+				return textToFloat( bottomValue )
+			else:
+				return None
+		elif valueType == 'date':
+			if isDate( rightValue ):
+				return textToDate( rightValue )
+			elif isDate( bottomValue ):
+				return textToDate( bottomValue )
+			else:
+				return None
+		elif valueType == 'mostly-numeric':
+			if isMostlyNumeric( rightValue ):
+				return rightValue
+			elif isMostlyNumeric( bottomValue ):
+				return bottomValue
+			else:
+				return rightValue
+		else:
+			return rightValue
+
--- a/NanScan/Generics/init.py
+++ b/NanScan/Generics/init.py
@ -0,0 +1,19 @@
+#   Copyright (C) 2009 by Albert Cervera i Areny
+#   albert@nan-tic.com
+#
+#   This program is free software; you can redistribute it and/or modify 
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or 
+#   (at your option) any later version. 
+#
+#   This program is distributed in the hope that it will be useful, 
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of 
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License 
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+#   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
+
+
--- a/NanScan/Ocr.py
+++ b/NanScan/Ocr.py
@ -44,7 +44,7 @@ def boxComparison(x, y):
 	else:
 		return 0

-## @breif This class allows using an OCR and provides several convenient functions 
+## @brief This class allows using an OCR and provides several convenient functions 
 # regarding text and image processing such as deskewing or obtaining formated text.
 class Ocr(Analyzer):
 	file = ""
@ -132,12 +132,12 @@ class Ocr(Analyzer):

 	## @brief Returns the text of a given region of the image. 
 	# It's the same as calling formatedText().
-	def textInRegion(self, region):
+	def textInRegion(self, region=None):
 		return self.formatedText( region )

 	## @brief Returns the bounding rectangle of the text returned by textInRegion for
 	# the given region.
-	def featureRectInRegion(self, region):
+	def featureRectInRegion(self, region=None):
 		lines = self.textLinesWithSpaces( region )
 		rect = QRectF()
 		for line in lines:
@ -242,6 +242,58 @@ class Ocr(Analyzer):
 			line.sort( boxComparison )
 		return lines

+	## @brief This function adds spaces between words of a single line of boxes.
+	#
+	# The given line (a list of Character objects) is modified in place and
+	# nothing is returned. A Character with u' ' is inserted between two
+	# consecutive characters when the gap between them is greater than a third
+	# of the average width of the characters that precede the gap. The box of
+	# the inserted space fills the gap and takes top and bottom from the
+	# character on its left.
+	def textLineWithSpaces(self, line):
+		width = 0
+		count = 0
+		left = None
+		spacesToAdd = []
+		for c in line:
+			# Compare with None explicitly: a previous character whose right
+			# edge is at 0 is still a valid reference.
+			if left is not None:
+				# If separation between previous and current char
+				# is greater than a third of the average character
+				# width we'll add a space.
+				if c.box.left() - left > ( width / count ) / 3:
+					spacesToAdd.append( count )
+
+			# c.character is already a unicode string
+			left = c.box.right()
+			width += c.box.width()
+			count += 1
+
+		# TODO: Try to find out if they are fixed sized characters.
+		# We've got some problems with fixed size fonts. In some cases the 'I' letter will
+		# have the width of a pipe but the distance between characters will be fixed. In these
+		# cases it's very probable our algorithm will add incorrect spaces before and/or after
+		# the 'I' letter. This should be fixed by somehow determining if it's a fixed sized
+		# font, for example by comparing the distances between the centers of the letters
+		# of each word. We need to find out if something like this can work and use it.
+
+		# Insert from the end so pending indexes are still valid after insertions
+		for idx in reversed( spacesToAdd ):
+			previous = line[idx - 1]
+			c = Character()
+			c.character = u' '
+			c.box = QRectF()
+			c.box.setTop( previous.box.top() )
+			c.box.setBottom( previous.box.bottom() )
+			c.box.setLeft( previous.box.right() )
+			c.box.setRight( line[idx].box.left() )
+			line.insert( idx, c )
+
+
 	## @brief This function is similar to textLines() but adds spaces between words.
 	# The result is also a list of lines each line being a list of Character objects.
 	def textLinesWithSpaces(self, region=None):
@ -257,54 +309,7 @@ class Ocr(Analyzer):
 		# which is quite usual.

 		for line in lines:
-			width = 0
-			count = 0
-			left = None
-			spacesToAdd = []
-			words = []
-			for c in line:
-				if left:
-					# If separtion between previous and current char
-					# is greater than a third of the average character
-					# width we'll add a space.
-					if c.box.left() - left > ( width / count ) / 3:
-						if spacesToAdd:
-							words.append( line[spacesToAdd[-1]:count] )
-						spacesToAdd.append( count )
-
-				# c.character is already a unicode string
-				left = c.box.right()
-				width += c.box.width()
-				count += 1
-
-			# Try to find out if they are fixed sized characters
-			# We've got some problems with fixed size fonts. In some cases the 'I' letter will
-			# have the width of a pipe but the distance between characters will be fixed. In these
-			# cases it's very probable our algorithm will add incorrect spaces before and/or after
-			# the 'I' letter. This should be fixed by somehow determining if it's a fixed sized
-			# font. The commented code below tries to do just that by calculating distances within
-			# the letters of each word. We need to find out if something like this can work and 
-			# use it.
-			#for x in words:
-				#dist = []
-				#for c in range( len(x)-1 ):
-					#dist.append( x[c+1].box.center().x() - x[c].box.center().x() )
-				#print 'Paraula: ', (u''.join( [i.character for i in x] )).encode( 'ascii', 'ignore')
-				#print 'Distancies: ', dist
-					
-				
-			# Reverse so indexes are still valid after insertions
-			spacesToAdd.reverse()
-			previousIdx = None
-			for idx in spacesToAdd:
-				c = Character()
-				c.character = u' '
-				c.box = QRectF()
-				c.box.setTop( line[idx - 1].box.top() )
-				c.box.setBottom( line[idx - 1].box.bottom() )
-				c.box.setLeft( line[idx - 1].box.right() )
-				c.box.setRight( line[idx].box.left() )
-				line.insert( idx, c )
+			self.textLineWithSpaces( line )
 		return lines

 		
--- a/NanScan/Recognizer.py
+++ b/NanScan/Recognizer.py
@ -29,6 +29,7 @@ from Trigram import *
 from Hamming import *
 from LevenshteinDistance import *
 from Translator import *
+from Range import *

 import tempfile

@ -70,7 +71,7 @@ class Recognizer(QObject):
 		if type in self.analyzers:
 			return self.analyzers[type].boxes
 		else:
-			return None
+			return []

 	def analyzersAvailable(self):
 		return self.analyzers.keys()
@ -148,11 +149,9 @@ class Recognizer(QObject):
 	# 5 (the default) will make the template move 5 millimeter to the right,
 	# 5 to the left, 5 to the top and 5 to the bottom. This means 121 positions
 	# per template.
+	#
 	# Note that the image must have been scanned (using scan() or startScan()) 
 	# before using this function.
-	#
-	# TODO: Using offsets to find the best template is easy but highly inefficient.
-	#  a smarter solution should be implemented.
 	def findMatchingTemplateByOffset( self, templates, offset = 5 ):
 		max = 0
 		best = {
@ -200,9 +199,6 @@ class Recognizer(QObject):
 	#
 	# Note that the image must have been scanned (using scan() or startScan()) 
 	# before using this function.
-	#
-	# TODO: Using offsets to find the best template is easy but highly inefficient.
-	#  a smarter solution should be implemented.
 	def findMatchingTemplateByText( self, templates ):
 		max = 0
 		best = {
@ -224,7 +220,6 @@ class Recognizer(QObject):
 			# Apply template with offset found
 			currentDocument = self.extractWithTemplate( template, offset.x(), offset.y() )
 			for documentBox in currentDocument.boxes:
-				print "Applying..."
 				if documentBox.templateBox.type != 'matcher':
 					continue
 				templateBox = documentBox.templateBox
@ -373,65 +368,3 @@ class TemplateBoxRangeIterator:
 					break
 		return result

-def rangeDistanceComparison(x, y):
-	if x.distance > y.distance:
-		return 1
-	elif x.distance < y.distance:
-		return -1
-	else:
-		return 0
-
-## @brief This class represents a group of characters in a document.
-class Range:
-	def __init__(self):
-		self.line = 0
-		self.pos = 0
-		self.length = 0
-		self.document = None
-
-	## @brief Returns a unicode string with the text of the current range
-	def text(self):
-		line = self.document[self.line]
-		chars = line[self.pos:self.pos + self.length]
-		return u''.join( [x.character for x in chars] )
-
-	## @brief Returns the bounding rectangle of the text in the range
-	def rect(self):
-		line = self.document[self.line]
-		chars = line[self.pos:self.pos + self.length]
-		rect = QRectF()
-		for c in chars:
-			rect = rect.united( c.box )
-		return rect
-
-	## @brief Returns a list with all possible ranges of size length of the 
-	# given document
-	@staticmethod
-	def extractAllRangesFromDocument(lines, length, width=0):
-		if length <= 0:
-			return []
-		ranges = []
-		for line in range(len(lines)):
-			if length >= len(lines[line]):
-				ran = Range()
-				ran.line = line
-				ran.pos = 0
-				ran.length = len(lines[line])
-				ran.document = lines
-				#if width:
-				#	while ran.rect().width() > width:
-				#		ran.length -= 1
-				ranges.append( ran )
-				continue
-			for pos in range(len(lines[line]) - length + 1):
-				ran = Range()
-				ran.line = line
-				ran.pos = pos
-				ran.length = length
-				ran.document = lines
-				#if width:
-				#	while ran.rect().width() > width:
-				#		ran.length -= 1
-				ranges.append( ran )
-		return ranges
-
--- a/NanScan/test-scandialog.py
+++ b/NanScan/test-scandialog.py
@ -1,5 +1,5 @@
 from PyQt4.QtGui import *
-from scandialog import *
+from ScanDialog import *
 import sys
 import os

@ -10,7 +10,7 @@ dialog = ScanDialog()
 if os.name == 'nt':
 	FileSaveThreaded.directory = 'c:\\images'
 else:
-	FileSaveThreaded.directory = '/tmp'
+	FileSaveThreaded.directory = '/tmp/scan'

 dialog.exec_()

--- a/Planta/MainWindow.py
+++ b/Planta/MainWindow.py
@ -454,6 +454,7 @@ class MainWindow(QMainWindow):
 		self.connect( self.actionUnzoom, SIGNAL('triggered()'), self.unzoom )
 		self.connect( self.actionFindMatchingTemplateByOffset, SIGNAL('triggered()'), self.findMatchingTemplateByOffset )
 		self.connect( self.actionFindMatchingTemplateByText, SIGNAL('triggered()'), self.findMatchingTemplateByText )
+		self.connect( self.actionRecognizeInvoice, SIGNAL('triggered()'), self.recognizeInvoice )
 		self.toggleImageBoxes()
 		QTimer.singleShot( 1000, self.setup )
 		self.updateTitle()
@ -486,6 +487,12 @@ class MainWindow(QMainWindow):
 	def findMatchingTemplateByText(self):
 		self.findMatchingTemplate( 'text' )

+	# Runs InvoiceRecognizer on self.recognizer and shows the text it
+	# returns in an information dialog.
+	def recognizeInvoice(self):
+		# Imported here so the module is only loaded when the action is used
+		from NanScan.Generics.InvoiceRecognizer import InvoiceRecognizer
+		invoiceRecognizer = InvoiceRecognizer()
+		text = invoiceRecognizer.recognize( self.recognizer )
+		QMessageBox.information( self, _('Invoice Recognition'), text )
+
 	def findMatchingTemplate(self, type):
 		if type == 'offset':
 			title = _('Template search by offset')
--- a/Planta/mainwindow.ui
+++ b/Planta/mainwindow.ui
@ -13,14 +13,6 @@
   <string>Planta</string>
  </property>
  <widget class="QWidget" name="centralwidget" >
-   <property name="geometry" >
-    <rect>
-     <x>0</x>
-     <y>48</y>
-     <width>709</width>
-     <height>439</height>
-    </rect>
-   </property>
   <layout class="QHBoxLayout" name="horizontalLayout_2" >
    <item>
     <layout class="QVBoxLayout" >
@ -35,16 +27,7 @@
        <property name="windowTitle" >
         <string/>
        </property>
-        <widget class="QWidget" name="dockWidgetContents" >
-         <property name="geometry" >
-          <rect>
-           <x>0</x>
-           <y>20</y>
-           <width>70</width>
-           <height>399</height>
-          </rect>
-         </property>
-        </widget>
+        <widget class="QWidget" name="dockWidgetContents" />
       </widget>
      </item>
     </layout>
@ -111,7 +94,7 @@
     <x>0</x>
     <y>0</y>
     <width>709</width>
-     <height>22</height>
+     <height>25</height>
    </rect>
   </property>
   <widget class="QMenu" name="menuFile" >
@ -150,6 +133,7 @@
    <addaction name="actionFindMatchingTemplateByOffset" />
    <addaction name="actionFindMatchingTemplateByText" />
    <addaction name="actionDeskew" />
+    <addaction name="actionRecognizeInvoice" />
   </widget>
   <addaction name="menuFile" />
   <addaction name="menuEdit" />
@ -157,14 +141,6 @@
   <addaction name="menuView" />
  </widget>
  <widget class="QToolBar" name="toolBar" >
-   <property name="geometry" >
-    <rect>
-     <x>0</x>
-     <y>22</y>
-     <width>709</width>
-     <height>26</height>
-    </rect>
-   </property>
   <property name="windowTitle" >
    <string>toolBar</string>
   </property>
@ -297,6 +273,11 @@
    <string>Deskew</string>
   </property>
  </action>
+  <action name="actionRecognizeInvoice" >
+   <property name="text" >
+    <string>Recognize Invoice</string>
+   </property>
+  </action>
 </widget>
 <resources/>
 <connections/>
--- a/Planta/planta.sh
+++ b/Planta/planta.sh
@ -4,4 +4,5 @@
 #export PYTHONPATH=/home/albert/python/lib/python:../../bin:../../..
 # NanScan
 export PYTHONPATH=..:/home/albert/d/koo
+export LD_LIBRARY_PATH=/usr/lib
 ./planta.py $1
--- a/doc/doxygen/nanscan.doxyfile
+++ b/doc/doxygen/nanscan.doxyfile
@ -4,7 +4,7 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 DOXYFILE_ENCODING      = UTF-8
-PROJECT_NAME           = NaNScaN
+PROJECT_NAME           = NanScan
 PROJECT_NUMBER         = 1.0
 OUTPUT_DIRECTORY       = .
 CREATE_SUBDIRS         = NO
@ -87,7 +87,7 @@ WARN_LOGFILE           =
 #---------------------------------------------------------------------------
 # configuration options related to the input files
 #---------------------------------------------------------------------------
-INPUT                  = ../../NaNScaN
+INPUT                  = ../../NanScan
 INPUT_ENCODING         = UTF-8
 FILE_PATTERNS          = *.c \
                         *.cc \