Several improvements in invoice recognition:

Added new types, improved date recognition, improved performance.
2009-03-23 16:09:11 +01:00 · 2009-03-23 16:09:11 +01:00 · 41224310da
parent 4119ec747c
commit 41224310da
4 changed files with 281 additions and 79 deletions
--- a/NanScan/Block.py
+++ b/NanScan/Block.py
@ -58,15 +58,23 @@ class Block:
 		self.document = None
 		self._boxes = []
 		self.outerDistance = 2.5
+		self._rect = None
+		self._outerRect = None
 	
+	## @brief Replaces the list of boxes of the block and discards the cached rectangles.
 	def setBoxes(self, boxes):
 		self._boxes = boxes
+		self.invalidateCache()

 	def boxes(self):
 		return self._boxes

+	## @brief Appends a box to the block and discards the cached rectangles.
 	def addBox(self, box):
 		self._boxes.append( box )
+		self.invalidateCache()
+
+	## @brief Removes a box from the block and discards the cached rectangles.
+	# NOTE(review): assuming self._boxes is a list, remove() raises ValueError when
+	# 'box' is not in it, and the cache is then left untouched.
+	def removeBox(self, box):
+		self._boxes.remove( box )
+		self.invalidateCache()

 	def count(self):
 		return len(self._boxes)
@ -75,20 +83,30 @@ class Block:
 	def text(self):
 		return self.formatedText()

+	## @brief Forgets the cached results of rect() and outerRect() so that they are
+	# computed again on the next call.
+	def invalidateCache(self):
+		self._rect = None
+		self._outerRect = None
+
 	## @brief Returns the bounding rectangle of the text in the range
+	#
+	# The rectangle is the union of the 'box' of every element in self._boxes. It is
+	# computed once and kept in self._rect until invalidateCache() resets it.
+	# A copy of the cached rectangle is returned, so callers that modify the result
+	# in place (translate(), setWidth(), ...) cannot alter the cached value.
 	def rect(self):
-		rect = QRectF()
+		# If we have the value in the cache use it. Compare against None explicitly
+		# so the check does not depend on how QRectF evaluates as a boolean.
+		if self._rect is not None:
+			return QRectF( self._rect )
+		self._rect = QRectF()
 		for c in self._boxes:
-			rect = rect.united( c.box )
-		return rect
+			self._rect = self._rect.united( c.box )
+		return QRectF( self._rect )

 	## @brief Returns a bounding rectangle of the text in the block that is 'outerDistance'
 	# larger in all sides.
+	#
+	# The result is kept in self._outerRect until invalidateCache() resets it.
 	def outerRect(self):
+		if self._outerRect is not None:
+			return self._outerRect
-		rect = self.rect()
+		# Work on a copy: rect() may hand back the object cached in self._rect and
+		# the in-place calls below would otherwise enlarge that cached rectangle too.
+		rect = QRectF( self.rect() )
 		rect.translate( - self.outerDistance, - self.outerDistance )
 		rect.setWidth( rect.width() + self.outerDistance * 2.0 )
 		rect.setHeight( rect.height() + self.outerDistance * 2.0 )
+		self._outerRect = rect
 		return rect

 	## @brief Returns a list with all possible ranges of size length of the 
@ -117,7 +135,7 @@ class Block:
 			block.document = lines
 			blocks.append( block )
 			for char in line:
-				if char.character != u' ':
+				if char.character != u' ' or block.count() == 0:
 					block.addBox( char )
 				else:
 					avgWidth = block.rect().width() / block.count()
@ -234,7 +252,6 @@ class Block:
 						words.append( line[spacesToAdd[-1]:count] )
 					spacesToAdd.append( count )

-			# c.character is already a unicode string
 			left = c.box.right()
 			width += c.box.width()
 			count += 1
--- a/NanScan/Generics/InvoiceRecognizer.py
+++ b/NanScan/Generics/InvoiceRecognizer.py
@ -1,3 +1,4 @@
+# encoding: iso-8859-1
 #   Copyright (C) 2009 by Albert Cervera i Areny
 #   albert@nan-tic.com
 #
@ -22,77 +23,130 @@ from NanScan.Range import *
 from NanScan.Block import *
 from NanScan.TextPatterns import *

+## @brief Fallback used to look for a date anywhere in the document.
+#
+# Walks the ranges of length 10 that Range.extractAllRangesFromDocument() returns
+# for recognizer.textLines and stops at the first one accepted by isDate().
+# @param recognizer Object whose 'textLines' attribute holds the document lines.
+# @return What textToDate() returns for the first matching range, or None if no
+# range matches.
+def findDate( recognizer ):
+	for ran in Range.extractAllRangesFromDocument( recognizer.textLines, 10 ):
+		# Extract the text once per range; it feeds both the check and the conversion.
+		text = ran.text()
+		if isDate( text ):
+			return textToDate( text )
+	return None
+
+## @brief Fallback used to look for a VAT number anywhere in the document.
+#
+# Walks the ranges of length 10 that Range.extractAllRangesFromDocument() returns
+# for recognizer.textLines and stops at the first one accepted by isVat().
+# @param recognizer Object whose 'textLines' attribute holds the document lines.
+# @return What textToVat() returns for the first matching range, or None if no
+# range matches.
+def findVat( recognizer ):
+	for ran in Range.extractAllRangesFromDocument( recognizer.textLines, 10 ):
+		# Extract the text once per range; it feeds both the check and the conversion.
+		text = ran.text()
+		if isVat( text ):
+			return textToVat( text )
+	return None
+
 class InvoiceRecognizer:
 	Tags = { 
 		'number': {
 			'tag': [
-				u'factura',
 				u'numero factura',
 				u'factura numero',
 				u'num. de factura',
-				u'factura num.'
+				u'factura num.',
+				u'nº factura',
+				u'factura núm.',
+				u'factura',
+				u'número de factura'
 			],
 			'type': 'mostly-numeric'
 		},
 		'date': {
 			'tag': [
-				u'fecha',
+				u'fecha de factura'
 				u'fecha factura',
 				u'fecha emision',
+				u'data factura'
+				u'fecha',
 				u'data:',
 				u'data',
-				u'data factura'
 			],
-			'type': 'date'
-			# With dates we need to be able to find a date with
-			# the format '1 Sep. 2009'. Also we need to find the
-			# date without a tag. Something like:
-			#
-			# 'fallback': functionName,
-			# 
-			# might be appropiate for those cases in which the
-			# tag can't be found.
+			'type': 'date',
+			'fallback': findDate,
 		},
-		'amount': {
+		'base': {
+			'tag': [
+				u'base imponible',
+				u'base imposable',
+				u'total (base imposable)'
+			],
+			'type': 'numeric'
+		},
+		'taxes': {
+			'tag': [
+				u'IVA',
+			],
+			'type': 'numeric'
+		},
+		'total': {
 			'tag': [
 				u'total',
 				u'total factura',
 				u'total a pagar (euros)'
 			],
 			'type': 'numeric'
+		},
+		'vat': {
+			'tag': [
+				u'nif',
+				u'cif',
+				u'nif/cif',
+				u'nif:',
+				u'cif:',
+				u'nif/cif:',
+				u'nif :',
+				u'cif :',
+				u'nif/cif :',
+			],
+			'type': 'vat',
+			'fallback': findVat,
+		},
+		'pagina': {
+			'tag': [
+				u'pagina',
+				u'página',
+				u'pàgina',
+				u'pag.',
+				u'pàg.',
+				u'pág.'
+			],
+			'type': 'page-number'
 		}
 	}
+
+	## @brief Looks for the value of every tag declared in InvoiceRecognizer.Tags.
+	#
+	# Takes the 'text' analyzer of the given recognizer, stores the lines returned
+	# by its block's textLinesWithSpaces() in self.textLines and calls
+	# findTagValue() once per tag.
+	# @return A string with one 'Tag: <name>, Value: <value>' line per tag.
 	def recognize(self, recognizer):
-		#text = recognizer.textInRegion('text')
 		analyzer = recognizer.analyzers['text']
 		self.textLines = analyzer.block.textLinesWithSpaces()
 		result = ''
 		for tag in InvoiceRecognizer.Tags:
 			result += 'Tag: %s, Value: %s\n' % ( tag, self.findTagValue( tag ) )

-		print "========================================"
-		blocks = Block.extractAllBlocksFromDocument( self.textLines )
-		for block in blocks:
-			print "---"
-			print "BLOCK:", block.text().encode('ascii','ignore')
-			print "---"
-		print "========================================"
-		# Try to find out which of the blocks contains customer information
-
-		# This rect, picks up the first third of an A4 paper size.
-		top = QRectF( 0, 0, 210, 99 )
-		tops = []
-		for block in blocks:
-			if block.rect().intersects( top ):
-				tops.append( block )
-		# Once we have all the blocks of the first third of the paper
-		# try to guess which of them is the good one.
-
-		# Remove those blocks too wide
-		sized = []
-		for block in tops:
-			if block.width() < 120:
-				sized.append( block )
+#		print "========================================"
+#		blocks = Block.extractAllBlocksFromDocument( self.textLines )
+#		for block in blocks:
+#			print "---"
+#			print "BLOCK:", block.text().encode('ascii','ignore')
+#			print "---"
+#		print "========================================"
+#		# Try to find out which of the blocks contains customer information
+#
+#		# This rect, picks up the first third of an A4 paper size.
+#		top = QRectF( 0, 0, 210, 99 )
+#		tops = []
+#		for block in blocks:
+#			if block.rect().intersects( top ):
+#				tops.append( block )
+#		# Once we have all the blocks of the first third of the paper
+#		# try to guess which of them is the good one.
+#
+#		# Remove those blocks too wide
+#		sized = []
+#		for block in tops:
+#			if block.rect().width() < 120:
+#				sized.append( block )

 		return result

@ -109,34 +163,42 @@ class InvoiceRecognizer:
 			value = Levenshtein.levenshtein( text, textToFind )
 			ran.distance = value
 		ranges.sort( rangeDistanceComparison )
-		if ranges:
-			return ranges[0]
-		else:
-			return None
-
+		return ranges

+	## @brief Returns the value found in the document for the given tag.
+	#
+	# Gathers the ranges that findText() returns for every label of the tag, sorts
+	# them with rangeDistanceLengthRatioComparison and asks findTagValueFromRange()
+	# for a value in each of the five best ranges, in order.
+	# @param tag Key of InvoiceRecognizer.Tags.
+	# @return The first value that evaluates to true, or None if none of the five
+	# ranges provides one.
 	def findTagValue(self, tag):
 		ranges = []
 		for tagData in InvoiceRecognizer.Tags[tag]['tag']:
-			ran = self.findText( tagData )
-			if ran:
-				ranges.append( ran )
-		ranges.sort( rangeDistanceComparison )
-		#ran = ranges[0]
-		distance = ranges[0].distance
-		sameDistance = [x for x in ranges if x.distance == distance]
-		sameDistance.sort( rangeLengthComparison )
-		#print "SECOND 5: ", [x.text().encode('ascii','ignore') for x in sameDistance[:5]]
-		ran = sameDistance[-1]
-
+			ranges += self.findText( tagData )
+			#ran = self.findText( tagData )
+			#if ran:
+				#ranges.append( ran )
+		#ranges.sort( rangeDistanceComparison )
+		#distance = ranges[0].distance
+		#sameDistance = [x for x in ranges if x.distance == distance]
+		#sameDistance.sort( rangeLengthComparison )
+		#ran = sameDistance[-1]
+		ranges.sort( rangeDistanceLengthRatioComparison )
+		print "RANGES FOR TAG: %s\n%s" % ( tag, [ran.text().encode('ascii','replace') for ran in ranges[:20]] )
+		for ran in ranges[:5]:
 			print "RANGE FOR TAG %s: %s" % ( tag, ran.text().encode('ascii','ignore') )
+			value = self.findTagValueFromRange( tag, ran )
+			# Values that evaluate to false (None, '', 0.0) are treated as "not found"
+			# and the next candidate range is tried.
+			if value:
+				return value
+		return None
+
+	def findTagValueFromRange(self, tag, ran):

 		# Extract text on the right
-		line = self.formatedLine( self.textLines[ ran.line ] )
-		rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
-		print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
-		print "rightValue: ", rightValue.encode('ascii','ignore')
-		print "SAME LINE: ", line.encode('ascii','ignore')
+		#line = self.formatedLine( self.textLines[ ran.line ] )
+		#rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
+		line = self.textLines[ ran.line ]
+		line = line[ran.pos+ran.length+1:]
+		rightValue = Block.extractAllBlocksFromDocument( [ line ] )[0].text()
+
+
+		#print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
+		print "rightValue: ", rightValue.encode('ascii','replace')
+		#print "SAME LINE: ", line.encode('ascii','ignore')

 		# Extract text on the bottom
 		if ran.line < len(self.textLines)-1:
@ -150,30 +212,40 @@ class InvoiceRecognizer:
 					bottomValue += c.character
 		else:
 			bottomValue = u''
+		print "bottomValue: ", bottomValue.encode('ascii','replace')
 		
 		# Decide which of both values match the given tag type
 		type = InvoiceRecognizer.Tags[ tag ][ 'type' ]
+		value = None
 		if type == 'numeric':
 			if isFloat( rightValue ):
-				return textToFloat( rightValue )
+				value = textToFloat( rightValue )
 			elif isFloat( bottomValue ):
-				return textToFloat( bottomValue )
-			else:
-				return None
+				value = textToFloat( bottomValue )
 		elif type == 'date':
 			if isDate( rightValue ):
-				return textToDate( rightValue )
+				value = textToDate( rightValue )
 			elif isDate( bottomValue ):
-				return textToDate( bottomValue )
-			else:
-				return None
+				value = textToDate( bottomValue )
 		elif type == 'mostly-numeric':
 			if isMostlyNumeric( rightValue ):
-				return rightValue
+				value = textToMostlyNumeric( rightValue )
 			elif isMostlyNumeric( bottomValue ):
-				return bottomValue
+				value = textToMostlyNumeric( bottomValue )
+		elif type == 'vat':
+			if isVat( rightValue ):
+				value = textToVat( rightValue )
+			elif isVat( bottomValue ):
+				value = textToVat( bottomValue )
+		elif type == 'page-number':
+			if isPageNumber( rightValue ):
+				value = textToPageNumber( rightValue )
+			elif isPageNumber( bottomValue ):
+				value = textToPageNumber( bottomValue )
 		else:
-				return rightValue
-		else:
-			return rightValue
+			value = rightValue
+
+		if not value and 'fallback' in InvoiceRecognizer.Tags[ tag ]:
+			value = InvoiceRecognizer.Tags[ tag ]['fallback']( self )
+		return value

--- a/NanScan/Range.py
+++ b/NanScan/Range.py
@ -37,6 +37,24 @@ def rangeLengthComparison(x, y):
 	else:
 		return 0

+## @brief Comparison function that sorts ranges by their distance/length ratio.
+#
+# The score of a range is 1/len(text) + distance/len(text), lower is better.
+# A range whose text is empty gets a score of 999.
+# @return 1 if x scores higher than y, -1 if it scores lower and 0 otherwise.
+def rangeDistanceLengthRatioComparison(x, y):
+	def score( ran ):
+		length = len( ran.text() )
+		if not length:
+			return 999
+		return (1.0/length) + float( ran.distance ) / length
+
+	xl = score( x )
+	yl = score( y )
+	if xl == yl:
+		return 0
+	if xl > yl:
+		return 1
+	return -1
+
 ## @brief This class represents a group of characters in a document.
 class Range:
 	def __init__(self):
--- a/NanScan/TextPatterns.py
+++ b/NanScan/TextPatterns.py
@ -1,3 +1,4 @@
+# encoding: iso-8859-1
 #   Copyright (C) 2009 by Albert Cervera i Areny
 #   albert@nan-tic.com
 #
@ -17,6 +18,7 @@
 #   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 

 from PyQt4.QtCore import *
+import re

 def textToFloat( value ):
 	if ',' in value and '.' in value:
@ -30,6 +32,11 @@ def textToFloat( value ):
 		newValue = value.replace( ',', '.' )
 	else:
 		newValue = value
+	# Remove spaces
+	newValue = newValue.replace( ' ', '' )
+	# Remove possible coin symbol in the end
+	if not newValue[-1] in '0123456789':
+		newValue = newValue[:-1]
 	return float( newValue )

 def isFloat( value ):
@ -44,14 +51,51 @@ def isDate( value ):
 	return date.isValid()

 def textToDate( value ):
-	patterns = ['dd/MM/yyyy', 'dd-MM-yyyy', 'dd-MM-yy', 'dd MMM. yy', 'dd MMMM yyyy', 'dd.mm.yyyy']
+	value = value.replace(' ','')
+	value = textMonthToNumber( value )
+	# Replace texts for cases such as '25 de juny de 2009'
+	value = re.sub( r'[a-z]', '', value )
+	value = re.sub( r'[A-Z]', '', value )
+	patterns = [
+		'dd/MM/yyyy', 'dd-MM-yyyy', 'dd/MM/yy', 'dd-MM-yy', 
+		'd/MM/yyyy', 'd-MM-yyyy', 'd/MM/yy', 'd-MM-yy',
+		'dd.mm.yyyy', 'dd.mm.yy', 'd.mm.yyyy', 'd.mm.yy']
 	for pattern in patterns:
-		date = QDate.fromString( value.replace(' ',''), pattern )
+		date = QDate.fromString( value, pattern )
 		if date.isValid():
+			# If only two digits where used to specify year 
+			# it probably meant 200x or 20xx not 190x or 19xx 
+			# (which is what QDate interprets).
+			if date.year() < 1930 and not 'yyyy' in pattern:
+				date = date.addYears( 100 )
 			return date
 	return QDate()

+## @brief Replaces the Catalan, Spanish and English month names (or abbreviations)
+# found in 'value' by '/NN/', where NN is the two digit number of the month.
+#
+# Matching is case sensitive: only the lower case spellings listed below are
+# replaced. An abbreviation followed by a dot ('dec.') is replaced together
+# with its dot.
+# @return The text with every known month name replaced.
+def textMonthToNumber( value ):
+	months = [ 
+		('gen', '01'), ('gener', '01'), ('enero', '01'), ('january', '01'), 
+		('feb', '02'), ('febrer', '02'), ('febrero', '02'), ('february', '02'),
+		('mar', '03'), ('marc', '03'), ('marzo', '03'), ('march', '03'),
+		('abr', '04'), ('apr', '04'), ('abril', '04'), ('april', '04'),
+		('mai', '05'), ('may', '05'), ('maig', '05'), ('mayo', '05'), 
+		('jun', '06'), ('jul', '07'), ('juny', '06'), ('junio', '06'), ('june', '06'),
+		('ago', '08'), ('agost', '08'), ('agosto', '08'), ('august', '08'), 
+		('set', '09'), ('sep', '09'), ('setembre', '09'), ('september', '09'),
+		('oct', '10'), ('octubre', '10'), ('october', '10'), 
+		('nov', '11'), ('novembre', '11'), ('noviembre', '11'),
+		('des', '12'), ('dec', '12'), ('desembre', '12'), ('diciembre', '12'), ('december', '12')
+	]
+	# Reverse alphabetical order guarantees that a name is tried before any of its
+	# own prefixes ('gener' before 'gen'), so longer names are replaced first.
+	months.sort( key=lambda a: a[0], reverse=True )
+	v = value
+	for name, number in months:
+		replacement = u'/%s/' % number
+		# Try to replace twice. Sometimes instead of 'dec' we see 'dec.'
+		v = v.replace( u'%s.' % name, replacement )
+		v = v.replace( name, replacement )
+	return v
+
 def isMostlyNumeric( text ):
+	text = text.replace(' ','')
 	numbers = 0
 	for x in text:
 		if x in '0123456789':
@ -61,3 +105,54 @@ def isMostlyNumeric( text ):
 	else:
 		return False

+## @brief Returns 'text' without any of its space characters.
+def textToMostlyNumeric( text ):
+	return text.replace(' ','')
+
+## @brief Returns True if textToVat() accepts 'text' as a VAT number, False otherwise.
+def isVat( text ):
+	return bool( textToVat( text ) )
+
+## @brief Normalizes a VAT number candidate.
+#
+# Spaces are removed and the text is converted to upper case. The result is
+# accepted when it is one letter followed by eight digits or eight digits
+# followed by one letter.
+# @return The normalized text, or '' if it has neither of those shapes.
+def textToVat( text ):
+	candidate = text.replace( ' ', '' ).upper()
+	if re.search( '^[A-Z][0-9]{8}$', candidate ) or re.search( '^[0-9]{8}[A-Z]$', candidate ):
+		return candidate
+	return ''
+
+## @brief Returns True if textToPageNumber() finds a page number in 'text',
+# False otherwise.
+def isPageNumber( text ):
+	return bool( textToPageNumber( text ) )
+
+## @brief Extracts the current page and the total number of pages from a text
+# such as '2 / 5'.
+#
+# The first two runs of ASCII digits found in the text are converted with
+# textToFloat(); any further run is ignored.
+# @return A (current, total) tuple, where total is None if the text holds a
+# single number, or None if there is no number or the first one is zero.
+def textToPageNumber( text ):
+	numbers = re.findall( r'[0-9]+', text )
+	current = None
+	total = None
+	if numbers:
+		current = textToFloat( numbers[0] )
+	if len(numbers) > 1:
+		total = textToFloat( numbers[1] )
+	if not current:
+		return None
+	return (current, total)
+