Several improvements in invoice recognition:

Added new types, improved date recognition, improved performance.
2009-03-23 16:09:11 +01:00 · 2009-03-23 16:09:11 +01:00 · 41224310da
parent 4119ec747c
commit 41224310da
4 changed files with 281 additions and 79 deletions
--- a/NanScan/Block.py
+++ b/NanScan/Block.py
@ -58,15 +58,23 @@ class Block:
 		self.document = None
 		self._boxes = []
 		self.outerDistance = 2.5
 		self._rect = None
 		self._outerRect = None
 	## @brief Replaces the boxes held by the block with 'boxes' and drops the
 	# cached rectangles, which were computed from the previous boxes.
 	def setBoxes(self, boxes):
 		self._boxes = boxes
 		self.invalidateCache()
 	## @brief Returns the object that stores the boxes of the block (the
 	# stored object itself, not a copy).
 	def boxes(self):
 		return self._boxes
 	## @brief Appends 'box' to the boxes of the block and drops the cached
 	# rectangles so they are computed again the next time they are requested.
 	def addBox(self, box):
 		self._boxes.append( box )
 		self.invalidateCache()
 	## @brief Removes 'box' from the boxes of the block and drops the cached
 	# rectangles.
 	#
 	# NOTE(review): if the boxes are kept in a plain list, remove() raises
 	# ValueError when 'box' is not in it -- confirm callers only remove boxes
 	# they previously added.
 	def removeBox(self, box):
 		self._boxes.remove( box )
 		self.invalidateCache()
 	def count(self):
 		return len(self._boxes)
@ -75,20 +83,30 @@ class Block:
 	## @brief Returns the text of the block. It is the value returned by
 	# formatedText().
 	def text(self):
 		return self.formatedText()
 	## @brief Forgets the cached rectangles (self._rect and self._outerRect)
 	# by resetting both of them to None.
 	def invalidateCache(self):
 		self._rect = None
 		self._outerRect = None
 	## @brief Returns the bounding rectangle of the text in the range
 	def rect(self):
-		rect = QRectF()
+		# If we have the value in the cache use it.
 		if self._rect:
 			return self._rect
 		self._rect = QRectF()
 		for c in self._boxes:
-			rect = rect.united( c.box )
+			self._rect = self._rect.united( c.box )
-		return rect
+		return self._rect
 	## @brief Returns a bounding rectangle of the text in the block that is
 	# 'outerDistance' larger on every side.
 	#
 	# The result is kept in self._outerRect and reused until invalidateCache()
 	# resets it.
 	# @return A rectangle that is a different object from the one rect() returns.
 	def outerRect(self):
 		if self._outerRect is not None:
 			return self._outerRect
 		margin = self.outerDistance
 		# rect() hands back the very object it keeps as its cache, so it must
 		# not be modified in place (translate/setWidth/setHeight would make
 		# every later rect() call return the enlarged rectangle). adjusted()
 		# builds a new rectangle and leaves the cached one untouched.
 		self._outerRect = self.rect().adjusted( -margin, -margin, margin, margin )
 		return self._outerRect
 	## @brief Returns a list with all possible ranges of size length of the 
@ -117,7 +135,7 @@ class Block:
 			block.document = lines
 			blocks.append( block )
 			for char in line:
-				if char.character != u' ':
+				if char.character != u' ' or block.count() == 0:
 					block.addBox( char )
 				else:
 					avgWidth = block.rect().width() / block.count()
@ -234,7 +252,6 @@ class Block:
 						words.append( line[spacesToAdd[-1]:count] )
 					spacesToAdd.append( count )
 			# c.character is already a unicode string
 			left = c.box.right()
 			width += c.box.width()
 			count += 1
--- a/NanScan/Generics/InvoiceRecognizer.py
+++ b/NanScan/Generics/InvoiceRecognizer.py
@ -1,3 +1,4 @@
 # encoding: iso-8859-1
 #   Copyright (C) 2009 by Albert Cervera i Areny
 #   albert@nan-tic.com
 #
@ -22,77 +23,130 @@ from NanScan.Range import *
 from NanScan.Block import *
 from NanScan.TextPatterns import *
 ## @brief Looks for a date anywhere in the text lines of 'recognizer'.
 #
 # Every range obtained from Range.extractAllRangesFromDocument() (called with
 # 10 as second argument, presumably the range length -- confirm in Range.py)
 # is checked in order and the first one accepted by isDate() wins.
 # @return The value of textToDate() for the first matching range, or None.
 def findDate( recognizer ):
 	candidates = Range.extractAllRangesFromDocument( recognizer.textLines, 10 )
 	for candidate in candidates:
 		content = candidate.text()
 		if not isDate( content ):
 			continue
 		return textToDate( content )
 	return None
 ## @brief Looks for a VAT number anywhere in the text lines of 'recognizer'.
 #
 # Every range obtained from Range.extractAllRangesFromDocument() (called with
 # 10 as second argument, presumably the range length -- confirm in Range.py)
 # is checked in order and the first one accepted by isVat() wins.
 # @return The value of textToVat() for the first matching range, or None.
 def findVat( recognizer ):
 	candidates = Range.extractAllRangesFromDocument( recognizer.textLines, 10 )
 	for candidate in candidates:
 		content = candidate.text()
 		if not isVat( content ):
 			continue
 		return textToVat( content )
 	return None
 class InvoiceRecognizer:
 	Tags = { 
 		'number': {
 			'tag': [
 				u'factura',
 				u'numero factura',
 				u'factura numero',
 				u'num. de factura',
-				u'factura num.'
+				u'factura num.',
 				u'nº factura',
 				u'factura núm.',
 				u'factura',
 				u'número de factura'
 			],
 			'type': 'mostly-numeric'
 		},
 		'date': {
 			'tag': [
-				u'fecha',
+				u'fecha de factura'
 				u'fecha factura',
 				u'fecha emision',
 				u'data factura'
 				u'fecha',
 				u'data:',
 				u'data',
 				u'data factura'
 			],
-			'type': 'date'
+			'type': 'date',
-			# With dates we need to be able to find a date with
+			'fallback': findDate,
 			# the format '1 Sep. 2009'. Also we need to find the
 			# date without a tag. Something like:
 			#
 			# 'fallback': functionName,
 			# 
 			# might be appropriate for those cases in which the
 			# tag can't be found.
 		},
-		'amount': {
+		'base': {
 			'tag': [
 				u'base imponible',
 				u'base imposable',
 				u'total (base imposable)'
 			],
 			'type': 'numeric'
 		},
 		'taxes': {
 			'tag': [
 				u'IVA',
 			],
 			'type': 'numeric'
 		},
 		'total': {
 			'tag': [
 				u'total',
 				u'total factura',
 				u'total a pagar (euros)'
 			],
 			'type': 'numeric'
 		},
 		'vat': {
 			'tag': [
 				u'nif',
 				u'cif',
 				u'nif/cif',
 				u'nif:',
 				u'cif:',
 				u'nif/cif:',
 				u'nif :',
 				u'cif :',
 				u'nif/cif :',
 			],
 			'type': 'vat',
 			'fallback': findVat,
 		},
 		'pagina': {
 			'tag': [
 				u'pagina',
 				u'página',
 				u'pàgina',
 				u'pag.',
 				u'pàg.',
 				u'pág.'
 			],
 			'type': 'page-number'
 		}
 	}
 	def recognize(self, recognizer):
 		#text = recognizer.textInRegion('text')
 		analyzer = recognizer.analyzers['text']
 		self.textLines = analyzer.block.textLinesWithSpaces()
 		result = ''
 		for tag in InvoiceRecognizer.Tags:
 			result += 'Tag: %s, Value: %s\n' % ( tag, self.findTagValue( tag ) )
-		print "========================================"
+#		print "========================================"
-		blocks = Block.extractAllBlocksFromDocument( self.textLines )
+#		blocks = Block.extractAllBlocksFromDocument( self.textLines )
-		for block in blocks:
+#		for block in blocks:
-			print "---"
+#			print "---"
-			print "BLOCK:", block.text().encode('ascii','ignore')
+#			print "BLOCK:", block.text().encode('ascii','ignore')
-			print "---"
+#			print "---"
-		print "========================================"
+#		print "========================================"
-		# Try to find out which of the blocks contains customer information
+#		# Try to find out which of the blocks contains customer information
-
+#
-		# This rect, picks up the first third of an A4 paper size.
+#		# This rect, picks up the first third of an A4 paper size.
-		top = QRectF( 0, 0, 210, 99 )
+#		top = QRectF( 0, 0, 210, 99 )
-		tops = []
+#		tops = []
-		for block in blocks:
+#		for block in blocks:
-			if block.rect().intersects( top ):
+#			if block.rect().intersects( top ):
-				tops.append( block )
+#				tops.append( block )
-		# Once we have all the blocks of the first third of the paper
+#		# Once we have all the blocks of the first third of the paper
-		# try to guess which of them is the good one.
+#		# try to guess which of them is the good one.
-
+#
-		# Remove those blocks too wide
+#		# Remove those blocks too wide
-		sized = []
+#		sized = []
-		for block in tops:
+#		for block in tops:
-			if block.width() < 120:
+#			if block.rect().width() < 120:
-				sized.append( block )
+#				sized.append( block )
 		return result
@ -109,34 +163,42 @@ class InvoiceRecognizer:
 			value = Levenshtein.levenshtein( text, textToFind )
 			ran.distance = value
 		ranges.sort( rangeDistanceComparison )
-		if ranges:
+		return ranges
 			return ranges[0]
 		else:
 			return None
 	def findTagValue(self, tag):
 		ranges = []
 		for tagData in InvoiceRecognizer.Tags[tag]['tag']:
-			ran = self.findText( tagData )
+			ranges += self.findText( tagData )
-			if ran:
+			#ran = self.findText( tagData )
-				ranges.append( ran )
+			#if ran:
-		ranges.sort( rangeDistanceComparison )
+				#ranges.append( ran )
-		#ran = ranges[0]
+		#ranges.sort( rangeDistanceComparison )
-		distance = ranges[0].distance
+		#distance = ranges[0].distance
-		sameDistance = [x for x in ranges if x.distance == distance]
+		#sameDistance = [x for x in ranges if x.distance == distance]
-		sameDistance.sort( rangeLengthComparison )
+		#sameDistance.sort( rangeLengthComparison )
-		#print "SECOND 5: ", [x.text().encode('ascii','ignore') for x in sameDistance[:5]]
+		#ran = sameDistance[-1]
-		ran = sameDistance[-1]
+		ranges.sort( rangeDistanceLengthRatioComparison )
-
+		print "RANGES FOR TAG: %s\n%s" % ( tag, [ran.text().encode('ascii','replace') for ran in ranges[:20]] )
 		for ran in ranges[:5]:
 			print "RANGE FOR TAG %s: %s" % ( tag, ran.text().encode('ascii','ignore') )
 			value = self.findTagValueFromRange( tag, ran )
 			if value:
 				return value
 		return None
 	def findTagValueFromRange(self, tag, ran):
 		# Extract text on the right
-		line = self.formatedLine( self.textLines[ ran.line ] )
+		#line = self.formatedLine( self.textLines[ ran.line ] )
-		rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
+		#rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
-		print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
+		line = self.textLines[ ran.line ]
-		print "rightValue: ", rightValue.encode('ascii','ignore')
+		line = line[ran.pos+ran.length+1:]
-		print "SAME LINE: ", line.encode('ascii','ignore')
+		rightValue = Block.extractAllBlocksFromDocument( [ line ] )[0].text()
 		#print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
 		print "rightValue: ", rightValue.encode('ascii','replace')
 		#print "SAME LINE: ", line.encode('ascii','ignore')
 		# Extract text on the bottom
 		if ran.line < len(self.textLines)-1:
@ -150,30 +212,40 @@ class InvoiceRecognizer:
 					bottomValue += c.character
 		else:
 			bottomValue = u''
 		print "bottomValue: ", bottomValue.encode('ascii','replace')
 		# Decide which of both values match the given tag type
 		type = InvoiceRecognizer.Tags[ tag ][ 'type' ]
 		value = None
 		if type == 'numeric':
 			if isFloat( rightValue ):
-				return textToFloat( rightValue )
+				value = textToFloat( rightValue )
 			elif isFloat( bottomValue ):
-				return textToFloat( bottomValue )
+				value = textToFloat( bottomValue )
 			else:
 				return None
 		elif type == 'date':
 			if isDate( rightValue ):
-				return textToDate( rightValue )
+				value = textToDate( rightValue )
 			elif isDate( bottomValue ):
-				return textToDate( bottomValue )
+				value = textToDate( bottomValue )
 			else:
 				return None
 		elif type == 'mostly-numeric':
 			if isMostlyNumeric( rightValue ):
-				return rightValue
+				value = textToMostlyNumeric( rightValue )
 			elif isMostlyNumeric( bottomValue ):
-				return bottomValue
+				value = textToMostlyNumeric( bottomValue )
 		elif type == 'vat':
 			if isVat( rightValue ):
 				value = textToVat( rightValue )
 			elif isVat( bottomValue ):
 				value = textToVat( bottomValue )
 		elif type == 'page-number':
 			if isPageNumber( rightValue ):
 				value = textToPageNumber( rightValue )
 			elif isPageNumber( bottomValue ):
 				value = textToPageNumber( bottomValue )
 		else:
-				return rightValue
+			value = rightValue
-		else:
+
-			return rightValue
+		if not value and 'fallback' in InvoiceRecognizer.Tags[ tag ]:
 			value = InvoiceRecognizer.Tags[ tag ]['fallback']( self )
 		return value
--- a/NanScan/Range.py
+++ b/NanScan/Range.py
@ -37,6 +37,24 @@ def rangeLengthComparison(x, y):
 	else:
 		return 0
 ## @brief Comparison function that orders ranges by the ratio between their
 # distance and the length of their text.
 #
 # The score of a range is (1 + distance) / length of its text; a range with
 # an empty text gets a score of 999. Lower scores go first.
 # @param x First range (must provide text() and distance).
 # @param y Second range (must provide text() and distance).
 # @return 1 if the score of x is larger, -1 if it is smaller, 0 if equal.
 def rangeDistanceLengthRatioComparison(x, y):
 	def score( ran ):
 		length = len( ran.text() )
 		if not length:
 			return 999
 		return (1.0/length) + float( ran.distance ) / length

 	first = score( x )
 	second = score( y )
 	if first > second:
 		return 1
 	if first < second:
 		return -1
 	return 0
 ## @brief This class represents a group of characters in a document.
 class Range:
 	def __init__(self):
--- a/NanScan/TextPatterns.py
+++ b/NanScan/TextPatterns.py
@ -1,3 +1,4 @@
 # encoding: iso-8859-1
 #   Copyright (C) 2009 by Albert Cervera i Areny
 #   albert@nan-tic.com
 #
@ -17,6 +18,7 @@
 #   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
 from PyQt4.QtCore import *
 import re
 def textToFloat( value ):
 	if ',' in value and '.' in value:
@ -30,6 +32,11 @@ def textToFloat( value ):
 		newValue = value.replace( ',', '.' )
 	else:
 		newValue = value
 	# Remove spaces
 	newValue = newValue.replace( ' ', '' )
 	# Remove possible coin symbol in the end
 	if not newValue[-1] in '0123456789':
 		newValue = newValue[:-1]
 	return float( newValue )
 def isFloat( value ):
@ -44,14 +51,51 @@ def isDate( value ):
 	return date.isValid()
 def textToDate( value ):
-	patterns = ['dd/MM/yyyy', 'dd-MM-yyyy', 'dd-MM-yy', 'dd MMM. yy', 'dd MMMM yyyy', 'dd.mm.yyyy']
+	value = value.replace(' ','')
 	value = textMonthToNumber( value )
 	# Replace texts for cases such as '25 de juny de 2009'
 	value = re.sub( r'[a-z]', '', value )
 	value = re.sub( r'[A-Z]', '', value )
 	patterns = [
 		'dd/MM/yyyy', 'dd-MM-yyyy', 'dd/MM/yy', 'dd-MM-yy', 
 		'd/MM/yyyy', 'd-MM-yyyy', 'd/MM/yy', 'd-MM-yy',
 		'dd.mm.yyyy', 'dd.mm.yy', 'd.mm.yyyy', 'd.mm.yy']
 	for pattern in patterns:
-		date = QDate.fromString( value.replace(' ',''), pattern )
+		date = QDate.fromString( value, pattern )
 		if date.isValid():
 			# If only two digits were used to specify the year 
 			# it probably meant 200x or 20xx not 190x or 19xx 
 			# (which is what QDate interprets).
 			if date.year() < 1930 and not 'yyyy' in pattern:
 				date = date.addYears( 100 )
 			return date
 	return QDate()
 ## @brief Replaces the month names found in 'value' with '/NN/', where NN is
 # the two digit number of the month.
 #
 # Catalan, Spanish and English names and abbreviations are known. A name that
 # is directly followed by a dot (such as 'dec.') is replaced together with the
 # dot. Matching is case sensitive and all known names are lower case.
 # @param value Text that may contain month names.
 # @return The text with every known month name replaced.
 def textMonthToNumber( value ):
 	months = [
 		('gen', '01'), ('gener', '01'), ('enero', '01'), ('january', '01'),
 		('feb', '02'), ('febrer', '02'), ('febrero', '02'), ('february', '02'),
 		('mar', '03'), ('marc', '03'), ('marzo', '03'), ('march', '03'),
 		('abr', '04'), ('apr', '04'), ('abril', '04'), ('april', '04'),
 		('mai', '05'), ('may', '05'), ('maig', '05'), ('mayo', '05'),
 		('jun', '06'), ('juny', '06'), ('junio', '06'), ('june', '06'),
 		('jul', '07'),
 		# 'agost' is August in Catalan, so it maps to 08 like 'agosto'.
 		('ago', '08'), ('agost', '08'), ('agosto', '08'), ('august', '08'),
 		('set', '09'), ('sep', '09'), ('setembre', '09'), ('september', '09'),
 		('oct', '10'), ('octubre', '10'), ('october', '10'),
 		('nov', '11'), ('novembre', '11'), ('noviembre', '11'),
 		('des', '12'), ('dec', '12'), ('desembre', '12'), ('diciembre', '12'), ('december', '12')
 	]
 	# Reverse alphabetical order: a name always comes before any other name
 	# that is a prefix of it ('setembre' before 'set'), so the full name is
 	# replaced before its abbreviation gets a chance to match part of it.
 	months.sort( key=lambda a: a[0], reverse=True )
 	v = value
 	for name, number in months:
 		replacement = u'/%s/' % number
 		# Try to replace twice. Sometimes instead of 'dec' we see 'dec.'
 		v = v.replace( u'%s.' % name, replacement )
 		v = v.replace( name, replacement )
 	return v
 def isMostlyNumeric( text ):
 	text = text.replace(' ','')
 	numbers = 0
 	for x in text:
 		if x in '0123456789':
@ -61,3 +105,54 @@ def isMostlyNumeric( text ):
 	else:
 		return False
 ## @brief Returns 'text' with all its space characters removed.
 def textToMostlyNumeric( text ):
 	return text.replace( ' ', '' )
 ## @brief Tells whether a VAT number can be obtained from 'text'.
 # @return True if textToVat() returns a non empty value, False otherwise.
 def isVat( text ):
 	return bool( textToVat( text ) )
 ## @brief Converts 'text' into a VAT number.
 #
 # Spaces are removed and the text is upper cased. The result is accepted if
 # it is a letter followed by eight digits or eight digits followed by a letter.
 # @return The converted text, or an empty string if it has neither form.
 def textToVat( text ):
 	candidate = text.replace( ' ', '' ).upper()
 	for expression in ( '^[A-Z][0-9]{8}$', '^[0-9]{8}[A-Z]$' ):
 		if re.search( expression, candidate ):
 			return candidate
 	return ''
 ## @brief Tells whether a page number can be obtained from 'text'.
 # @return True if textToPageNumber() returns a value, False if it returns None.
 def isPageNumber( text ):
 	return bool( textToPageNumber( text ) )
 ## @brief Extracts the current page and the total number of pages from texts
 # such as '1 / 3'.
 #
 # The first group of consecutive digits is taken as the current page and the
 # second one, if present, as the total. Both are converted with textToFloat().
 # @return A (current, total) tuple where total is None if there is only one
 # group of digits. None if there are no digits or the current page is zero.
 def textToPageNumber( text ):
 	numbers = re.findall( r'[0-9]+', text )
 	current = None
 	total = None
 	if numbers:
 		current = textToFloat( numbers[0] )
 	if len( numbers ) > 1:
 		total = textToFloat( numbers[1] )
 	if not current:
 		return None
 	return (current, total)