Several improvements in invoice recognition:

Added new types, improved date recognition, improved performance.
This commit is contained in:
Albert Cervera i Areny 2009-03-23 16:09:11 +01:00
parent 4119ec747c
commit 41224310da
4 changed files with 281 additions and 79 deletions

View File

@ -58,15 +58,23 @@ class Block:
self.document = None
self._boxes = []
self.outerDistance = 2.5
self._rect = None
self._outerRect = None
## @brief Replaces the list of boxes held by the block.
#
# The cached rectangles are computed from the boxes, so they are discarded
# whenever the list changes.
def setBoxes(self, boxes):
    self._boxes = boxes
    # Drop cached geometry computed from the previous boxes.
    self.invalidateCache()
## @brief Returns the list of boxes currently held by the block.
#
# The internal list itself is returned, not a copy.
def boxes(self):
    current = self._boxes
    return current
## @brief Appends a box to the block and discards the cached rectangles.
def addBox(self, box):
    container = self._boxes
    container.append( box )
    # Cached geometry no longer covers the new box.
    self.invalidateCache()
## @brief Removes a box from the block and discards the cached rectangles.
#
# list.remove() is used, so a ValueError is raised if the box is not present.
def removeBox(self, box):
    container = self._boxes
    container.remove( box )
    # Cached geometry may still include the removed box.
    self.invalidateCache()
## @brief Returns the number of boxes held by the block.
def count(self):
    total = len( self._boxes )
    return total
@ -75,20 +83,30 @@ class Block:
## @brief Returns the text of the block.
#
# Simply forwards to formatedText().
def text(self):
    formatted = self.formatedText()
    return formatted
## @brief Discards the cached rectangles.
#
# Both cached values are reset to None so rect() and outerRect() compute
# them again the next time they are called.
def invalidateCache(self):
    self._rect = self._outerRect = None
## @brief Returns the bounding rectangle of the text in the block.
#
# The rectangle is the union of the 'box' attribute of every box in the
# block. The result is cached in self._rect until invalidateCache() resets
# it to None.
#
# A copy of the cached rectangle is returned: QRectF objects are mutable,
# and callers such as outerRect() modify the rectangle they receive, which
# would otherwise silently corrupt the cache.
def rect(self):
    # Test against None rather than truthiness so that an empty rectangle
    # (block without boxes) still counts as a cached value.
    if self._rect is None:
        bounding = QRectF()
        for c in self._boxes:
            bounding = bounding.united( c.box )
        self._rect = bounding
    return QRectF( self._rect )
## @brief Returns a bounding rectangle of the text in the block that is 'outerDistance'
# larger in all sides.
#
# The result is cached in self._outerRect until invalidateCache() resets it
# to None.
def outerRect(self):
    if self._outerRect is None:
        margin = self.outerDistance
        # rect() may hand back its cached object, so it must not be modified
        # in place (translate/setWidth/setHeight would enlarge the cached
        # bounding rectangle too). adjusted() builds a new rectangle moved
        # 'margin' up/left and grown 'margin' on every side instead.
        self._outerRect = self.rect().adjusted( -margin, -margin, margin, margin )
    # Return a copy so callers cannot alter the cached value.
    return QRectF( self._outerRect )
## @brief Returns a list with all possible ranges of size length of the
@ -117,7 +135,7 @@ class Block:
block.document = lines
blocks.append( block )
for char in line:
if char.character != u' ':
if char.character != u' ' or block.count() == 0:
block.addBox( char )
else:
avgWidth = block.rect().width() / block.count()
@ -234,7 +252,6 @@ class Block:
words.append( line[spacesToAdd[-1]:count] )
spacesToAdd.append( count )
# c.character is already a unicode string
left = c.box.right()
width += c.box.width()
count += 1

View File

@ -1,3 +1,4 @@
# encoding: iso-8859-1
# Copyright (C) 2009 by Albert Cervera i Areny
# albert@nan-tic.com
#
@ -22,77 +23,130 @@ from NanScan.Range import *
from NanScan.Block import *
from NanScan.TextPatterns import *
## @brief Fallback used when no date tag could be matched: looks for a date
# anywhere in the document.
#
# Every range of size 10 extracted from recognizer.textLines is tested with
# isDate(); the first one accepted is converted with textToDate() and
# returned. Returns None if no range is accepted.
def findDate( recognizer ):
    candidates = Range.extractAllRangesFromDocument( recognizer.textLines, 10 )
    for candidate in candidates:
        text = candidate.text()
        if not isDate( text ):
            continue
        return textToDate( text )
    return None
## @brief Fallback used when no VAT tag could be matched: looks for a VAT
# number anywhere in the document.
#
# Every range of size 10 extracted from recognizer.textLines is tested with
# isVat(); the first one accepted is converted with textToVat() and
# returned. Returns None if no range is accepted.
def findVat( recognizer ):
    candidates = Range.extractAllRangesFromDocument( recognizer.textLines, 10 )
    for candidate in candidates:
        text = candidate.text()
        if not isVat( text ):
            continue
        return textToVat( text )
    return None
class InvoiceRecognizer:
Tags = {
'number': {
'tag': [
u'factura',
u'numero factura',
u'factura numero',
u'num. de factura',
u'factura num.'
u'factura num.',
u'nº factura',
u'factura núm.',
u'factura',
u'número de factura'
],
'type': 'mostly-numeric'
},
'date': {
'tag': [
u'fecha',
u'fecha de factura'
u'fecha factura',
u'fecha emision',
u'data factura'
u'fecha',
u'data:',
u'data',
u'data factura'
],
'type': 'date'
# With dates we need to be able to find a date with
# the format '1 Sep. 2009'. Also we need to find the
# date without a tag. Something like:
#
# 'fallback': functionName,
#
# might be appropriate for those cases in which the
# tag can't be found.
'type': 'date',
'fallback': findDate,
},
'amount': {
'base': {
'tag': [
u'base imponible',
u'base imposable',
u'total (base imposable)'
],
'type': 'numeric'
},
'taxes': {
'tag': [
u'IVA',
],
'type': 'numeric'
},
'total': {
'tag': [
u'total',
u'total factura',
u'total a pagar (euros)'
],
'type': 'numeric'
},
'vat': {
'tag': [
u'nif',
u'cif',
u'nif/cif',
u'nif:',
u'cif:',
u'nif/cif:',
u'nif :',
u'cif :',
u'nif/cif :',
],
'type': 'vat',
'fallback': findVat,
},
'pagina': {
'tag': [
u'pagina',
u'página',
u'pàgina',
u'pag.',
u'pàg.',
u'pág.'
],
'type': 'page-number'
}
}
def recognize(self, recognizer):
#text = recognizer.textInRegion('text')
analyzer = recognizer.analyzers['text']
self.textLines = analyzer.block.textLinesWithSpaces()
result = ''
for tag in InvoiceRecognizer.Tags:
result += 'Tag: %s, Value: %s\n' % ( tag, self.findTagValue( tag ) )
print "========================================"
blocks = Block.extractAllBlocksFromDocument( self.textLines )
for block in blocks:
print "---"
print "BLOCK:", block.text().encode('ascii','ignore')
print "---"
print "========================================"
# Try to find out which of the blocks contains customer information
# This rect, picks up the first third of an A4 paper size.
top = QRectF( 0, 0, 210, 99 )
tops = []
for block in blocks:
if block.rect().intersects( top ):
tops.append( block )
# Once we have all the blocks of the first third of the paper
# try to guess which of them is the good one.
# Remove those blocks too wide
sized = []
for block in tops:
if block.width() < 120:
sized.append( block )
# print "========================================"
# blocks = Block.extractAllBlocksFromDocument( self.textLines )
# for block in blocks:
# print "---"
# print "BLOCK:", block.text().encode('ascii','ignore')
# print "---"
# print "========================================"
# # Try to find out which of the blocks contains customer information
#
# # This rect, picks up the first third of an A4 paper size.
# top = QRectF( 0, 0, 210, 99 )
# tops = []
# for block in blocks:
# if block.rect().intersects( top ):
# tops.append( block )
# # Once we have all the blocks of the first third of the paper
# # try to guess which of them is the good one.
#
# # Remove those blocks too wide
# sized = []
# for block in tops:
# if block.rect().width() < 120:
# sized.append( block )
return result
@ -109,34 +163,42 @@ class InvoiceRecognizer:
value = Levenshtein.levenshtein( text, textToFind )
ran.distance = value
ranges.sort( rangeDistanceComparison )
if ranges:
return ranges[0]
else:
return None
return ranges
def findTagValue(self, tag):
ranges = []
for tagData in InvoiceRecognizer.Tags[tag]['tag']:
ran = self.findText( tagData )
if ran:
ranges.append( ran )
ranges.sort( rangeDistanceComparison )
#ran = ranges[0]
distance = ranges[0].distance
sameDistance = [x for x in ranges if x.distance == distance]
sameDistance.sort( rangeLengthComparison )
#print "SECOND 5: ", [x.text().encode('ascii','ignore') for x in sameDistance[:5]]
ran = sameDistance[-1]
ranges += self.findText( tagData )
#ran = self.findText( tagData )
#if ran:
#ranges.append( ran )
#ranges.sort( rangeDistanceComparison )
#distance = ranges[0].distance
#sameDistance = [x for x in ranges if x.distance == distance]
#sameDistance.sort( rangeLengthComparison )
#ran = sameDistance[-1]
ranges.sort( rangeDistanceLengthRatioComparison )
print "RANGES FOR TAG: %s\n%s" % ( tag, [ran.text().encode('ascii','replace') for ran in ranges[:20]] )
for ran in ranges[:5]:
print "RANGE FOR TAG %s: %s" % ( tag, ran.text().encode('ascii','ignore') )
value = self.findTagValueFromRange( tag, ran )
if value:
return value
return None
print "RANGE FOR TAG %s: %s" % ( tag, ran.text().encode('ascii','ignore') )
def findTagValueFromRange(self, tag, ran):
# Extract text on the right
line = self.formatedLine( self.textLines[ ran.line ] )
rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
print "rightValue: ", rightValue.encode('ascii','ignore')
print "SAME LINE: ", line.encode('ascii','ignore')
#line = self.formatedLine( self.textLines[ ran.line ] )
#rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
line = self.textLines[ ran.line ]
line = line[ran.pos+ran.length+1:]
rightValue = Block.extractAllBlocksFromDocument( [ line ] )[0].text()
#print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
print "rightValue: ", rightValue.encode('ascii','replace')
#print "SAME LINE: ", line.encode('ascii','ignore')
# Extract text on the bottom
if ran.line < len(self.textLines)-1:
@ -150,30 +212,40 @@ class InvoiceRecognizer:
bottomValue += c.character
else:
bottomValue = u''
print "bottomValue: ", bottomValue.encode('ascii','replace')
# Decide which of both values match the given tag type
type = InvoiceRecognizer.Tags[ tag ][ 'type' ]
value = None
if type == 'numeric':
if isFloat( rightValue ):
return textToFloat( rightValue )
value = textToFloat( rightValue )
elif isFloat( bottomValue ):
return textToFloat( bottomValue )
else:
return None
value = textToFloat( bottomValue )
elif type == 'date':
if isDate( rightValue ):
return textToDate( rightValue )
value = textToDate( rightValue )
elif isDate( bottomValue ):
return textToDate( bottomValue )
else:
return None
value = textToDate( bottomValue )
elif type == 'mostly-numeric':
if isMostlyNumeric( rightValue ):
return rightValue
value = textToMostlyNumeric( rightValue )
elif isMostlyNumeric( bottomValue ):
return bottomValue
else:
return rightValue
value = textToMostlyNumeric( bottomValue )
elif type == 'vat':
if isVat( rightValue ):
value = textToVat( rightValue )
elif isVat( bottomValue ):
value = textToVat( bottomValue )
elif type == 'page-number':
if isPageNumber( rightValue ):
value = textToPageNumber( rightValue )
elif isPageNumber( bottomValue ):
value = textToPageNumber( bottomValue )
else:
return rightValue
value = rightValue
if not value and 'fallback' in InvoiceRecognizer.Tags[ tag ]:
value = InvoiceRecognizer.Tags[ tag ]['fallback']( self )
return value

View File

@ -37,6 +37,24 @@ def rangeLengthComparison(x, y):
else:
return 0
## @brief cmp-style comparison of two ranges by distance relative to text length.
#
# Each range is scored as 1/len(text) + distance/len(text), so for the same
# distance a longer text gets a lower score. Ranges with empty text get a
# fixed score of 999. Returns 1, -1 or 0 when the score of x is greater
# than, lower than or equal to the score of y.
def rangeDistanceLengthRatioComparison(x, y):
    def score( ran ):
        size = len( ran.text() )
        if not size:
            # Empty text: avoid dividing by zero.
            return 999
        return (1.0/size) + float( ran.distance ) / size

    xl = score( x )
    yl = score( y )
    # Booleans subtract as integers: gives 1, -1 or 0.
    return (xl > yl) - (xl < yl)
## @brief This class represents a group of characters in a document.
class Range:
def __init__(self):

View File

@ -1,3 +1,4 @@
# encoding: iso-8859-1
# Copyright (C) 2009 by Albert Cervera i Areny
# albert@nan-tic.com
#
@ -17,6 +18,7 @@
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from PyQt4.QtCore import *
import re
def textToFloat( value ):
if ',' in value and '.' in value:
@ -30,6 +32,11 @@ def textToFloat( value ):
newValue = value.replace( ',', '.' )
else:
newValue = value
# Remove spaces
newValue = newValue.replace( ' ', '' )
# Remove possible coin symbol in the end
if not newValue[-1] in '0123456789':
newValue = newValue[:-1]
return float( newValue )
def isFloat( value ):
@ -44,14 +51,51 @@ def isDate( value ):
return date.isValid()
## @brief Converts the given text into a QDate.
#
# Spaces are removed, month names are replaced by their number using
# textMonthToNumber() and any remaining letter is dropped. The result is
# then tried against a list of numeric day/month/year patterns.
#
# Returns the first valid date found, or an invalid (default constructed)
# QDate if no pattern matches.
def textToDate( value ):
    value = value.replace(' ','')
    value = textMonthToNumber( value )
    # Remove remaining letters for cases such as '25 de juny de 2009'
    value = re.sub( r'[a-zA-Z]', '', value )
    # textMonthToNumber() surrounds the month number with '/', which ends up
    # next to any separator already present ('25/juny/2009' -> '25//06//2009',
    # '25-juny-2009' -> '25-/06/-2009'). Reduce every separator run that
    # contains a '/' to a single '/'. Plain numeric dates are not affected.
    value = re.sub( r'[-./]*/[-./]*', '/', value )
    # 'MM' is the month specifier; 'mm' is not a date specifier and would be
    # treated as literal text, so dotted dates would never match.
    patterns = [
        'dd/MM/yyyy', 'dd-MM-yyyy', 'dd/MM/yy', 'dd-MM-yy',
        'd/MM/yyyy', 'd-MM-yyyy', 'd/MM/yy', 'd-MM-yy',
        'dd.MM.yyyy', 'dd.MM.yy', 'd.MM.yyyy', 'd.MM.yy']
    for pattern in patterns:
        date = QDate.fromString( value, pattern )
        if date.isValid():
            # If only two digits were used to specify year
            # it probably meant 200x or 20xx not 190x or 19xx
            # (which is what QDate interprets).
            if date.year() < 1930 and 'yyyy' not in pattern:
                date = date.addYears( 100 )
            return date
    return QDate()
## @brief Replaces month names found in the text by '/NN/', where NN is the
# two digit month number.
#
# Catalan, Spanish and English names and abbreviations are recognized,
# regardless of case. A dot following the name (as in 'dec.') is replaced
# together with it. Text that does not contain a month name is returned
# unchanged.
def textMonthToNumber( value ):
    months = [
        ('gen', '01'), ('gener', '01'), ('enero', '01'), ('january', '01'),
        ('feb', '02'), ('febrer', '02'), ('febrero', '02'), ('february', '02'),
        ('mar', '03'), ('marc', '03'), ('marzo', '03'), ('march', '03'),
        ('abr', '04'), ('apr', '04'), ('abril', '04'), ('april', '04'),
        ('mai', '05'), ('may', '05'), ('maig', '05'), ('mayo', '05'),
        ('jun', '06'), ('juny', '06'), ('junio', '06'), ('june', '06'),
        ('jul', '07'), ('juliol', '07'), ('julio', '07'), ('july', '07'),
        ('ago', '08'), ('agost', '08'), ('agosto', '08'), ('august', '08'),
        ('set', '09'), ('sep', '09'), ('setembre', '09'), ('septiembre', '09'), ('september', '09'),
        ('oct', '10'), ('octubre', '10'), ('october', '10'),
        ('nov', '11'), ('novembre', '11'), ('noviembre', '11'), ('november', '11'),
        ('des', '12'), ('dec', '12'), ('desembre', '12'), ('diciembre', '12'), ('december', '12')
    ]
    numbers = dict( months )
    # Longest names first, so a full name is always preferred over the
    # abbreviation it starts with ('agosto' before 'agost' before 'ago').
    names = sorted( numbers, key=len, reverse=True )
    # Sometimes instead of 'dec' we see 'dec.', hence the optional dot.
    expression = re.compile(
        u'(%s)\\.?' % u'|'.join( [re.escape( name ) for name in names] ),
        re.IGNORECASE
    )
    return expression.sub(
        lambda match: u'/%s/' % numbers[ match.group( 1 ).lower() ],
        value
    )
def isMostlyNumeric( text ):
text = text.replace(' ','')
numbers = 0
for x in text:
if x in '0123456789':
@ -61,3 +105,54 @@ def isMostlyNumeric( text ):
else:
return False
## @brief Returns the given text with all space characters removed.
def textToMostlyNumeric( text ):
    pieces = text.split( ' ' )
    return ''.join( pieces )
## @brief Returns True if textToVat() returns a non-empty value for the
# given text, False otherwise.
def isVat( text ):
    return bool( textToVat( text ) )
## @brief Normalizes the given text as a VAT number.
#
# Spaces are removed and the text is converted to upper case. The result is
# returned if it consists of one letter followed by eight digits, or of
# eight digits followed by one letter. Otherwise an empty string is
# returned.
def textToVat( text ):
    candidate = text.replace( ' ', '' ).upper()
    for expression in ( '^[A-Z][0-9]{8}$', '^[0-9]{8}[A-Z]$' ):
        if re.search( expression, candidate ):
            return candidate
    return ''
## @brief Returns True if textToPageNumber() returns a value for the given
# text, False otherwise.
def isPageNumber( text ):
    return bool( textToPageNumber( text ) )
## @brief Extracts page information from the given text.
#
# The first two runs of consecutive digits found in the text are converted
# with textToFloat(): the first one is taken as the current page and the
# second one, if present, as the total.
#
# Returns a (current, total) tuple, where total is None if the text has a
# single run of digits. Returns None if the text has no digits or the
# current page evaluates to zero.
def textToPageNumber( text ):
    # Each match is one run of consecutive digits, in order of appearance.
    numbers = re.findall( '[0-9]+', text )
    current = None
    total = None
    if numbers:
        current = textToFloat( numbers[0] )
    if len( numbers ) > 1:
        total = textToFloat( numbers[1] )
    if not current:
        return None
    return (current, total)