mirror of https://github.com/NaN-tic/nanscan.git
Several improvements in invoice recognition:
Added new types, improved date recognition, improved performance.
This commit is contained in:
parent
4119ec747c
commit
41224310da
|
@ -58,15 +58,23 @@ class Block:
|
|||
self.document = None
|
||||
self._boxes = []
|
||||
self.outerDistance = 2.5
|
||||
self._rect = None
|
||||
self._outerRect = None
|
||||
|
||||
## @brief Replaces the list of boxes held by the block.
#
# The cached rectangles are discarded because they were computed from the
# previous list.
def setBoxes(self, boxes):
    self._boxes = boxes
    self.invalidateCache()
|
||||
|
||||
## @brief Returns the list of boxes held by the block (the list itself, not
# a copy).
def boxes(self):
    return self._boxes
|
||||
|
||||
## @brief Appends a box to the block and discards the cached rectangles.
def addBox(self, box):
    self._boxes.append( box )
    self.invalidateCache()
|
||||
|
||||
## @brief Removes a box from the block and discards the cached rectangles.
#
# list.remove() raises ValueError if the box is not in the block.
def removeBox(self, box):
    self._boxes.remove( box )
    self.invalidateCache()
|
||||
|
||||
## @brief Returns the number of boxes in the block.
def count(self):
    return len( self._boxes )
|
||||
|
@ -75,20 +83,30 @@ class Block:
|
|||
## @brief Returns the text of the block. Shortcut for formatedText().
def text(self):
    return self.formatedText()
|
||||
|
||||
## @brief Forgets the values cached by rect() and outerRect() so they are
# computed again the next time they are requested.
def invalidateCache(self):
    self._rect = self._outerRect = None
|
||||
|
||||
## @brief Returns the bounding rectangle of the boxes in the block.
#
# The result is cached in self._rect until invalidateCache() is called.
# The cached QRectF object itself is returned, so callers must treat it as
# read-only.
def rect(self):
    # Test against None explicitly so that the cache does not depend on how
    # a QRectF evaluates as a boolean.
    if self._rect is not None:
        return self._rect
    bounding = QRectF()
    for c in self._boxes:
        bounding = bounding.united( c.box )
    self._rect = bounding
    return self._rect
|
||||
|
||||
## @brief Returns a bounding rectangle of the text in the block that is
# 'outerDistance' larger in all sides.
#
# The result is cached in self._outerRect until invalidateCache() is called.
def outerRect(self):
    if self._outerRect is not None:
        return self._outerRect
    distance = self.outerDistance
    # adjusted() returns a new rectangle. Translating and resizing the object
    # returned by rect() in place would modify the rectangle rect() keeps in
    # its cache, so rect() would report the enlarged area from then on.
    self._outerRect = self.rect().adjusted( -distance, -distance, distance, distance )
    return self._outerRect
|
||||
|
||||
## @brief Returns a list with all possible ranges of size length of the
|
||||
|
@ -117,7 +135,7 @@ class Block:
|
|||
block.document = lines
|
||||
blocks.append( block )
|
||||
for char in line:
|
||||
if char.character != u' ':
|
||||
if char.character != u' ' or block.count() == 0:
|
||||
block.addBox( char )
|
||||
else:
|
||||
avgWidth = block.rect().width() / block.count()
|
||||
|
@ -234,7 +252,6 @@ class Block:
|
|||
words.append( line[spacesToAdd[-1]:count] )
|
||||
spacesToAdd.append( count )
|
||||
|
||||
# c.character is already a unicode string
|
||||
left = c.box.right()
|
||||
width += c.box.width()
|
||||
count += 1
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
# encoding: iso-8859-1
|
||||
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||
# albert@nan-tic.com
|
||||
#
|
||||
|
@ -22,77 +23,130 @@ from NanScan.Range import *
|
|||
from NanScan.Block import *
|
||||
from NanScan.TextPatterns import *
|
||||
|
||||
## @brief Fallback of the 'date' tag: returns the first date found among the
# ranges of the document.
#
# @param recognizer Object whose textLines attribute holds the document.
# @return The result of textToDate() for the first range accepted by
#         isDate(), or None if no range is a date.
def findDate( recognizer ):
    # 10 is presumably the length of the ranges to extract -- TODO confirm
    # against Range.extractAllRangesFromDocument().
    for ran in Range.extractAllRangesFromDocument( recognizer.textLines, 10 ):
        # Build the text of the range only once per candidate.
        text = ran.text()
        if isDate( text ):
            return textToDate( text )
    return None
|
||||
|
||||
## @brief Fallback of the 'vat' tag: returns the first VAT code found among
# the ranges of the document.
#
# @param recognizer Object whose textLines attribute holds the document.
# @return The result of textToVat() for the first range accepted by isVat(),
#         or None if no range is a VAT code.
def findVat( recognizer ):
    # 10 is presumably the length of the ranges to extract -- TODO confirm
    # against Range.extractAllRangesFromDocument().
    for ran in Range.extractAllRangesFromDocument( recognizer.textLines, 10 ):
        # Build the text of the range only once per candidate.
        text = ran.text()
        if isVat( text ):
            return textToVat( text )
    return None
|
||||
|
||||
class InvoiceRecognizer:
|
||||
# Table that drives the recognition. Each key is the name of a field of the
# invoice and its value is a dictionary with:
#
#  'tag':      List of labels that may precede the value in the document.
#  'type':     Kind of value expected. findTagValueFromRange() uses it to
#              choose the is*() / textTo*() pair that validates and converts
#              the candidate text.
#  'fallback': Optional function that receives the InvoiceRecognizer and is
#              used when no valid value is found next to the label.
#
# Every label must be followed by a comma: two adjacent string literals are
# silently concatenated into a single (useless) label.
Tags = {
    'number': {
        'tag': [
            u'factura',
            u'numero factura',
            u'factura numero',
            u'num. de factura',
            u'factura num.',
            u'nº factura',
            u'factura núm.',
            u'número de factura',
        ],
        'type': 'mostly-numeric',
    },
    'date': {
        'tag': [
            u'fecha',
            u'fecha de factura',
            u'fecha factura',
            u'fecha emision',
            u'data:',
            u'data',
            u'data factura',
        ],
        # Dates may be written as '1 Sep. 2009' or appear without any label,
        # so a fallback function is used when the label can't be found.
        'type': 'date',
        'fallback': findDate,
    },
    'base': {
        'tag': [
            u'base imponible',
            u'base imposable',
            u'total (base imposable)',
        ],
        'type': 'numeric',
    },
    'taxes': {
        'tag': [
            u'IVA',
        ],
        'type': 'numeric',
    },
    'total': {
        'tag': [
            u'total',
            u'total factura',
            u'total a pagar (euros)',
        ],
        'type': 'numeric',
    },
    'vat': {
        'tag': [
            u'nif',
            u'cif',
            u'nif/cif',
            u'nif:',
            u'cif:',
            u'nif/cif:',
            u'nif :',
            u'cif :',
            u'nif/cif :',
        ],
        'type': 'vat',
        'fallback': findVat,
    },
    'pagina': {
        'tag': [
            u'pagina',
            u'página',
            u'pàgina',
            u'pag.',
            u'pàg.',
            u'pág.',
        ],
        'type': 'page-number',
    },
}
|
||||
|
||||
## @brief Searches the document for the value of every entry of
# InvoiceRecognizer.Tags.
#
# The lines of the document are stored in self.textLines, where the other
# methods of the class read them.
#
# @param recognizer Object providing analyzers['text'].block, whose
#        textLinesWithSpaces() returns the lines to work with.
# @return A string with one 'Tag: <name>, Value: <value>' line per tag.
def recognize(self, recognizer):
    analyzer = recognizer.analyzers['text']
    self.textLines = analyzer.block.textLinesWithSpaces()

    lines = []
    for tag in InvoiceRecognizer.Tags:
        lines.append( 'Tag: %s, Value: %s\n' % ( tag, self.findTagValue( tag ) ) )

    # TODO: Detection of the block with the customer information is disabled.
    # The idea was to take the blocks returned by
    # Block.extractAllBlocksFromDocument( self.textLines ) whose rect()
    # intersects QRectF( 0, 0, 210, 99 ) (first third of an A4 page) and
    # keep those with rect().width() < 120 as candidates.

    return ''.join( lines )
|
||||
|
||||
|
@ -109,34 +163,42 @@ class InvoiceRecognizer:
|
|||
value = Levenshtein.levenshtein( text, textToFind )
|
||||
ran.distance = value
|
||||
ranges.sort( rangeDistanceComparison )
|
||||
if ranges:
|
||||
return ranges[0]
|
||||
else:
|
||||
return None
|
||||
|
||||
return ranges
|
||||
|
||||
## @brief Returns the value found in the document for the given tag, or None.
#
# The ranges found by findText() for every label of the tag are sorted with
# rangeDistanceLengthRatioComparison and the five best ones are tried in
# order with findTagValueFromRange(). The first value that evaluates to true
# is returned.
#
# @param tag Key of InvoiceRecognizer.Tags.
def findTagValue(self, tag):
    ranges = []
    for tagData in InvoiceRecognizer.Tags[tag]['tag']:
        # findText() is expected to return a list of ranges, so the lists of
        # all labels are concatenated before ranking them.
        ranges += self.findText( tagData )
    ranges.sort( rangeDistanceLengthRatioComparison )

    # Debug output. A single parenthesised argument prints the same text
    # whether print is a statement or a function.
    print( "RANGES FOR TAG: %s\n%s" % ( tag, [x.text().encode('ascii','replace') for x in ranges[:20]] ) )
    for ran in ranges[:5]:
        print( "RANGE FOR TAG %s: %s" % ( tag, ran.text().encode('ascii','ignore') ) )
        value = self.findTagValueFromRange( tag, ran )
        if value:
            return value
    return None
|
||||
|
||||
def findTagValueFromRange(self, tag, ran):
|
||||
|
||||
# Extract text on the right
|
||||
line = self.formatedLine( self.textLines[ ran.line ] )
|
||||
rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
|
||||
print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
|
||||
print "rightValue: ", rightValue.encode('ascii','ignore')
|
||||
print "SAME LINE: ", line.encode('ascii','ignore')
|
||||
#line = self.formatedLine( self.textLines[ ran.line ] )
|
||||
#rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
|
||||
line = self.textLines[ ran.line ]
|
||||
line = line[ran.pos+ran.length+1:]
|
||||
rightValue = Block.extractAllBlocksFromDocument( [ line ] )[0].text()
|
||||
|
||||
|
||||
#print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
|
||||
print "rightValue: ", rightValue.encode('ascii','replace')
|
||||
#print "SAME LINE: ", line.encode('ascii','ignore')
|
||||
|
||||
# Extract text on the bottom
|
||||
if ran.line < len(self.textLines)-1:
|
||||
|
@ -150,30 +212,40 @@ class InvoiceRecognizer:
|
|||
bottomValue += c.character
|
||||
else:
|
||||
bottomValue = u''
|
||||
print "bottomValue: ", bottomValue.encode('ascii','replace')
|
||||
|
||||
# Decide which of both values match the given tag type
|
||||
type = InvoiceRecognizer.Tags[ tag ][ 'type' ]
|
||||
value = None
|
||||
if type == 'numeric':
|
||||
if isFloat( rightValue ):
|
||||
return textToFloat( rightValue )
|
||||
value = textToFloat( rightValue )
|
||||
elif isFloat( bottomValue ):
|
||||
return textToFloat( bottomValue )
|
||||
else:
|
||||
return None
|
||||
value = textToFloat( bottomValue )
|
||||
elif type == 'date':
|
||||
if isDate( rightValue ):
|
||||
return textToDate( rightValue )
|
||||
value = textToDate( rightValue )
|
||||
elif isDate( bottomValue ):
|
||||
return textToDate( bottomValue )
|
||||
else:
|
||||
return None
|
||||
value = textToDate( bottomValue )
|
||||
elif type == 'mostly-numeric':
|
||||
if isMostlyNumeric( rightValue ):
|
||||
return rightValue
|
||||
value = textToMostlyNumeric( rightValue )
|
||||
elif isMostlyNumeric( bottomValue ):
|
||||
return bottomValue
|
||||
value = textToMostlyNumeric( bottomValue )
|
||||
elif type == 'vat':
|
||||
if isVat( rightValue ):
|
||||
value = textToVat( rightValue )
|
||||
elif isVat( bottomValue ):
|
||||
value = textToVat( bottomValue )
|
||||
elif type == 'page-number':
|
||||
if isPageNumber( rightValue ):
|
||||
value = textToPageNumber( rightValue )
|
||||
elif isPageNumber( bottomValue ):
|
||||
value = textToPageNumber( bottomValue )
|
||||
else:
|
||||
return rightValue
|
||||
else:
|
||||
return rightValue
|
||||
value = rightValue
|
||||
|
||||
if not value and 'fallback' in InvoiceRecognizer.Tags[ tag ]:
|
||||
value = InvoiceRecognizer.Tags[ tag ]['fallback']( self )
|
||||
return value
|
||||
|
||||
|
|
|
@ -37,6 +37,24 @@ def rangeLengthComparison(x, y):
|
|||
else:
|
||||
return 0
|
||||
|
||||
## @brief Returns the score used to rank a range: (1 / length of its text)
# plus (distance / length of its text). Lower is better. Ranges without text
# get 999 so they are sorted last.
def _rangeDistanceLengthRatio(ran):
    length = len( ran.text() )
    if not length:
        return 999
    # Keep the two divisions separate: merging them could change the
    # floating point result and with it the order of near ties.
    return (1.0 / length) + float( ran.distance ) / length

## @brief Comparison function for sorting ranges by their distance / length
# ratio, so that long ranges with a small distance come first.
#
# @param x,y Objects with a text() method and a 'distance' attribute.
# @return 1 if x scores higher (worse) than y, -1 if lower, 0 if equal.
def rangeDistanceLengthRatioComparison(x, y):
    xl = _rangeDistanceLengthRatio( x )
    yl = _rangeDistanceLengthRatio( y )
    if xl > yl:
        return 1
    if xl < yl:
        return -1
    return 0
|
||||
|
||||
## @brief This class represents a group of characters in a document.
|
||||
class Range:
|
||||
def __init__(self):
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
# encoding: iso-8859-1
|
||||
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||
# albert@nan-tic.com
|
||||
#
|
||||
|
@ -17,6 +18,7 @@
|
|||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
from PyQt4.QtCore import *
|
||||
import re
|
||||
|
||||
def textToFloat( value ):
|
||||
if ',' in value and '.' in value:
|
||||
|
@ -30,6 +32,11 @@ def textToFloat( value ):
|
|||
newValue = value.replace( ',', '.' )
|
||||
else:
|
||||
newValue = value
|
||||
# Remove spaces
|
||||
newValue = newValue.replace( ' ', '' )
|
||||
# Remove possible coin symbol in the end
|
||||
if not newValue[-1] in '0123456789':
|
||||
newValue = newValue[:-1]
|
||||
return float( newValue )
|
||||
|
||||
def isFloat( value ):
|
||||
|
@ -44,14 +51,51 @@ def isDate( value ):
|
|||
return date.isValid()
|
||||
|
||||
## @brief Converts a text into a QDate.
#
# Spaces are removed, month names are replaced by their number and any
# remaining letter is dropped (for cases such as '25 de juny de 2009')
# before trying the known date formats.
#
# @param value Text to convert.
# @return The QDate for the first format that matches, or an invalid QDate()
#         if none does.
def textToDate( value ):
    value = value.replace( ' ', '' )
    # Lower case the text before looking for month names so that 'Sep.' or
    # 'SEP' are recognized too. Nothing is lost: all letters are removed
    # right after.
    value = textMonthToNumber( value.lower() )
    value = re.sub( r'[a-zA-Z]', '', value )
    # In QDate formats the month is 'MM'. 'mm' is not a date expression and
    # would be taken as literal text, so the dotted formats need 'MM' too.
    patterns = [
        'dd/MM/yyyy', 'dd-MM-yyyy', 'dd/MM/yy', 'dd-MM-yy',
        'd/MM/yyyy', 'd-MM-yyyy', 'd/MM/yy', 'd-MM-yy',
        'dd.MM.yyyy', 'dd.MM.yy', 'd.MM.yyyy', 'd.MM.yy']
    for pattern in patterns:
        date = QDate.fromString( value, pattern )
        if date.isValid():
            # If only two digits where used to specify year
            # it probably meant 200x or 20xx not 190x or 19xx
            # (which is what QDate interprets).
            if date.year() < 1930 and 'yyyy' not in pattern:
                date = date.addYears( 100 )
            return date
    return QDate()
|
||||
|
||||
# Month names and abbreviations (catalan, spanish and english) and the number
# of the month they stand for.
_monthNumbers = dict( [
    ('gen', '01'), ('gener', '01'), ('enero', '01'), ('january', '01'),
    ('feb', '02'), ('febrer', '02'), ('febrero', '02'), ('february', '02'),
    ('mar', '03'), ('marc', '03'), ('marzo', '03'), ('march', '03'),
    ('abr', '04'), ('apr', '04'), ('abril', '04'), ('april', '04'),
    ('mai', '05'), ('may', '05'), ('maig', '05'), ('mayo', '05'),
    ('jun', '06'), ('juny', '06'), ('junio', '06'), ('june', '06'),
    ('jul', '07'), ('juliol', '07'), ('julio', '07'), ('july', '07'),
    ('ago', '08'), ('agost', '08'), ('agosto', '08'), ('august', '08'),
    ('set', '09'), ('sep', '09'), ('setembre', '09'), ('septiembre', '09'), ('september', '09'),
    ('oct', '10'), ('octubre', '10'), ('october', '10'),
    ('nov', '11'), ('novembre', '11'), ('noviembre', '11'), ('november', '11'),
    ('des', '12'), ('dec', '12'), ('desembre', '12'), ('diciembre', '12'), ('december', '12'),
] )

# Alternatives are tried from left to right, so longer names go first:
# otherwise 'dec' would match inside 'december' and leave 'ember' behind.
# The optional dot handles abbreviations written as 'dec.'.
_monthExpression = re.compile(
    u'(%s)\\.?' % u'|'.join( sorted( _monthNumbers, key=lambda name: ( -len( name ), name ) ) ),
    re.IGNORECASE
)

## @brief Replaces the month names found in the text by '/NN/', where NN is
# the two digit number of the month.
#
# Names are matched regardless of their case and an abbreviation dot
# following the name is removed too. The rest of the text is left untouched.
#
# @param value Text that may contain month names.
# @return The text with the month names replaced.
def textMonthToNumber( value ):
    def replacement( match ):
        return u'/%s/' % _monthNumbers[ match.group( 1 ).lower() ]
    return _monthExpression.sub( replacement, value )
|
||||
|
||||
def isMostlyNumeric( text ):
|
||||
text = text.replace(' ','')
|
||||
numbers = 0
|
||||
for x in text:
|
||||
if x in '0123456789':
|
||||
|
@ -61,3 +105,54 @@ def isMostlyNumeric( text ):
|
|||
else:
|
||||
return False
|
||||
|
||||
## @brief Returns the text with all space characters removed.
def textToMostlyNumeric( text ):
    return ''.join( text.split( ' ' ) )
|
||||
|
||||
## @brief Returns True if textToVat() recognizes the text as a VAT code and
# False otherwise.
def isVat( text ):
    return bool( textToVat( text ) )
|
||||
|
||||
## @brief Returns the VAT code contained in the text, or '' if the text is
# not a VAT code.
#
# Spaces are removed and the text is upper cased before checking it. A code
# is accepted if it consists of a letter followed by eight digits or of
# eight digits followed by a letter.
def textToVat( text ):
    candidate = text.replace( ' ', '' ).upper()
    patterns = ( '^[A-Z][0-9]{8}$', '^[0-9]{8}[A-Z]$' )
    if any( re.search( pattern, candidate ) for pattern in patterns ):
        return candidate
    return ''
|
||||
|
||||
## @brief Returns True if textToPageNumber() finds a page number in the text
# and False otherwise.
def isPageNumber( text ):
    return bool( textToPageNumber( text ) )
|
||||
|
||||
## @brief Extracts the page number from texts such as '1 / 3' or '2 de 5'.
#
# Each run of consecutive digits is a number. The first one is the current
# page and the second one, if present, the total number of pages. Both are
# converted with textToFloat().
#
# @return The tuple (current, total), where total is None if the text has
#         only one number. None if there is no number or the current page
#         evaluates to false (zero).
def textToPageNumber( text ):
    numbers = re.findall( u'[0-9]+', text )
    current = None
    total = None
    if numbers:
        current = textToFloat( numbers[0] )
        if len( numbers ) > 1:
            total = textToFloat( numbers[1] )
    if not current:
        return None
    return (current, total)
|
||||
|
||||
|
|
Loading…
Reference in New Issue