mirror of https://github.com/NaN-tic/nanscan.git
Several improvements in invoice recognition:
Added new types, improved date recognition, improved performance.
This commit is contained in:
parent
4119ec747c
commit
41224310da
|
@ -58,15 +58,23 @@ class Block:
|
||||||
self.document = None
|
self.document = None
|
||||||
self._boxes = []
|
self._boxes = []
|
||||||
self.outerDistance = 2.5
|
self.outerDistance = 2.5
|
||||||
|
self._rect = None
|
||||||
|
self._outerRect = None
|
||||||
|
|
||||||
## @brief Replaces the list of boxes held by the block.
#
# The given list is stored as-is (not copied) and the cached rectangles
# are discarded so they get recalculated on next access.
def setBoxes(self, boxes):
    self._boxes = boxes
    self.invalidateCache()
|
||||||
|
|
||||||
## @brief Returns the list of boxes held by the block.
#
# The internal list itself is returned, not a copy.
def boxes(self):
    return self._boxes
|
||||||
|
|
||||||
## @brief Appends a box to the block and discards the cached rectangles.
def addBox(self, box):
    self._boxes.append( box )
    self.invalidateCache()
|
||||||
|
|
||||||
|
## @brief Removes a box from the block and discards the cached rectangles.
#
# Uses list.remove(), so a ValueError is raised if the box is not in the block.
def removeBox(self, box):
    self._boxes.remove( box )
    self.invalidateCache()
|
||||||
|
|
||||||
## @brief Returns the number of boxes currently held by the block.
def count(self):
    return len( self._boxes )
|
||||||
|
@ -75,20 +83,30 @@ class Block:
|
||||||
## @brief Returns the text of the block; simply forwards to formatedText().
def text(self):
    return self.formatedText()
|
||||||
|
|
||||||
|
## @brief Discards the cached values of rect() and outerRect().
#
# Called whenever the list of boxes changes so both rectangles are
# recalculated the next time they are requested.
def invalidateCache(self):
    self._rect = self._outerRect = None
|
||||||
|
|
||||||
## @brief Returns the bounding rectangle of the text in the range
#
# The rectangle is the union of the 'box' of every element in the block and
# is cached until invalidateCache() is called.
#
# A copy of the cached rectangle is returned: QRectF objects are mutable, so
# handing out the cached instance would let callers (outerRect() used to do
# exactly that with translate()/setWidth()) silently corrupt the cache.
def rect(self):
    # Compare against None explicitly so the cache is also used when the
    # calculated rectangle happens to evaluate as false (e.g. no boxes).
    if self._rect is None:
        bounds = QRectF()
        for c in self._boxes:
            bounds = bounds.united( c.box )
        self._rect = bounds
    return QRectF( self._rect )
|
||||||
|
|
||||||
## @brief Returns a bounding rectangle of the text in the block that is 'outerDistance'
# larger in all sides.
#
# The result is cached until invalidateCache() is called. The returned object
# is the cached instance, so callers must not modify it in place.
def outerRect(self):
    if self._outerRect is not None:
        return self._outerRect
    distance = self.outerDistance
    # adjusted() returns a NEW rectangle. The previous implementation called
    # translate()/setWidth()/setHeight() on the object returned by rect(),
    # which is the cached inner rectangle, so after the first call rect()
    # itself started returning the enlarged rectangle.
    self._outerRect = self.rect().adjusted( -distance, -distance, distance, distance )
    return self._outerRect
|
||||||
|
|
||||||
## @brief Returns a list with all possible ranges of size length of the
|
## @brief Returns a list with all possible ranges of size length of the
|
||||||
|
@ -117,7 +135,7 @@ class Block:
|
||||||
block.document = lines
|
block.document = lines
|
||||||
blocks.append( block )
|
blocks.append( block )
|
||||||
for char in line:
|
for char in line:
|
||||||
if char.character != u' ':
|
if char.character != u' ' or block.count() == 0:
|
||||||
block.addBox( char )
|
block.addBox( char )
|
||||||
else:
|
else:
|
||||||
avgWidth = block.rect().width() / block.count()
|
avgWidth = block.rect().width() / block.count()
|
||||||
|
@ -234,7 +252,6 @@ class Block:
|
||||||
words.append( line[spacesToAdd[-1]:count] )
|
words.append( line[spacesToAdd[-1]:count] )
|
||||||
spacesToAdd.append( count )
|
spacesToAdd.append( count )
|
||||||
|
|
||||||
# c.character is already a unicode string
|
|
||||||
left = c.box.right()
|
left = c.box.right()
|
||||||
width += c.box.width()
|
width += c.box.width()
|
||||||
count += 1
|
count += 1
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
# encoding: iso-8859-1
|
||||||
# Copyright (C) 2009 by Albert Cervera i Areny
|
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||||
# albert@nan-tic.com
|
# albert@nan-tic.com
|
||||||
#
|
#
|
||||||
|
@ -22,77 +23,130 @@ from NanScan.Range import *
|
||||||
from NanScan.Block import *
|
from NanScan.Block import *
|
||||||
from NanScan.TextPatterns import *
|
from NanScan.TextPatterns import *
|
||||||
|
|
||||||
|
## @brief Fallback search for a date anywhere in the document.
#
# Walks the ranges of length 10 returned by Range.extractAllRangesFromDocument()
# for recognizer.textLines and returns the result of textToDate() for the
# first range whose text is accepted by isDate(). Returns None if no range
# qualifies.
def findDate( recognizer ):
    for ran in Range.extractAllRangesFromDocument( recognizer.textLines, 10 ):
        # Build the range text only once per candidate.
        text = ran.text()
        if isDate( text ):
            return textToDate( text )
    return None
|
||||||
|
|
||||||
|
## @brief Fallback search for a VAT number anywhere in the document.
#
# Walks the ranges of length 10 returned by Range.extractAllRangesFromDocument()
# for recognizer.textLines and returns the result of textToVat() for the
# first range whose text is accepted by isVat(). Returns None if no range
# qualifies.
def findVat( recognizer ):
    for ran in Range.extractAllRangesFromDocument( recognizer.textLines, 10 ):
        # Build the range text only once per candidate.
        text = ran.text()
        if isVat( text ):
            return textToVat( text )
    return None
|
||||||
|
|
||||||
class InvoiceRecognizer:
|
class InvoiceRecognizer:
|
||||||
# Description of every value searched for in an invoice. For each entry:
#   'tag':      texts that may label the value in the document.
#   'type':     kind of value expected next to (or below) the label; it
#               selects the isXxx()/textToXxx() pair used to validate it.
#   'fallback': optional function called with the recognizer when no valid
#               value could be found next to a label.
#
# NOTE: every string in the 'tag' lists must be followed by a comma. Two
# adjacent literals without a comma are silently concatenated by Python,
# which previously turned 'fecha de factura' + 'fecha factura' and
# 'data factura' + 'fecha' into two meaningless tags.
Tags = {
    'number': {
        'tag': [
            u'numero factura',
            u'factura numero',
            u'num. de factura',
            u'factura num.',
            u'nº factura',
            u'factura núm.',
            u'factura',
            u'número de factura'
        ],
        'type': 'mostly-numeric'
    },
    'date': {
        'tag': [
            u'fecha de factura',
            u'fecha factura',
            u'fecha emision',
            u'data factura',
            u'fecha',
            u'data:',
            u'data',
        ],
        'type': 'date',
        'fallback': findDate,
    },
    'base': {
        'tag': [
            u'base imponible',
            u'base imposable',
            u'total (base imposable)'
        ],
        'type': 'numeric'
    },
    'taxes': {
        'tag': [
            u'IVA',
        ],
        'type': 'numeric'
    },
    'total': {
        'tag': [
            u'total',
            u'total factura',
            u'total a pagar (euros)'
        ],
        'type': 'numeric'
    },
    'vat': {
        'tag': [
            u'nif',
            u'cif',
            u'nif/cif',
            u'nif:',
            u'cif:',
            u'nif/cif:',
            u'nif :',
            u'cif :',
            u'nif/cif :',
        ],
        'type': 'vat',
        'fallback': findVat,
    },
    'pagina': {
        'tag': [
            u'pagina',
            u'página',
            u'pàgina',
            u'pag.',
            u'pàg.',
            u'pág.'
        ],
        'type': 'page-number'
    }
}
|
||||||
|
|
||||||
## @brief Looks for every tag declared in InvoiceRecognizer.Tags and returns
# a textual report with one 'Tag: <name>, Value: <value>' line per tag.
#
# The text lines are taken from the 'text' analyzer of the given recognizer
# and are kept in self.textLines, which the search functions read.
def recognize(self, recognizer):
    textAnalyzer = recognizer.analyzers['text']
    self.textLines = textAnalyzer.block.textLinesWithSpaces()

    report = [
        'Tag: %s, Value: %s\n' % ( name, self.findTagValue( name ) )
        for name in InvoiceRecognizer.Tags
    ]

    # NOTE: an experimental pass used to follow here. It extracted all blocks
    # with Block.extractAllBlocksFromDocument( self.textLines ), kept those
    # intersecting the first third of an A4 page (QRectF( 0, 0, 210, 99 ))
    # and narrower than 120, in order to guess which block holds the customer
    # information. It is currently disabled.

    return ''.join( report )
|
||||||
|
|
||||||
|
@ -109,34 +163,42 @@ class InvoiceRecognizer:
|
||||||
value = Levenshtein.levenshtein( text, textToFind )
|
value = Levenshtein.levenshtein( text, textToFind )
|
||||||
ran.distance = value
|
ran.distance = value
|
||||||
ranges.sort( rangeDistanceComparison )
|
ranges.sort( rangeDistanceComparison )
|
||||||
if ranges:
|
return ranges
|
||||||
return ranges[0]
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
## @brief Returns the value found in the document for the given tag, or None.
#
# All ranges returned by findText() for every label of the tag are gathered
# and sorted with rangeDistanceLengthRatioComparison. The five best candidates
# are then tried in order, and the first one for which findTagValueFromRange()
# returns a true value wins.
def findTagValue(self, tag):
    candidates = []
    for label in InvoiceRecognizer.Tags[tag]['tag']:
        candidates.extend( self.findText( label ) )
    candidates.sort( rangeDistanceLengthRatioComparison )

    # Debug output: the 20 best candidates for the tag.
    print( "RANGES FOR TAG: %s\n%s" % ( tag, [ran.text().encode('ascii','replace') for ran in candidates[:20]] ) )

    for candidate in candidates[:5]:
        print( "RANGE FOR TAG %s: %s" % ( tag, candidate.text().encode('ascii','ignore') ) )
        value = self.findTagValueFromRange( tag, candidate )
        if value:
            return value
    return None
|
||||||
|
|
||||||
|
def findTagValueFromRange(self, tag, ran):
|
||||||
|
|
||||||
# Extract text on the right
|
# Extract text on the right
|
||||||
line = self.formatedLine( self.textLines[ ran.line ] )
|
#line = self.formatedLine( self.textLines[ ran.line ] )
|
||||||
rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
|
#rightValue = line[ran.pos+ran.length+1:].strip().split(' ')[0]
|
||||||
print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
|
line = self.textLines[ ran.line ]
|
||||||
print "rightValue: ", rightValue.encode('ascii','ignore')
|
line = line[ran.pos+ran.length+1:]
|
||||||
print "SAME LINE: ", line.encode('ascii','ignore')
|
rightValue = Block.extractAllBlocksFromDocument( [ line ] )[0].text()
|
||||||
|
|
||||||
|
|
||||||
|
#print "R: ", line[ran.pos+ran.length+1:].strip().encode('ascii','ignore')
|
||||||
|
print "rightValue: ", rightValue.encode('ascii','replace')
|
||||||
|
#print "SAME LINE: ", line.encode('ascii','ignore')
|
||||||
|
|
||||||
# Extract text on the bottom
|
# Extract text on the bottom
|
||||||
if ran.line < len(self.textLines)-1:
|
if ran.line < len(self.textLines)-1:
|
||||||
|
@ -150,30 +212,40 @@ class InvoiceRecognizer:
|
||||||
bottomValue += c.character
|
bottomValue += c.character
|
||||||
else:
|
else:
|
||||||
bottomValue = u''
|
bottomValue = u''
|
||||||
|
print "bottomValue: ", bottomValue.encode('ascii','replace')
|
||||||
|
|
||||||
# Decide which of both values match the given tag type
|
# Decide which of both values match the given tag type
|
||||||
type = InvoiceRecognizer.Tags[ tag ][ 'type' ]
|
type = InvoiceRecognizer.Tags[ tag ][ 'type' ]
|
||||||
|
value = None
|
||||||
if type == 'numeric':
|
if type == 'numeric':
|
||||||
if isFloat( rightValue ):
|
if isFloat( rightValue ):
|
||||||
return textToFloat( rightValue )
|
value = textToFloat( rightValue )
|
||||||
elif isFloat( bottomValue ):
|
elif isFloat( bottomValue ):
|
||||||
return textToFloat( bottomValue )
|
value = textToFloat( bottomValue )
|
||||||
else:
|
|
||||||
return None
|
|
||||||
elif type == 'date':
|
elif type == 'date':
|
||||||
if isDate( rightValue ):
|
if isDate( rightValue ):
|
||||||
return textToDate( rightValue )
|
value = textToDate( rightValue )
|
||||||
elif isDate( bottomValue ):
|
elif isDate( bottomValue ):
|
||||||
return textToDate( bottomValue )
|
value = textToDate( bottomValue )
|
||||||
else:
|
|
||||||
return None
|
|
||||||
elif type == 'mostly-numeric':
|
elif type == 'mostly-numeric':
|
||||||
if isMostlyNumeric( rightValue ):
|
if isMostlyNumeric( rightValue ):
|
||||||
return rightValue
|
value = textToMostlyNumeric( rightValue )
|
||||||
elif isMostlyNumeric( bottomValue ):
|
elif isMostlyNumeric( bottomValue ):
|
||||||
return bottomValue
|
value = textToMostlyNumeric( bottomValue )
|
||||||
|
elif type == 'vat':
|
||||||
|
if isVat( rightValue ):
|
||||||
|
value = textToVat( rightValue )
|
||||||
|
elif isVat( bottomValue ):
|
||||||
|
value = textToVat( bottomValue )
|
||||||
|
elif type == 'page-number':
|
||||||
|
if isPageNumber( rightValue ):
|
||||||
|
value = textToPageNumber( rightValue )
|
||||||
|
elif isPageNumber( bottomValue ):
|
||||||
|
value = textToPageNumber( bottomValue )
|
||||||
else:
|
else:
|
||||||
return rightValue
|
value = rightValue
|
||||||
else:
|
|
||||||
return rightValue
|
if not value and 'fallback' in InvoiceRecognizer.Tags[ tag ]:
|
||||||
|
value = InvoiceRecognizer.Tags[ tag ]['fallback']( self )
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
|
@ -37,6 +37,24 @@ def rangeLengthComparison(x, y):
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
## @brief Comparison function (cmp style) that orders ranges by the ratio
# between their distance and the length of their text.
#
# The score of a range is (1.0 / length) + distance / length, so among ranges
# with the same distance longer texts come first. Ranges with empty text get
# a fixed score of 999. Returns 1, -1 or 0.
def rangeDistanceLengthRatioComparison(x, y):
    def score( ran ):
        length = len( ran.text() )
        if not length:
            return 999
        return (1.0/length) + float( ran.distance ) / length

    left = score( x )
    right = score( y )
    if left > right:
        return 1
    if left < right:
        return -1
    return 0
|
||||||
|
|
||||||
## @brief This class represents a group of characters in a document.
|
## @brief This class represents a group of characters in a document.
|
||||||
class Range:
|
class Range:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
# encoding: iso-8859-1
|
||||||
# Copyright (C) 2009 by Albert Cervera i Areny
|
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||||
# albert@nan-tic.com
|
# albert@nan-tic.com
|
||||||
#
|
#
|
||||||
|
@ -17,6 +18,7 @@
|
||||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
from PyQt4.QtCore import *
|
from PyQt4.QtCore import *
|
||||||
|
import re
|
||||||
|
|
||||||
def textToFloat( value ):
|
def textToFloat( value ):
|
||||||
if ',' in value and '.' in value:
|
if ',' in value and '.' in value:
|
||||||
|
@ -30,6 +32,11 @@ def textToFloat( value ):
|
||||||
newValue = value.replace( ',', '.' )
|
newValue = value.replace( ',', '.' )
|
||||||
else:
|
else:
|
||||||
newValue = value
|
newValue = value
|
||||||
|
# Remove spaces
|
||||||
|
newValue = newValue.replace( ' ', '' )
|
||||||
|
# Remove possible coin symbol in the end
|
||||||
|
if not newValue[-1] in '0123456789':
|
||||||
|
newValue = newValue[:-1]
|
||||||
return float( newValue )
|
return float( newValue )
|
||||||
|
|
||||||
def isFloat( value ):
|
def isFloat( value ):
|
||||||
|
@ -44,14 +51,51 @@ def isDate( value ):
|
||||||
return date.isValid()
|
return date.isValid()
|
||||||
|
|
||||||
## @brief Converts the given text into a QDate.
#
# Spaces are removed, month names are replaced by their number (see
# textMonthToNumber()) and any remaining ASCII letter is dropped before
# trying the known numeric patterns. Returns an invalid QDate() if the text
# does not match any of them.
def textToDate( value ):
    # Lower case the text so capitalized months ('1 Sep. 2009') are found by
    # textMonthToNumber(). This is harmless as all letters are removed below.
    value = value.replace(' ','').lower()
    value = textMonthToNumber( value )
    # Remove texts for cases such as '25 de juny de 2009'
    value = re.sub( r'[a-zA-Z]', '', value )
    # Note that the month is 'MM' in QDate formats. 'mm' is not a date
    # specifier there, so the dotted patterns previously written as
    # 'dd.mm.yyyy' could never match a real date.
    patterns = [
        'dd/MM/yyyy', 'dd-MM-yyyy', 'dd/MM/yy', 'dd-MM-yy',
        'd/MM/yyyy', 'd-MM-yyyy', 'd/MM/yy', 'd-MM-yy',
        'dd.MM.yyyy', 'dd.MM.yy', 'd.MM.yyyy', 'd.MM.yy']
    for pattern in patterns:
        date = QDate.fromString( value, pattern )
        if date.isValid():
            # If only two digits were used to specify the year
            # it probably meant 200x or 20xx not 190x or 19xx
            # (which is what QDate interprets).
            if date.year() < 1930 and 'yyyy' not in pattern:
                date = date.addYears( 100 )
            return date
    return QDate()
|
||||||
|
|
||||||
|
## @brief Replaces month names (Catalan, Spanish and English, full or
# abbreviated) found in the given text by '/NN/', where NN is the two digit
# month number.
#
# An optional dot right after the name ('dec.') is consumed too. Matching is
# case insensitive so 'Sep', 'SEP' and 'sep' are all recognized. Text that
# is not a month name is returned untouched.
def textMonthToNumber( value ):
    months = [
        ('gen', '01'), ('gener', '01'), ('enero', '01'), ('january', '01'),
        ('feb', '02'), ('febrer', '02'), ('febrero', '02'), ('february', '02'),
        ('mar', '03'), ('marc', '03'), ('marzo', '03'), ('march', '03'),
        ('abr', '04'), ('apr', '04'), ('abril', '04'), ('april', '04'),
        ('mai', '05'), ('may', '05'), ('maig', '05'), ('mayo', '05'),
        ('jun', '06'), ('jul', '07'), ('juny', '06'), ('junio', '06'), ('june', '06'),
        # 'agost' is August (08); it used to be wrongly mapped to 09.
        ('ago', '08'), ('agost', '08'), ('agosto', '08'), ('august', '08'),
        ('set', '09'), ('sep', '09'), ('setembre', '09'), ('september', '09'),
        ('oct', '10'), ('octubre', '10'), ('october', '10'),
        ('nov', '11'), ('novembre', '11'), ('noviembre', '11'),
        ('des', '12'), ('dec', '12'), ('desembre', '12'), ('diciembre', '12'), ('december', '12')
    ]
    # Longer names must be replaced first so 'agosto' is not consumed as
    # 'ago' + 'sto'. Names of the same length keep the reverse alphabetical
    # order used before ('sep' must go before 'des' for 'de septiembre').
    months.sort( key=lambda m: ( len( m[0] ), m[0] ), reverse=True )
    v = value
    for name, number in months:
        # Names are replaced one at a time (not with a single alternation)
        # because a longer name anywhere in the text must win over a shorter
        # one that starts earlier.
        expression = re.compile( re.escape( name ) + r'\.?', re.IGNORECASE )
        v = expression.sub( u'/%s/' % number, v )
    return v
|
||||||
|
|
||||||
def isMostlyNumeric( text ):
|
def isMostlyNumeric( text ):
|
||||||
|
text = text.replace(' ','')
|
||||||
numbers = 0
|
numbers = 0
|
||||||
for x in text:
|
for x in text:
|
||||||
if x in '0123456789':
|
if x in '0123456789':
|
||||||
|
@ -61,3 +105,54 @@ def isMostlyNumeric( text ):
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
## @brief Returns the given text with all spaces removed.
def textToMostlyNumeric( text ):
    return text.replace( ' ', '' )
|
||||||
|
|
||||||
|
## @brief Returns True if textToVat() is able to extract a VAT number from
# the given text, False otherwise.
def isVat( text ):
    return bool( textToVat( text ) )
|
||||||
|
|
||||||
|
## @brief Returns the given text normalized as a VAT number, or an empty
# string if it does not look like one.
#
# Normalization removes spaces and upper cases the text. Accepted shapes are
# a letter followed by eight digits, or eight digits followed by a letter.
def textToVat( text ):
    candidate = text.replace( ' ', '' ).upper()
    for expression in ( '^[A-Z][0-9]{8}$', '^[0-9]{8}[A-Z]$' ):
        if re.search( expression, candidate ):
            return candidate
    return ''
|
||||||
|
|
||||||
|
## @brief Returns True if textToPageNumber() is able to extract a page
# number from the given text, False otherwise.
def isPageNumber( text ):
    return bool( textToPageNumber( text ) )
|
||||||
|
|
||||||
|
## @brief Extracts page information from the given text.
#
# The first run of consecutive digits is taken as the current page and the
# second one (if any) as the total number of pages; both are converted with
# textToFloat(). Returns a (current, total) tuple, where total may be None,
# or None if there is no current page (no digits found, or a value of zero).
def textToPageNumber( text ):
    digitRuns = re.findall( r'[0-9]+', text )
    current = None
    total = None
    if len( digitRuns ) > 0:
        current = textToFloat( digitRuns[0] )
    if len( digitRuns ) > 1:
        total = textToFloat( digitRuns[1] )
    if not current:
        return None
    return (current, total)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue