Improved Block class implementation. Make Ocr use it internally.

First steps towards Block finding in documents.
This commit is contained in:
Albert Cervera i Areny 2009-03-16 23:18:56 +01:00
parent 4cafa912ce
commit 02241981b6
3 changed files with 265 additions and 178 deletions

View File

@ -19,6 +19,14 @@
from PyQt4.QtCore import *
## @brief cmp-style comparison of two characters by the x coordinate of
# their boxes. Returns 1, -1 or 0.
def boxComparison(x, y):
    first, second = x.box.x(), y.box.x()
    return (first > second) - (first < second)
def blockDistanceComparison(x, y):
if x.distance > y.distance:
return 1
@ -39,23 +47,38 @@ def blockSizeComparison(x, y):
else:
return 0
## @brief A single recognized character and the rectangle it occupies.
class Character:
    def __init__(self):
        # Both fields start unset; whoever creates the instance fills them in.
        self.character = self.box = None
## @brief This class represents a group of characters in a document.
class Block:
## @brief Creates an empty block.
def __init__(self):
    # Document the block belongs to; set by whoever creates the block.
    self.document = None
    # Characters of the block. Kept private: a public 'boxes' attribute
    # would shadow the boxes() accessor.
    self._boxes = []
    # Margin added around rect() by outerRect().
    self.outerDistance = 2.5
## @brief Replaces the characters of the block with the given list.
# The list is stored as is, not copied.
def setBoxes(self, boxes):
    self._boxes = boxes
## @brief Returns the list of characters of the block (the internal list,
# not a copy).
def boxes(self):
    return self._boxes
## @brief Appends a character to the block.
def addBox(self, box):
    self._boxes.append( box )
## @brief Returns the number of characters in the block.
def count(self):
    return len( self._boxes )
## @brief Returns a unicode string with the text of the block.
# It is the same as calling formatedText() without a region.
def text(self):
    return self.formatedText()
## @brief Returns the bounding rectangle of all characters in the block.
# An empty block yields a default constructed QRectF.
def rect(self):
    bounds = QRectF()
    for c in self._boxes:
        bounds = bounds.united( c.box )
    return bounds
@ -64,29 +87,214 @@ class Block:
## @brief Returns rect() grown by outerDistance on each of its four sides.
def outerRect(self):
    rect = self.rect()
    # Move the top left corner outwards, then grow each dimension once by
    # twice the margin so that the right and bottom edges move outwards too.
    rect.translate( - self.outerDistance, - self.outerDistance )
    rect.setWidth( rect.width() + self.outerDistance * 2.0 )
    rect.setHeight( rect.height() + self.outerDistance * 2.0 )
    return rect
## @brief Splits a document into blocks of neighbouring characters.
#
# 'lines' is a list of lines, each line being a list of Character objects
# with spaces included (such as the result of textLinesWithSpaces()).
# 'distance' is accepted but currently not used.
#
# The algorithm has two steps:
# 1- Each line is cut into blocks: a space wider than 1.5 times the average
#    character width of the block being built starts a new block. Narrower
#    spaces are kept as part of the block.
# 2- Blocks are merged as long as the outer rectangle of one of them
#    intersects the rectangle of another one.
#
# Returns the list of resulting Block objects.
@staticmethod
def extractAllBlocksFromDocument(lines, distance=0):
    def newBlock():
        block = Block()
        block.document = lines
        return block

    # Find initial blocks.
    blocks = []
    for line in lines:
        block = newBlock()
        blocks.append( block )
        for char in line:
            if char.character != u' ':
                block.addBox( char )
                continue
            if not block.count():
                # Space at the start of a line or right after a block
                # separator: there is no average width to compare with
                # (it would be a division by zero) and nothing to separate.
                continue
            avgWidth = block.rect().width() / block.count()
            if char.box.width() > avgWidth * 1.5:
                # Wide space: it separates blocks and belongs to none.
                block = newBlock()
                blocks.append( block )
            else:
                block.addBox( char )

    # Lines without characters leave empty blocks behind. They have no
    # text to offer, so don't let them take part in merging nor return them.
    blocks = [x for x in blocks if x.count()]

    # Find out if existing blocks should be merged. The scan is restarted
    # after every merge because the list has just been modified.
    merged = True
    while merged:
        merged = False
        for block1 in blocks:
            for block2 in blocks:
                if block1 is block2:
                    continue
                if block1.outerRect().intersects( block2.rect() ):
                    block1.merge( block2 )
                    blocks.remove( block2 )
                    merged = True
                    break
            if merged:
                break
    return blocks
## @brief Appends all characters of the given block to this one.
# The given block is left untouched.
def merge(self, block):
    # Extend from a snapshot so that merging a block with itself ends
    # instead of chasing a list that grows while it is being iterated.
    self._boxes.extend( list( block._boxes ) )
## @brief Obtains the top most box (smallest y) of the given list.
# Returns None if the list is empty. On ties the first one wins.
def topMostBox(self, boxes):
    candidates = list( boxes )
    if not candidates:
        return None
    return min( candidates, key=lambda item: item.box.y() )
## @brief Obtain text lines in a list of lines where each line is a list
# of ordered characters.
# Note that no spaces are added in this function and each character is a
# Character class instance.
# If 'region' is given, only boxes intersecting it are considered.
# The algorithm used is pretty simple:
# 1- Put all boxes in a list of pending boxes
# 2- Search top most box, remove from pending and add in a new line
# 3- Search all boxes that vertically overlap with that top most box, remove
#    from pending and add in the current line
# 4- Sort the characters of the line by the x coordinate (left to right).
# 5- Go to number 2 until all boxes have been processed.
def textLines(self, region=None):
    # Compare with None explicitly. 'if region:' is equivalent to
    # 'if region.isValid():' so rects with top (or left) >= bottom
    # (or right) would return _all_ boxes instead of _none_.
    if region is not None:
        pending = [x for x in self._boxes if region.intersects( x.box )]
    else:
        # Copy as the pending list is consumed below
        pending = self._boxes[:]

    lines = []
    while pending:
        first = self.topMostBox( pending )
        pending.remove( first )
        line = [first]
        remaining = []
        for x in pending:
            below = x.box.top() > first.box.bottom()
            above = x.box.bottom() < first.box.top()
            if below or above:
                remaining.append( x )
            else:
                line.append( x )
        pending = remaining
        # Key based sort: same order as comparing x coordinates pairwise,
        # without requiring a cmp-style function.
        line.sort( key=lambda c: c.box.x() )
        lines.append( line )
    return lines
## @brief This function adds spaces between words of a single line of boxes.
#
# 'line' is a list of Character objects ordered from left to right. It is
# modified in place (space Characters are inserted) and nothing is returned.
#
# A space is added when the separation between the previous and the current
# character is greater than 'spaceRatio' times the average width of the
# characters seen so far in the line. Values that have been used:
#   WITH TESSERACT: 0.333
#   WITH CUNEIFORM: 0.4 (default)
#
# TODO: We've got some problems with fixed size fonts. In some cases the 'I'
# letter will have the width of a pipe but the distance between characters
# will be fixed. In these cases it's very probable the algorithm will add
# incorrect spaces before and/or after the 'I' letter. Measuring distances
# between the centers of the letters of each word might help to find out
# whether the font is fixed sized.
def textLineWithSpaces(self, line, spaceRatio=0.4):
    width = 0.0
    left = None
    spacesToAdd = []
    for count, c in enumerate( line ):
        # Compare with None: a previous character whose right edge is at
        # x == 0 is still a previous character.
        if left is not None:
            if c.box.left() - left > ( width / count ) * spaceRatio:
                spacesToAdd.append( count )
        left = c.box.right()
        width += c.box.width()

    # Reverse so indexes are still valid after insertions
    for idx in reversed( spacesToAdd ):
        c = Character()
        # c.character must be a unicode string like the rest of characters
        c.character = u' '
        # The space fills the gap between its two neighbours and takes
        # the vertical extent of the previous character.
        c.box = QRectF()
        c.box.setTop( line[idx - 1].box.top() )
        c.box.setBottom( line[idx - 1].box.bottom() )
        c.box.setLeft( line[idx - 1].box.right() )
        c.box.setRight( line[idx].box.left() )
        line.insert( idx, c )
## @brief Same as textLines() but with spaces added between words.
# The result is also a list of lines, each line being a list of Character
# objects.
#
# In order not to be distracted with character widths of letters like 'm' or
# 'i' (which are very wide and narrow), the width of the letters is averaged
# on a per line basis. This shows good results, by now, on text with the same
# char size in the line, which is quite usual.
def textLinesWithSpaces(self, region=None):
    # All lines with their characters already in their positions...
    spaced = self.textLines( region )
    # ...and now the spaces, one line at a time (each list is modified
    # in place).
    for row in spaced:
        self.textLineWithSpaces( row )
    return spaced
## @brief Returns the text in the given region as a unicode string, spaces
# included. Lines are separated by a new line character.
def formatedText(self, region=None):
    rows = self.textLinesWithSpaces( region )
    return u'\n'.join( [u''.join( [c.character for c in row] ) for row in rows] )

View File

@ -19,6 +19,7 @@
from NanScan.LevenshteinDistance import *
from NanScan.Range import *
from NanScan.Block import *
from NanScan.TextPatterns import *
class InvoiceRecognizer:
@ -64,10 +65,35 @@ class InvoiceRecognizer:
## @brief Searches the value of every tag in InvoiceRecognizer.Tags within
# the text found by the 'text' analyzer of the given recognizer.
#
# Returns a string with one 'Tag: ..., Value: ...' line per tag. As a side
# effect self.textLines is set and the blocks found in the document are
# printed for debugging purposes.
def recognize(self, recognizer):
    analyzer = recognizer.analyzers['text']
    self.textLines = analyzer.block.textLinesWithSpaces()
    result = ''
    for tag in InvoiceRecognizer.Tags:
        result += 'Tag: %s, Value: %s\n' % ( tag, self.findTagValue( tag ) )

    print( "========================================" )
    blocks = Block.extractAllBlocksFromDocument( self.textLines )
    for block in blocks:
        print( "---" )
        print( "BLOCK: %s" % block.text().encode('ascii','ignore') )
        print( "---" )
    print( "========================================" )

    # Try to find out which of the blocks contains customer information
    # This rect, picks up the first third of an A4 paper size.
    top = QRectF( 0, 0, 210, 99 )
    tops = [x for x in blocks if x.rect().intersects( top )]

    # Once we have all the blocks of the first third of the paper
    # try to guess which of them is the good one.
    # Remove those blocks too wide. Block has no width() of its own, the
    # width is the one of its bounding rectangle.
    # TODO: 'sized' is not used yet; the guess is work in progress.
    sized = [x for x in tops if x.rect().width() < 120]
    return result
def formatedLine(self, line):

View File

@ -26,23 +26,13 @@ import math
from TemporaryFile import *
from Analyzer import *
from Block import *
from gamera.core import *
from PyQt4.QtCore import *
from PyQt4.QtGui import *
## @brief A single recognized character and the rectangle it occupies.
class Character:
    def __init__(self):
        # Both fields start unset; whoever creates the instance fills them in.
        self.character = self.box = None
## @brief cmp-style comparison of two characters by the x coordinate of
# their boxes. Returns 1, -1 or 0.
def boxComparison(x, y):
    first, second = x.box.x(), y.box.x()
    return (first > second) - (first < second)
## @brief This class allows using an OCR and provides several convenient functions
# regarding text and image processing such as deskewing or obtaining formated text.
@ -133,12 +123,12 @@ class Ocr(Analyzer):
## @brief Returns the text of a given region of the image.
# It's the same as calling formatedText() on the block that holds the
# characters found by the OCR.
def textInRegion(self, region=None):
    return self.block.formatedText( region )
## @brief Returns the bounding rectangle of the text returned by textInRegion for
# the given region.
def featureRectInRegion(self, region=None):
lines = self.textLinesWithSpaces( region )
lines = self.block.textLinesWithSpaces( region )
rect = QRectF()
for line in lines:
for c in line:
@ -182,147 +172,10 @@ class Ocr(Analyzer):
txt = lower( self.cuneiform() )
self.boxes = self.parseCuneiformOutput(txt)
## @brief Obtains the top most box (smallest y) of the given list.
# Returns None if the list is empty. On ties the first one wins.
def topMostBox(self, boxes):
    candidates = list( boxes )
    if not candidates:
        return None
    return min( candidates, key=lambda item: item.box.y() )
## @brief Obtain text lines in a list of lines where each line is a list
# of ordered characters.
# Note that no spaces are added in this function and each character is a
# Character class instance.
# If 'region' is given, only boxes intersecting it are considered.
# The algorithm used is pretty simple:
# 1- Put all boxes in a list of pending boxes
# 2- Search top most box, remove from pending and add in a new line
# 3- Search all boxes that vertically overlap with that top most box, remove
#    from pending and add in the current line
# 4- Sort the characters of the line by the x coordinate (left to right).
# 5- Go to number 2 until all boxes have been processed.
def textLines(self, region=None):
    # Compare with None explicitly. 'if region:' is equivalent to
    # 'if region.isValid():' so rects with top (or left) >= bottom
    # (or right) would return _all_ boxes instead of _none_.
    if region is not None:
        pending = [x for x in self.boxes if region.intersects( x.box )]
    else:
        # Copy as the pending list is consumed below
        pending = self.boxes[:]

    lines = []
    while pending:
        first = self.topMostBox( pending )
        pending.remove( first )
        line = [first]
        remaining = []
        for x in pending:
            below = x.box.top() > first.box.bottom()
            above = x.box.bottom() < first.box.top()
            if below or above:
                remaining.append( x )
            else:
                line.append( x )
        pending = remaining
        # Key based sort: same order as comparing x coordinates pairwise,
        # without requiring a cmp-style function.
        line.sort( key=lambda c: c.box.x() )
        lines.append( line )
    return lines
## @brief This function adds spaces between words of a single line of boxes.
#
# 'line' is a list of Character objects ordered from left to right. It is
# modified in place (space Characters are inserted) and nothing is returned.
#
# If separation between previous and current char is greater than a third
# of the average width of the characters seen so far, a space is added.
#
# TODO: We've got some problems with fixed size fonts. In some cases the 'I'
# letter will have the width of a pipe but the distance between characters
# will be fixed. In these cases it's very probable the algorithm will add
# incorrect spaces before and/or after the 'I' letter. Measuring distances
# between the centers of the letters of each word might help to find out
# whether the font is fixed sized.
def textLineWithSpaces(self, line):
    width = 0.0
    left = None
    spacesToAdd = []
    for count, c in enumerate( line ):
        # Compare with None: a previous character whose right edge is at
        # x == 0 is still a previous character.
        if left is not None:
            if c.box.left() - left > ( width / count ) / 3:
                spacesToAdd.append( count )
        left = c.box.right()
        width += c.box.width()

    # Reverse so indexes are still valid after insertions
    for idx in reversed( spacesToAdd ):
        c = Character()
        # c.character must be a unicode string like the rest of characters
        c.character = u' '
        # The space fills the gap between its two neighbours and takes
        # the vertical extent of the previous character.
        c.box = QRectF()
        c.box.setTop( line[idx - 1].box.top() )
        c.box.setBottom( line[idx - 1].box.bottom() )
        c.box.setLeft( line[idx - 1].box.right() )
        c.box.setRight( line[idx].box.left() )
        line.insert( idx, c )
self.block = Block()
self.block.setBoxes( self.boxes )
## @brief Same as textLines() but with spaces added between words.
# The result is also a list of lines, each line being a list of Character
# objects.
#
# In order not to be distracted with character widths of letters like 'm' or
# 'i' (which are very wide and narrow), the width of the letters is averaged
# on a per line basis. This shows good results, by now, on text with the same
# char size in the line, which is quite usual.
def textLinesWithSpaces(self, region=None):
    # All lines with their characters already in their positions...
    spaced = self.textLines( region )
    # ...and now the spaces, one line at a time (each list is modified
    # in place).
    for row in spaced:
        self.textLineWithSpaces( row )
    return spaced
## @brief Returns the text in the given region as a unicode string, spaces
# included. Lines are separated by a new line character.
def formatedText(self, region=None):
    rows = self.textLinesWithSpaces( region )
    return u'\n'.join( [u''.join( [c.character for c in row] ) for row in rows] )
## @brief Calculates slope of text lines
# This value is used by deskew() function to rotate image and
@ -338,7 +191,7 @@ class Ocr(Analyzer):
def slope(self, region=None):
# TODO: We should probably discard values that highly differ
# from the average for the final value to be used to rotate.
lines = self.textLines( region )
lines = self.block.textLines( region )
slopes = []
for line in lines:
if len(line) < 3: