mirror of https://github.com/NaN-tic/nanscan.git
Added our own trigram implementation. Now it's used in recognizer.
Recognizer usage of trigram still untested but should most possibly work.
This commit is contained in:
parent
6cb2c3b1bd
commit
b0320e055d
|
@ -16,4 +16,9 @@
|
||||||
# Free Software Foundation, Inc.,
|
# Free Software Foundation, Inc.,
|
||||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
from template import *
|
||||||
|
from ocr import *
|
||||||
|
from recognizer import *
|
||||||
|
from scanner import *
|
||||||
|
from document import *
|
||||||
|
from barcode import *
|
||||||
|
|
|
@ -23,6 +23,7 @@ from ocr import *
|
||||||
|
|
||||||
from template import *
|
from template import *
|
||||||
from document import *
|
from document import *
|
||||||
|
from trigram import *
|
||||||
|
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
@ -130,7 +131,7 @@ class Recognizer(QObject):
|
||||||
|
|
||||||
# Tries to find out the best template in 'templates' for the image file given by 'image'
|
# Tries to find out the best template in 'templates' for the image file given by 'image'
|
||||||
# TODO: Should be reestructured to use function scanWithTemplate()
|
# TODO: Should be reestructured to use function scanWithTemplate()
|
||||||
def findBestTemplate( self, cr, file, templates ):
|
def findBestTemplate( self, file, templates ):
|
||||||
self.scan( file )
|
self.scan( file )
|
||||||
max = 0
|
max = 0
|
||||||
bestDocument = Document()
|
bestDocument = Document()
|
||||||
|
@ -157,9 +158,7 @@ class Recognizer(QObject):
|
||||||
print "Jumping %s due to type %s " % ( templateBox.name, templateBox.type )
|
print "Jumping %s due to type %s " % ( templateBox.name, templateBox.type )
|
||||||
continue
|
continue
|
||||||
matcherBoxes += 1
|
matcherBoxes += 1
|
||||||
#cr.execute( 'SELECT similarity(%s,%s)', (translate(text), translate(templateBox.text)) )
|
similarity = Trigram.trigram( text, templateBox.text )
|
||||||
cr.execute( 'SELECT similarity(%s,%s)', (text, templateBox.text) )
|
|
||||||
similarity = cr.fetchone()[0]
|
|
||||||
score += similarity
|
score += similarity
|
||||||
score = score / matcherBoxes
|
score = score / matcherBoxes
|
||||||
if score > max:
|
if score > max:
|
||||||
|
|
|
@ -0,0 +1,72 @@
|
||||||
|
|
||||||
|
class Trigram:
|
||||||
|
|
||||||
|
# Returns a list of the trigrams of a sentence. That is, the list of
|
||||||
|
# all trigrams of each of the words in a string. Words are currently
|
||||||
|
# splitted by the space character only.
|
||||||
|
@staticmethod
|
||||||
|
def trigramList( text ):
|
||||||
|
words = text.split( ' ' )
|
||||||
|
l = []
|
||||||
|
for x in words:
|
||||||
|
for y in Trigram.wordTrigramList( x ):
|
||||||
|
l.append( y )
|
||||||
|
return l
|
||||||
|
|
||||||
|
# Calculates the list of trigrams contained in a word. If you feed
|
||||||
|
# this function with an string with spaces they'll be treated like
|
||||||
|
# normal characters. The usual trigram function is trigramList() which
|
||||||
|
# returns trigrams for all of it's words.
|
||||||
|
@staticmethod
|
||||||
|
def wordTrigramList( text ):
|
||||||
|
l = []
|
||||||
|
size = len(text) + 1
|
||||||
|
text = ' ' + text + ' '
|
||||||
|
for x in range(size):
|
||||||
|
l.append( text[x:x+3] )
|
||||||
|
return l
|
||||||
|
|
||||||
|
# Calculates similarity between two strings using a trigram algorithm.
|
||||||
|
# This is based in PostgreSQL pg_trgm implementation though in some cases
|
||||||
|
# you'll get different results. For example trigram( 'abcabc', 'abc' )
|
||||||
|
# returns 0.3 here and 0.67 in PostgreSQL's version.
|
||||||
|
# There's also a commented alternative for the final calculation of the
|
||||||
|
# distance.
|
||||||
|
@staticmethod
|
||||||
|
def trigram( text1, text2 ):
|
||||||
|
l1 = Trigram.trigramList( text1.lower() )
|
||||||
|
l2 = Trigram.trigramList( text2.lower() )
|
||||||
|
size1 = len(l1)
|
||||||
|
size2 = len(l2)
|
||||||
|
p1 = 0
|
||||||
|
p2 = 0
|
||||||
|
count = 0
|
||||||
|
while p1 < size1 and p2 < size2:
|
||||||
|
if l1[p1] < l2[p2]:
|
||||||
|
p1 += 1
|
||||||
|
elif l1[p1] > l2[p2]:
|
||||||
|
p2 += 1
|
||||||
|
else:
|
||||||
|
p1 += 1
|
||||||
|
p2 += 1
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
return float(count) / float( size1 + size2 - count )
|
||||||
|
|
||||||
|
# Here another way of calculating the similarity
|
||||||
|
#if size1 > size2:
|
||||||
|
#return float(count) / float( size1 )
|
||||||
|
#else:
|
||||||
|
#return float(count) / float( size2 )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
print Trigram.trigramList( 'abc' )
|
||||||
|
print Trigram.trigramList( 'hola' )
|
||||||
|
print Trigram.trigramList( 'adeu manalet' )
|
||||||
|
|
||||||
|
print Trigram.trigram( 'abc', 'abc' )
|
||||||
|
print Trigram.trigram( 'abcabc', 'abc' )
|
||||||
|
print Trigram.trigram( 'abcdef', 'abc' )
|
||||||
|
print Trigram.trigram( 'abcdef', 'bcd' )
|
||||||
|
print Trigram.trigram( 'bcdef', 'abc' )
|
Loading…
Reference in New Issue