From b0320e055d8c3cb48230390c3bc0587da1fb9d0e Mon Sep 17 00:00:00 2001 From: Albert Cervera i Areny Date: Fri, 13 Jun 2008 20:31:14 +0200 Subject: [PATCH] Added our own trigram implementation. Now it's used in recognizer. Recognizer usage of trigram still untested but should most possibly work. --- __init__.py | 7 ++++- recognizer.py | 7 +++-- trigram.py | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 5 deletions(-) create mode 100755 trigram.py diff --git a/__init__.py b/__init__.py index 2ead901..082da09 100644 --- a/__init__.py +++ b/__init__.py @@ -16,4 +16,9 @@ # Free Software Foundation, Inc., # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - +from template import * +from ocr import * +from recognizer import * +from scanner import * +from document import * +from barcode import * diff --git a/recognizer.py b/recognizer.py index 5bd2bf1..6076525 100644 --- a/recognizer.py +++ b/recognizer.py @@ -23,6 +23,7 @@ from ocr import * from template import * from document import * +from trigram import * import tempfile @@ -130,7 +131,7 @@ class Recognizer(QObject): # Tries to find out the best template in 'templates' for the image file given by 'image' # TODO: Should be reestructured to use function scanWithTemplate() - def findBestTemplate( self, cr, file, templates ): + def findBestTemplate( self, file, templates ): self.scan( file ) max = 0 bestDocument = Document() @@ -157,9 +158,7 @@ class Recognizer(QObject): print "Jumping %s due to type %s " % ( templateBox.name, templateBox.type ) continue matcherBoxes += 1 - #cr.execute( 'SELECT similarity(%s,%s)', (translate(text), translate(templateBox.text)) ) - cr.execute( 'SELECT similarity(%s,%s)', (text, templateBox.text) ) - similarity = cr.fetchone()[0] + similarity = Trigram.trigram( text, templateBox.text ) score += similarity score = score / matcherBoxes if score > max: diff --git a/trigram.py b/trigram.py new file mode 100755 index 0000000..456da9a --- 
# Patch hunk header (kept for reference): /dev/null +++ b/trigram.py @@ -0,0 +1,72 @@

class Trigram:
    """Trigram based string similarity, modelled on PostgreSQL's pg_trgm.

    All functions are static; the class is only used as a namespace.
    """

    @staticmethod
    def trigramList( text ):
        """Return the trigrams of every word of a sentence.

        The result is a flat list holding the trigrams of each word, in
        order of appearance and with duplicates preserved. Words are
        split on the space character only, so consecutive spaces (or an
        empty string) produce empty words, each of which contributes one
        all-blank trigram.
        """
        trigrams = []
        for word in text.split( ' ' ):
            trigrams.extend( Trigram.wordTrigramList( word ) )
        return trigrams

    @staticmethod
    def wordTrigramList( text ):
        """Return the list of trigrams contained in a single word.

        The word is padded with two leading blanks and one trailing
        blank, which yields exactly len(text) + 1 trigrams of three
        characters each. Spaces inside 'text' are treated like any other
        character; use trigramList() to get the trigrams of all the
        words of a sentence.
        """
        # Two leading blanks are required so that the last of the
        # len(text) + 1 slices is still three characters long.
        padded = '  ' + text + ' '
        return [ padded[i:i+3] for i in range( len( text ) + 1 ) ]

    @staticmethod
    def trigram( text1, text2 ):
        """Return the similarity of two strings as a float in [0.0, 1.0].

        The comparison is case insensitive. The value is the number of
        distinct trigrams shared by both strings divided by the number
        of distinct trigrams found in either of them, which is the
        calculation used by PostgreSQL's pg_trgm. For example
        trigram( 'abcabc', 'abc' ) returns 0.67. Results may still
        differ from PostgreSQL for texts containing punctuation or
        whitespace other than the space character, because words are
        split on the space character only.

        An alternative normalisation would divide the shared count by
        the size of the larger trigram set instead of by the union.
        """
        # Work on sets: the trigram lists are neither sorted nor free of
        # duplicates, so walking them in parallel would make the result
        # depend on the order in which trigrams appear.
        set1 = set( Trigram.trigramList( text1.lower() ) )
        set2 = set( Trigram.trigramList( text2.lower() ) )
        shared = len( set1 & set2 )
        # trigramList() always returns at least one trigram, so the
        # union is never empty and the division is safe.
        total = len( set1 | set2 )
        return float( shared ) / float( total )


if __name__ == '__main__':
    # Single-argument print calls behave the same on Python 2 and 3.
    print( Trigram.trigramList( 'abc' ) )
    print( Trigram.trigramList( 'hola' ) )
    print( Trigram.trigramList( 'adeu manalet' ) )

    print( Trigram.trigram( 'abc', 'abc' ) )
    print( Trigram.trigram( 'abcabc', 'abc' ) )
    print( Trigram.trigram( 'abcdef', 'abc' ) )
    print( Trigram.trigram( 'abcdef', 'bcd' ) )
    print( Trigram.trigram( 'bcdef', 'abc' ) )