nanscan/NanScan/Trigram.py

96 lines
2.9 KiB
Python
Executable File

# Copyright (C) 2008 by Albert Cervera i Areny
# albert@nan-tic.com
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
class Trigram:
# Returns a list of the trigrams of a sentence. That is, the list of
# all trigrams of each of the words in a string. Words are currently
# splitted by the space character only.
# Note that more than a list it's a sorted set. So there are no repeated items.
@staticmethod
def trigramList( text ):
words = text.split( ' ' )
l = set()
for x in words:
for y in Trigram.wordTrigramList( x ):
l.add( y )
l = list( l )
l.sort()
return l
# Calculates the list of trigrams contained in a word. If you feed
# this function with an string with spaces they'll be treated like
# normal characters. The usual trigram function is trigramList() which
# returns trigrams for all of it's words.
# Note that more than a list it's a sorted set. So there are no repeated items.
@staticmethod
def wordTrigramList( text ):
l = set()
size = len(text) + 1
text = ' ' + text + ' '
for x in range(size):
l.add( text[x:x+3] )
l = list( l )
l.sort()
return l
# Calculates similarity between two strings using a trigram algorithm.
# This is based in PostgreSQL pg_trgm implementation.
# There's also a commented alternative for the final calculation of the
# distance.
@staticmethod
def trigram( text1, text2 ):
l1 = Trigram.trigramList( text1.lower() )
l2 = Trigram.trigramList( text2.lower() )
size1 = len(l1)
size2 = len(l2)
p1 = 0
p2 = 0
count = 0
while p1 < size1 and p2 < size2:
if l1[p1] < l2[p2]:
p1 += 1
elif l1[p1] > l2[p2]:
p2 += 1
else:
p1 += 1
p2 += 1
count += 1
return float(count) / float( size1 + size2 - count )
# Here another way of calculating the similarity
#if size1 > size2:
#return float(count) / float( size1 )
#else:
#return float(count) / float( size2 )
if __name__ == '__main__':
print Trigram.trigramList( 'abc' )
print Trigram.trigramList( 'abcabc' )
print Trigram.trigramList( 'hola' )
print Trigram.trigramList( 'adeu manelet' )
print Trigram.trigram( 'abc', 'abc' )
print Trigram.trigram( 'abcabc', 'abc' )
print Trigram.trigram( 'abcdef', 'abc' )
print Trigram.trigram( 'abcdef', 'bcd' )
print Trigram.trigram( 'bcdef', 'abc' )