mirror of https://github.com/NaN-tic/nanscan.git
96 lines
2.9 KiB
Python
Executable File
96 lines
2.9 KiB
Python
Executable File
# Copyright (C) 2008 by Albert Cervera i Areny
|
|
# albert@nan-tic.com
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the
|
|
# Free Software Foundation, Inc.,
|
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
|
|
|
|
class Trigram:
    """Trigram-based string similarity helpers.

    Every method is static; the class is only used as a namespace.
    """

    @staticmethod
    def trigramList( text ):
        """Return the sorted, duplicate-free trigrams of every word in text.

        Words are separated by the single space character only. As a
        consequence, consecutive spaces produce empty words (each of which
        contributes the all-blank padding trigram), and tabs or newlines
        are kept inside the words they touch.
        """
        trigrams = set()
        for word in text.split( ' ' ):
            trigrams.update( Trigram.wordTrigramList( word ) )
        return sorted( trigrams )

    @staticmethod
    def wordTrigramList( text ):
        """Return the sorted, duplicate-free trigrams of a single word.

        The word is padded with one space on each side and a window of
        three characters is slid over it, one window per start offset from
        0 to len(text). The last window only covers the final character
        and the trailing pad, so it is two characters long.

        Spaces inside text are treated like any other character; use
        trigramList() to get the trigrams of each word of a sentence.
        """
        padded = ' ' + text + ' '
        window_count = len( text ) + 1
        return sorted( set( padded[start:start + 3] for start in range( window_count ) ) )

    @staticmethod
    def trigram( text1, text2 ):
        """Return the trigram similarity of two strings, from 0.0 to 1.0.

        Both strings are lower-cased, so the comparison is case
        insensitive. The result is the number of trigrams the strings
        share divided by the number of distinct trigrams found in either
        of them. This is based on the PostgreSQL pg_trgm implementation.
        """
        first = set( Trigram.trigramList( text1.lower() ) )
        second = set( Trigram.trigramList( text2.lower() ) )
        shared = len( first & second )

        # The denominator is the size of the union. It is never zero
        # because even an empty string yields one padding trigram.
        #
        # NOTE: an alternative way of calculating the similarity is to
        # divide the shared count by the larger of the two trigram counts
        # instead of by the size of the union.
        return float( shared ) / float( len( first ) + len( second ) - shared )
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: print the trigram lists of a few sample strings,
    # followed by the similarity of a few sample pairs.
    # print is called with a single parenthesised argument so the script
    # runs unchanged on both Python 2 and Python 3.
    print( Trigram.trigramList( 'abc' ) )
    print( Trigram.trigramList( 'abcabc' ) )
    print( Trigram.trigramList( 'hola' ) )
    print( Trigram.trigramList( 'adeu manelet' ) )

    print( Trigram.trigram( 'abc', 'abc' ) )
    print( Trigram.trigram( 'abcabc', 'abc' ) )
    print( Trigram.trigram( 'abcdef', 'abc' ) )
    print( Trigram.trigram( 'abcdef', 'bcd' ) )
    print( Trigram.trigram( 'bcdef', 'abc' ) )
|