nanscan/translator.py

66 lines
2.4 KiB
Python

# Copyright (C) 2008 by Albert Cervera i Areny
# albert@nan-tic.com
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import codecs
import os
## @brief This class provides a simple way of converting similar characters
# to the same one.
#
# This can proof useful to overcome OCR errors and is used in Hamming class,
# for example. Default translation file provides families of characters. For
# example 's', 'S', '$' are in the same one because the OCR may sometimes
# recognize an 'S' as '$'. 'l', 'i' and '|' are in another family.
#
# The text 'eli a sa|de$' will be converted to 'ell a saldes'. The translator
# replaces any element of a family by the first character of the family it is in.
class Translator:
def __init__(self):
self.translations = None
## @brief Sets the translation list.
#
# The list should follow the following structure: [ 'sS$', 'li|', ... ]
def setTranslations(self, translations):
self.translations = translations
## @brief Loads the translation list from the given file.
#
# Each character family must be in a different line. See the default
# translations.txt file if you need an example.
def load(self, fileName):
f=codecs.open(fileName, 'r', 'utf-8')
if not f:
print "File not found"
return txt
self.translations = f.readlines()
f.close()
## @brief Returns the given text replacing each character with the first
# character of its family or itself if it's not in any character family.
def translated(self, text):
if self.translations == None:
self.load( os.path.join( os.path.abspath(os.path.dirname(__file__)), 'translations.txt' ) )
result = text
for x in self.translations:
for y in x[1:]:
result = result.replace( y, x[0] )
return result