nanscan/translator.py

#   Copyright (C) 2008 by Albert Cervera i Areny
#   albert@nan-tic.com
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

import codecs
import os

## @brief This class provides a simple way of converting similar characters
# to the same one.
#
# This can prove useful to overcome OCR errors and is used in the Hamming
# class, for example. The default translation file provides families of
# characters. For example 's', 'S', '$' are in the same one because the OCR
# may sometimes recognize an 'S' as '$'. 'l', 'i' and '|' are in another family.
#
# The text 'eli a sa|de$' will be converted to 'ell a saldes'. The translator
# replaces any element of a family by the first character of the family it is in.
class Translator:
	## @brief Creates a translator with no translation list.
	#
	# If neither setTranslations() nor load() is called, translated() loads
	# the 'translations.txt' file located next to this module on first use.
	def __init__(self):
		self.translations = None

	## @brief Sets the translation list.
	#
	# The list should follow the following structure: [ 'sS$', 'li|', ... ]
	# The first character of each entry is the one every other character of
	# that entry gets replaced with.
	def setTranslations(self, translations):
		self.translations = translations

	## @brief Loads the translation list from the given file.
	#
	# Each character family must be in a different line. See the default
	# translations.txt file if you need an example. The file is read as
	# UTF-8. Line terminators are not part of a family and blank lines are
	# ignored.
	#
	# Raises the exception thrown by codecs.open() (IOError / OSError) if the
	# file cannot be opened.
	def load(self, fileName):
		# codecs.open() never returns a false value; it raises when the file
		# cannot be opened, so no explicit "file not found" check is needed.
		# The 'with' block guarantees the file is closed even if reading fails.
		with codecs.open(fileName, 'r', 'utf-8') as f:
			content = f.read()
		# splitlines() drops the line terminators. Keeping them would make
		# the newline a member of every family and translated() would then
		# replace all line breaks in the text.
		self.translations = [family for family in content.splitlines() if family]

	## @brief Returns the given text replacing each character with the first
	# character of its family or itself if it's not in any character family.
	#
	# Families are applied in list order, one replacement pass per family
	# member.
	def translated(self, text):
		if self.translations is None:
			# Lazily fall back to the default file shipped next to this module.
			defaultFile = os.path.join( os.path.abspath(os.path.dirname(__file__)), 'translations.txt' )
			self.load( defaultFile )

		result = text
		for family in self.translations:
			# Slicing (instead of indexing) tolerates an empty family.
			canonical = family[:1]
			for member in family[1:]:
				result = result.replace( member, canonical )
		return result