Added Block, PdfReader, Range and TextPatterns.

This commit is contained in:
Albert Cervera i Areny 2009-03-14 18:11:20 +01:00
parent 0eb944c512
commit 4cafa912ce
4 changed files with 264 additions and 0 deletions

92
NanScan/Block.py Normal file
View File

@ -0,0 +1,92 @@
# Copyright (C) 2009 by Albert Cervera i Areny
# albert@nan-tic.com
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from PyQt4.QtCore import *
## @brief cmp()-style comparison of two objects by their 'distance' attribute.
#
# Returns 1 if x.distance is greater than y.distance, -1 if it is smaller
# and 0 otherwise, so it can be used as a comparison function for sorting.
def blockDistanceComparison(x, y):
    xDistance = x.distance
    yDistance = y.distance
    if xDistance > yDistance:
        return 1
    if xDistance < yDistance:
        return -1
    return 0
## @brief cmp()-style comparison of two blocks by the area of their bounding rectangle.
#
# Both arguments must provide a rect() method whose result offers width() and
# height(). Returns 1 if x covers a larger area than y, -1 if it covers a
# smaller one and 0 if both areas are equal.
def blockSizeComparison(x, y):
    xRect = x.rect()
    yRect = y.rect()
    # The area is already a number: it must not be wrapped in len(), which
    # raises TypeError for numeric values.
    xArea = xRect.width() * xRect.height()
    yArea = yRect.width() * yRect.height()
    if xArea > yArea:
        return 1
    elif xArea < yArea:
        return -1
    else:
        return 0
## @brief This class represents a group of characters in a document whose
# boxes lie close to each other.
class Block:
    def __init__(self):
        # Document (list of lines of characters) the block was extracted from.
        self.document = None
        # Characters that belong to the block. Each one is expected to offer
        # a 'character' attribute (its text) and a 'box' attribute (its rectangle).
        self.boxes = []
        # Margin added on every side of rect() by outerRect().
        self.outerDistance = 10

    ## @brief Returns a unicode string with the text of the characters in the
    # block, in the order in which they were added.
    def text(self):
        return u''.join( [x.character for x in self.boxes] )

    ## @brief Returns the bounding rectangle of the characters in the block
    def rect(self):
        rect = QRectF()
        for c in self.boxes:
            rect = rect.united( c.box )
        return rect

    ## @brief Returns a bounding rectangle of the text in the block that is 'outerDistance'
    # larger in all sides.
    def outerRect(self):
        rect = self.rect()
        # Move the top left corner outwards and then grow the size by twice the
        # margin so the right and bottom sides are moved outwards too.
        rect.translate( - self.outerDistance, - self.outerDistance )
        rect.setWidth( rect.width() + self.outerDistance * 2 )
        rect.setHeight( rect.height() + self.outerDistance * 2 )
        return rect

    ## @brief Groups the characters of the given document into blocks.
    #
    # 'lines' is a list of lines, each one being a list of characters. Every
    # character is added to the first existing block whose outerRect()
    # intersects the character's box; if there is none, a new block is started
    # with that character.
    #
    # An empty list is returned if 'length' is not positive.
    # NOTE(review): 'length' is only used for that check and 'distance' is not
    # used at all; confirm the intended meaning with the callers.
    @staticmethod
    def extractAllBlocksFromDocument(lines, length, distance=0):
        if length <= 0:
            return []
        blocks = []
        for line in lines:
            for char in line:
                for block in blocks:
                    if block.outerRect().intersects( char.box ):
                        block.boxes.append( char )
                        break
                else:
                    # No existing block is close enough: start a new one.
                    block = Block()
                    block.boxes.append( char )
                    block.document = lines
                    blocks.append( block )
        return blocks

16
NanScan/PdfReader.py Normal file
View File

@ -0,0 +1,16 @@
import poppler
# Small experiment with the poppler bindings: prints the number of pages, the
# size of the first page and the names of the fonts found in a PDF document.
document = poppler.document_new_from_file ('file:///home/albert/prog/empresa/info/pla-empresa/empresa.pdf', None)
n_pages = document.get_n_pages()
print("PAGES:  %s" % n_pages)
# The page has to be fetched from the document before asking for its size.
if n_pages > 0:
    current_page = document.get_page(0)
    print("SIZE:  %s" % (current_page.get_size(),))
# Font Info:
font_info = poppler.FontInfo(document)
font_iter = font_info.scan(n_pages)
# NOTE(review): the guard assumes scan() may yield no iterator when the
# document has no fonts -- confirm against the binding in use.
if font_iter is not None:
    print(font_iter.get_full_name())
    while font_iter.next():
        print(font_iter.get_full_name())

93
NanScan/Range.py Normal file
View File

@ -0,0 +1,93 @@
# Copyright (C) 2009 by Albert Cervera i Areny
# albert@nan-tic.com
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from PyQt4.QtCore import *
## @brief cmp()-style comparison of two objects by their 'distance' attribute.
#
# The result is 1 when x.distance is the greater one, -1 when it is the
# smaller one and 0 in any other case.
def rangeDistanceComparison(x, y):
    if x.distance > y.distance:
        return 1
    if x.distance < y.distance:
        return -1
    return 0
## @brief cmp()-style comparison of two objects by the length of their text().
#
# Returns 1 if the text of x is longer than the text of y, -1 if it is
# shorter and 0 if both texts have the same length.
def rangeLengthComparison(x, y):
    xLength = len( x.text() )
    yLength = len( y.text() )
    # Both operands are integers, so the difference of the two boolean
    # tests is exactly 1, -1 or 0.
    return (xLength > yLength) - (xLength < yLength)
## @brief This class represents a group of consecutive characters within a
# single line of a document.
class Range:
    def __init__(self):
        # Index of the line inside 'document'.
        self.line = 0
        # Index of the first character of the range inside the line.
        self.pos = 0
        # Number of characters in the range.
        self.length = 0
        # Document (list of lines of characters) the range refers to.
        self.document = None

    ## @brief Returns the characters of the document covered by the range
    def _chars(self):
        line = self.document[self.line]
        return line[self.pos:self.pos + self.length]

    ## @brief Returns a unicode string with the text of the current range
    def text(self):
        return u''.join( [x.character for x in self._chars()] )

    ## @brief Returns the bounding rectangle of the text in the range
    def rect(self):
        rect = QRectF()
        for c in self._chars():
            rect = rect.united( c.box )
        return rect

    ## @brief Returns a list with all possible ranges of size length of the
    # given document
    #
    # 'lines' is a list of lines, each one being a list of characters. For
    # every line, one range is created per possible start position. Lines
    # that have 'length' characters or fewer (including empty lines) produce
    # a single range that covers the whole line.
    #
    # An empty list is returned if 'length' is not positive.
    # NOTE: 'width' is accepted but currently not applied to the ranges.
    @staticmethod
    def extractAllRangesFromDocument(lines, length, width=0):
        if length <= 0:
            return []
        ranges = []
        for lineIndex, line in enumerate(lines):
            lineLength = len(line)
            # Never ask for more characters than the line holds.
            size = min(length, lineLength)
            for pos in range(lineLength - size + 1):
                ran = Range()
                ran.line = lineIndex
                ran.pos = pos
                ran.length = size
                ran.document = lines
                ranges.append( ran )
        return ranges

63
NanScan/TextPatterns.py Normal file
View File

@ -0,0 +1,63 @@
# Copyright (C) 2009 by Albert Cervera i Areny
# albert@nan-tic.com
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from PyQt4.QtCore import *
## @brief Converts a text into a float, accepting both comma and period as
# decimal separator.
#
# If the text contains both characters, the one that appears last is taken as
# the decimal separator and the other one is removed as a thousands
# separator. If it only contains commas, they are replaced by periods.
# Any other text is handed to float() unchanged, which raises ValueError
# if the text is not a valid number.
def textToFloat( value ):
    if ',' not in value:
        return float( value )
    if '.' not in value:
        return float( value.replace( ',', '.' ) )
    lastComma = value.rfind(',')
    lastPeriod = value.rfind('.')
    if lastComma > lastPeriod:
        # 1.234,56 -> 1234.56
        normalized = value.replace( '.', '' ).replace( ',', '.' )
    else:
        # 1,234.56 -> 1234.56
        normalized = value.replace( ',', '' )
    return float( normalized )
## @brief Returns True if the given text can be converted with textToFloat()
# and False otherwise.
def isFloat( value ):
    try:
        textToFloat( value )
    except (ValueError, TypeError, AttributeError):
        # ValueError: the text is not a number.
        # TypeError / AttributeError: the value is not text at all.
        # Anything else (KeyboardInterrupt, programming errors...) is not
        # hidden from the caller.
        return False
    return True
## @brief Returns True if textToDate() obtains a valid date from the given
# text and False otherwise.
def isDate( value ):
    return textToDate( value ).isValid()
## @brief Converts a text into a QDate by trying a list of known date formats.
#
# Spaces are removed from the text before parsing, so they are removed from
# the formats too; otherwise the formats that contain spaces could never match.
# The first format that produces a valid date wins. If none of them does, an
# invalid QDate is returned.
def textToDate( value ):
    # Months are 'MM' in Qt date formats; 'mm' is not a month expression.
    patterns = ['dd/MM/yyyy', 'dd-MM-yyyy', 'dd-MM-yy', 'dd MMM. yy', 'dd MMMM yyyy', 'dd.MM.yyyy']
    compactValue = value.replace(' ','')
    for pattern in patterns:
        date = QDate.fromString( compactValue, pattern.replace(' ','') )
        if date.isValid():
            return date
    return QDate()
## @brief Returns True if more than half of the elements of the given text
# are decimal digits (0-9) and False otherwise.
#
# An empty text is not considered numeric.
def isMostlyNumeric( text ):
    digits = sum( 1 for char in text if char in '0123456789' )
    # Same test as "digits > len(text) / 2" without depending on how the
    # division rounds.
    return digits * 2 > len(text)