mirror of https://github.com/NaN-tic/nanscan.git
Added Block, PdfReader, Range and TextPatterns.
This commit is contained in:
parent
0eb944c512
commit
4cafa912ce
|
@ -0,0 +1,92 @@
|
||||||
|
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||||
|
# albert@nan-tic.com
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
|
||||||
|
from PyQt4.QtCore import *
|
||||||
|
|
||||||
|
## @brief cmp-style comparison of two objects by their 'distance' attribute.
#
# Returns 1 if x.distance is greater than y.distance, -1 if it is smaller
# and 0 otherwise.
def blockDistanceComparison(x, y):
    xDistance, yDistance = x.distance, y.distance
    if xDistance < yDistance:
        return -1
    if xDistance > yDistance:
        return 1
    return 0
|
||||||
|
|
||||||
|
## @brief cmp-style comparison of two objects by the area of their bounding
# rectangle.
#
# Both arguments must provide a rect() method whose result offers width()
# and height(). Returns 1 if the area of x is greater than the area of y,
# -1 if it is smaller and 0 if both areas are equal.
def blockSizeComparison(x, y):
    xRect = x.rect()
    yRect = y.rect()
    # The area is already a number. It used to be wrapped in len(), which
    # raises TypeError for numeric values.
    xArea = xRect.width() * xRect.height()
    yArea = yRect.width() * yRect.height()
    if xArea > yArea:
        return 1
    elif xArea < yArea:
        return -1
    else:
        return 0
|
||||||
|
|
||||||
|
## @brief This class represents a group of characters in a document.
#
# The characters stored in 'boxes' are expected to provide a 'character'
# attribute (used by text()) and a 'box' attribute (used by rect()).
class Block:
    def __init__(self):
        # Document (list of lines) the block was extracted from.
        self.document = None
        # Characters that belong to the block.
        self.boxes = []
        # Margin added on every side of rect() by outerRect().
        # (Previously misspelled 'outerDistane', which made outerRect() fail.)
        self.outerDistance = 10

    ## @brief Returns a unicode string with the text of the block
    #
    # The text is the concatenation of the 'character' attribute of every
    # item in 'boxes', in insertion order. A block has no line/pos/length
    # (those belong to Range), so the characters are read from 'boxes'.
    def text(self):
        return u''.join( [x.character for x in self.boxes] )

    ## @brief Returns the bounding rectangle of the text in the block
    def rect(self):
        rect = QRectF()
        for c in self.boxes:
            rect = rect.united( c.box )
        return rect

    ## @brief Returns a bounding rectangle of the text in the block that is
    # 'outerDistance' larger in all sides.
    def outerRect(self):
        rect = self.rect()
        # Move the top-left corner outwards and grow the size by twice the
        # margin so that the rectangle expands on the four sides.
        rect.translate( - self.outerDistance, - self.outerDistance )
        rect.setWidth( rect.width() + self.outerDistance * 2 )
        rect.setHeight( rect.height() + self.outerDistance * 2 )
        return rect

    ## @brief Groups the characters of the given document into blocks.
    #
    # 'lines' is a sequence of lines, each one being a sequence of
    # characters. Every character is appended to the first existing block
    # whose outerRect() intersects the character's box; if there is none, a
    # new block is created for it.
    #
    # Returns the list of blocks, or an empty list if 'length' is not
    # positive. 'distance' is accepted but currently not used.
    @staticmethod
    def extractAllBlocksFromDocument(lines, length, distance=0):
        if length <= 0:
            return []
        blocks = []
        # Iterate over the characters themselves (not over their indexes).
        for line in lines:
            for char in line:
                blockFound = False
                for block in blocks:
                    if block.outerRect().intersects( char.box ):
                        block.boxes.append( char )
                        blockFound = True
                        break
                if not blockFound:
                    block = Block()
                    block.boxes.append( char )
                    block.document = lines
                    blocks.append( block )
        return blocks
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
import poppler
|
||||||
|
|
||||||
|
# Demo script: opens a PDF file with poppler and prints the number of pages,
# the size of a page and the full name of the fonts found in the document.
document = poppler.document_new_from_file ('file:///home/albert/prog/empresa/info/pla-empresa/empresa.pdf', None)
n_pages = document.get_n_pages()
print("PAGES:  %s" % n_pages)

# 'current_page' was never assigned in this script; the first page is used.
# NOTE(review): confirm that the first page is the one intended.
current_page = document.get_page(0)
print("SIZE:  %s" % (current_page.get_size(),))

# Font Info:
# There is no 'self' at module level, so the module-level 'document' and
# 'n_pages' are used instead of 'self.document' and 'self.n_pages'.
font_info = poppler.FontInfo(document)
font_iter = font_info.scan(n_pages)

# The iterator is read once before the first call to next().
print(font_iter.get_full_name())

while font_iter.next():
    print(font_iter.get_full_name())
|
|
@ -0,0 +1,93 @@
|
||||||
|
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||||
|
# albert@nan-tic.com
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
|
||||||
|
from PyQt4.QtCore import *
|
||||||
|
|
||||||
|
## @brief cmp-style comparison of two objects by their 'distance' attribute.
#
# Returns 1 if x.distance is greater than y.distance, -1 if it is smaller
# and 0 otherwise.
def rangeDistanceComparison(x, y):
    isGreater = x.distance > y.distance
    isSmaller = x.distance < y.distance
    # At most one of the flags is set, so the difference is 1, -1 or 0.
    return int( isGreater ) - int( isSmaller )
|
||||||
|
|
||||||
|
## @brief cmp-style comparison of two objects by the length of their text().
#
# Returns 1 if the text of x is longer than the text of y, -1 if it is
# shorter and 0 if both have the same length.
def rangeLengthComparison(x, y):
    difference = len( x.text() ) - len( y.text() )
    if difference > 0:
        return 1
    if difference < 0:
        return -1
    return 0
|
||||||
|
|
||||||
|
## @brief This class represents a run of consecutive characters within one
# line of a document.
#
# A range is described by the index of the line ('line'), the index of its
# first character in that line ('pos') and its number of characters
# ('length'). 'document' is the list of lines the indexes refer to.
class Range:
    def __init__(self):
        self.document = None
        self.line = 0
        self.pos = 0
        self.length = 0

    ## @brief Returns the characters of the document covered by the range
    def _chars(self):
        start = self.pos
        return self.document[self.line][start:start + self.length]

    ## @brief Returns a unicode string with the text of the current range
    def text(self):
        return u''.join( [c.character for c in self._chars()] )

    ## @brief Returns the bounding rectangle of the text in the range
    def rect(self):
        bounding = QRectF()
        for char in self._chars():
            bounding = bounding.united( char.box )
        return bounding

    ## @brief Returns a list with all possible ranges of size length of the
    # given document
    #
    # Lines that have 'length' characters or fewer produce a single range
    # that covers the whole line. An empty list is returned if 'length' is
    # not positive. 'width' is accepted but currently ignored.
    @staticmethod
    def extractAllRangesFromDocument(lines, length, width=0):
        ranges = []
        if length <= 0:
            return ranges
        for lineNumber, lineChars in enumerate(lines):
            lineLength = len(lineChars)
            if length >= lineLength:
                # Line too short: one range with the whole line.
                starts = [0]
                size = lineLength
            else:
                starts = range(lineLength - length + 1)
                size = length
            for start in starts:
                ran = Range()
                ran.document = lines
                ran.line = lineNumber
                ran.pos = start
                ran.length = size
                ranges.append( ran )
        return ranges
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||||
|
# albert@nan-tic.com
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
from PyQt4.QtCore import *
|
||||||
|
|
||||||
|
## @brief Converts a text with a number into a float.
#
# If the text contains both ',' and '.', the one that appears last is taken
# as the decimal separator and the other one is removed. If it only
# contains ',', every comma is replaced by '.'. Otherwise the text is
# converted as is. Whatever float() raises for the resulting text is
# propagated to the caller.
def textToFloat( value ):
    if ',' not in value:
        return float( value )
    if '.' not in value:
        return float( value.replace( ',', '.' ) )
    if value.rfind(',') > value.rfind('.'):
        # Comma is the decimal separator: "1.234,56"
        return float( value.replace( '.', '' ).replace( ',', '.' ) )
    # Period is the decimal separator: "1,234.56"
    return float( value.replace( ',', '' ) )
|
||||||
|
|
||||||
|
## @brief Returns True if the given text can be converted with textToFloat()
# and False otherwise.
def isFloat( value ):
    try:
        textToFloat( value )
    # Only conversion failures mean "not a float". A bare 'except' would
    # also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError, AttributeError):
        return False
    return True
|
||||||
|
|
||||||
|
## @brief Returns whether the date textToDate() obtains from the given text
# is valid.
def isDate( value ):
    return textToDate( value ).isValid()
|
||||||
|
|
||||||
|
## @brief Converts a text into a QDate trying several date formats.
#
# Spaces are removed from the text before parsing. The formats are tried in
# order and the first valid date is returned. If no format matches, a
# default-constructed QDate is returned.
def textToDate( value ):
    # 'MM' is the month token of QDate formats. The last pattern used to
    # be 'dd.mm.yyyy', whose lowercase 'mm' is not a month token.
    patterns = ['dd/MM/yyyy', 'dd-MM-yyyy', 'dd-MM-yy', 'dd MMM. yy', 'dd MMMM yyyy', 'dd.MM.yyyy']
    compactValue = value.replace(' ','')
    for pattern in patterns:
        # Spaces are stripped from the text, so they must be stripped from
        # the pattern too. Otherwise patterns with spaces could never match.
        date = QDate.fromString( compactValue, pattern.replace(' ','') )
        if date.isValid():
            return date
    return QDate()
|
||||||
|
|
||||||
|
## @brief Returns True if more than half of the characters of the given text
# are digits (0-9) and False otherwise.
def isMostlyNumeric( text ):
    digits = [x for x in text if x in '0123456789']
    # Comparing the doubled count avoids dividing the length of the text.
    return 2 * len(digits) > len(text)
|
||||||
|
|
Loading…
Reference in New Issue