mirror of https://github.com/NaN-tic/nanscan.git
Added Block, PdfReader, Range and TextPatterns.
This commit is contained in:
parent
0eb944c512
commit
4cafa912ce
|
@ -0,0 +1,92 @@
|
|||
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||
# albert@nan-tic.com
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
|
||||
from PyQt4.QtCore import *
|
||||
|
||||
## @brief Comparison function that orders two objects by their 'distance'
# attribute.
#
# Returns 1 when x.distance is greater than y.distance, -1 when it is smaller
# and 0 otherwise.
def blockDistanceComparison(x, y):
    first = x.distance
    second = y.distance
    if first > second:
        return 1
    if first < second:
        return -1
    return 0
|
||||
|
||||
## @brief Comparison function that orders two blocks by the area of the
# rectangle returned by their rect() method.
#
# Returns 1 when the area of x is greater than the area of y, -1 when it is
# smaller and 0 when both areas are equal.
def blockSizeComparison(x, y):
    xRect = x.rect()
    yRect = y.rect()
    # The area is already a number; it must be compared directly because
    # calling len() on a number raises TypeError.
    xArea = xRect.width() * xRect.height()
    yArea = yRect.width() * yRect.height()
    if xArea > yArea:
        return 1
    elif xArea < yArea:
        return -1
    else:
        return 0
|
||||
|
||||
## @brief This class represents a group of characters in a document.
|
||||
class Block:
    ## @brief Creates an empty block that is not attached to any document.
    def __init__(self):
        # Document (list of lines of characters) the block was extracted from.
        self.document = None
        # Character objects that belong to the block, in insertion order.
        self.boxes = []
        # Margin added on every side of rect() by outerRect().
        self.outerDistance = 10

    ## @brief Returns a unicode string with the text of the current block
    #
    # The text is built from the 'character' attribute of every object stored
    # in 'boxes', in the order in which they were added.
    def text(self):
        return u''.join( [x.character for x in self.boxes] )

    ## @brief Returns the bounding rectangle of the text in the block
    #
    # An empty block returns a default constructed QRectF.
    def rect(self):
        rect = QRectF()
        for c in self.boxes:
            rect = rect.united( c.box )
        return rect

    ## @brief Returns a bounding rectangle of the text in the block that is 'outerDistance'
    # larger in all sides.
    def outerRect(self):
        rect = self.rect()
        # Move the top left corner outwards and then grow the size twice the
        # margin so the right and bottom sides also end up 'outerDistance' away.
        rect.translate( - self.outerDistance, - self.outerDistance )
        rect.setWidth( rect.width() + self.outerDistance * 2 )
        rect.setHeight( rect.height() + self.outerDistance * 2 )
        return rect

    ## @brief Returns a list with all the blocks found in the given document
    #
    # Each character is added to the first existing block whose outerRect()
    # intersects the character's box; when there is none, a new block is
    # created for it.
    #
    # @param lines Document: a list of lines, each one a list of characters
    #   with 'character' and 'box' attributes.
    # @param length When it is zero or negative an empty list is returned. It
    #   is not used otherwise.
    # @param distance Accepted for interface compatibility; currently unused.
    @staticmethod
    def extractAllBlocksFromDocument(lines, length, distance=0):
        if length <= 0:
            return []
        blocks = []
        for line in lines:
            for char in line:
                for block in blocks:
                    if block.outerRect().intersects( char.box ):
                        block.boxes.append( char )
                        break
                else:
                    # No existing block is close enough: start a new one.
                    block = Block()
                    block.boxes.append( char )
                    block.document = lines
                    blocks.append( block )
        return blocks
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
import sys

import poppler

# Small manual test of the poppler bindings: it prints the number of pages,
# the size of the first page and the full name of every font in a PDF file.
#
# The URI of the document can be given as the first command line argument;
# when it is missing the original sample document is used.
DEFAULT_URI = 'file:///home/albert/prog/empresa/info/pla-empresa/empresa.pdf'

if len(sys.argv) > 1:
    uri = sys.argv[1]
else:
    uri = DEFAULT_URI

document = poppler.document_new_from_file(uri, None)
n_pages = document.get_n_pages()

print("%s %s" % ("PAGES: ", n_pages))

# The size is only available when the document has at least one page.
if n_pages > 0:
    current_page = document.get_page(0)
    print("%s %s" % ("SIZE: ", current_page.get_size()))

# Font Info:
font_info = poppler.FontInfo(document)
font_iter = font_info.scan(n_pages)

# NOTE(review): scan() presumably returns None when no font is found -- confirm
# against the bindings in use.
if font_iter is not None:
    print(font_iter.get_full_name())

    # next() here is the poppler fonts iterator method: it advances to the
    # following font and returns a false value once there are no more.
    while font_iter.next():
        print(font_iter.get_full_name())
|
|
@ -0,0 +1,93 @@
|
|||
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||
# albert@nan-tic.com
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
|
||||
from PyQt4.QtCore import *
|
||||
|
||||
## @brief Comparison function that orders two objects by their 'distance'
# attribute.
#
# Returns 1 when x.distance is greater than y.distance, -1 when it is smaller
# and 0 otherwise.
def rangeDistanceComparison(x, y):
    if x.distance > y.distance:
        return 1
    if x.distance < y.distance:
        return -1
    return 0
|
||||
|
||||
## @brief Comparison function that orders two ranges by the number of
# characters returned by their text() method.
#
# Returns 1 when the text of x is longer than the text of y, -1 when it is
# shorter and 0 when both have the same length.
def rangeLengthComparison(x, y):
    xLength = len( x.text() )
    yLength = len( y.text() )
    if xLength < yLength:
        return -1
    if xLength > yLength:
        return 1
    return 0
|
||||
|
||||
## @brief This class represents a group of characters in a document.
|
||||
class Range:
    ## @brief Creates an empty range placed at the start of the first line and
    # not attached to any document.
    def __init__(self):
        self.document = None
        self.line = 0
        self.pos = 0
        self.length = 0

    ## @brief Returns a unicode string with the text of the current range
    #
    # The text is the concatenation of the 'character' attribute of the
    # 'length' characters that start at position 'pos' of line 'line'.
    def text(self):
        lastPos = self.pos + self.length
        selected = self.document[self.line][self.pos:lastPos]
        return u''.join( [item.character for item in selected] )

    ## @brief Returns the bounding rectangle of the text in the range
    #
    # It is the union of the 'box' of every character in the range, starting
    # from a default constructed QRectF.
    def rect(self):
        lastPos = self.pos + self.length
        selected = self.document[self.line][self.pos:lastPos]
        bounds = QRectF()
        for item in selected:
            bounds = bounds.united( item.box )
        return bounds

    ## @brief Returns a list with all possible ranges of size length of the
    # given document
    #
    # Lines that have 'length' characters or fewer produce a single range that
    # covers the whole line. A zero or negative 'length' returns an empty list.
    # The 'width' parameter is currently ignored.
    @staticmethod
    def extractAllRangesFromDocument(lines, length, width=0):
        if length <= 0:
            return []
        ranges = []
        for lineNumber, content in enumerate(lines):
            available = len(content)
            if length >= available:
                # Short line: one range with the whole line.
                positions = [0]
                size = available
            else:
                # Slide a window of 'length' characters along the line.
                positions = range(available - length + 1)
                size = length
            for position in positions:
                ran = Range()
                ran.document = lines
                ran.line = lineNumber
                ran.pos = position
                ran.length = size
                ranges.append( ran )
        return ranges
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
# Copyright (C) 2009 by Albert Cervera i Areny
|
||||
# albert@nan-tic.com
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
from PyQt4.QtCore import *
|
||||
|
||||
## @brief Converts a text with a number into a float.
#
# When the text contains both ',' and '.', the one that appears last is taken
# as the decimal separator and the other one is removed. A text that only
# contains ',' has it replaced by '.'. The result is handed to float(), so any
# error float() raises is propagated to the caller.
def textToFloat( value ):
    hasComma = ',' in value
    if not hasComma:
        return float( value )
    if '.' not in value:
        return float( value.replace( ',', '.' ) )
    if value.rfind(',') > value.rfind('.'):
        # Comma is the decimal separator: '1.234,56'
        normalized = value.replace( '.', '' ).replace( ',', '.' )
    else:
        # Period is the decimal separator: '1,234.56'
        normalized = value.replace( ',', '' )
    return float( normalized )
|
||||
|
||||
## @brief Returns True if the given value can be converted into a float by
# textToFloat() and False otherwise.
def isFloat( value ):
    try:
        textToFloat( value )
    except (TypeError, ValueError, AttributeError):
        # Only conversion failures mean "not a float". A bare 'except' would
        # also hide KeyboardInterrupt and SystemExit.
        return False
    return True
|
||||
|
||||
## @brief Returns True if textToDate() obtains a valid date from the given
# text and False otherwise.
def isDate( value ):
    return textToDate( value ).isValid()
|
||||
|
||||
## @brief Converts a text into a QDate trying several known date formats.
#
# Blanks in the text are ignored. The formats are tried in order and the first
# one that produces a valid date wins. If none of them matches, an invalid
# (default constructed) QDate is returned.
def textToDate( value ):
    # 'MM' is the month expression in QDate formats; lowercase 'mm' is not a
    # date expression, so the last pattern must use 'MM' to be able to match.
    patterns = ['dd/MM/yyyy', 'dd-MM-yyyy', 'dd-MM-yy', 'dd MMM. yy', 'dd MMMM yyyy', 'dd.MM.yyyy']
    compactValue = value.replace(' ','')
    for pattern in patterns:
        # The value has its blanks removed, so the pattern must have them
        # removed too; otherwise patterns with blanks could never match.
        attempts = [ (compactValue, pattern.replace(' ','')) ]
        if ' ' in pattern:
            # Also try the text as given against the pattern as written.
            attempts.append( (value, pattern) )
        for text, dateFormat in attempts:
            date = QDate.fromString( text, dateFormat )
            if date.isValid():
                return date
    return QDate()
|
||||
|
||||
## @brief Returns True if more than half of the characters of the given text
# are digits ('0' to '9') and False otherwise.
#
# An empty text returns False.
def isMostlyNumeric( text ):
    digits = len( [c for c in text if c in '0123456789'] )
    return digits > len(text) / 2
|
||||
|
Loading…
Reference in New Issue