This commit is contained in:
John 2024-01-18 22:20:52 +01:00
parent 51d68caa88
commit 75f6a2ac8e
8 changed files with 6456 additions and 2 deletions

View File

@ -0,0 +1,39 @@
from datetime import datetime
# https://www.pg4e.com/code/datecompat.py
# Non-dateutil version - we try our best
def parsemaildate(md) :
    """Convert a mail header date into an ISO 8601 style string.

    Non-dateutil version - we try our best.  ``md`` is expected to look like
    ``'5 Jan 2008 09:14:16 -0500'``: the first four whitespace-separated
    pieces are parsed as day, month, year and time, and an optional fifth
    piece is treated as the UTC offset.

    Args:
        md: The date text taken from a mail header.

    Returns:
        The ``datetime.isoformat()`` text followed by a UTC offset, or
        ``None`` when the date/time matches none of the supported layouts.
        A well-formed numeric offset such as ``-0500`` is rendered as
        ``-05:00``; a missing, non-numeric or malformed zone (``GMT``,
        ``(EST)``, ``500`` ...) falls back to ``+0000``.
    """
    pieces = md.split()
    notz = " ".join(pieces[:4])

    # Try a bunch of format variations: 2 or 4 digit year, with or without
    # seconds.  strptime() raises ValueError when a layout does not match.
    dnotz = None
    for form in ('%d %b %Y %H:%M:%S', '%d %b %Y %H:%M',
                 '%d %b %y %H:%M:%S', '%d %b %y %H:%M'):
        try:
            dnotz = datetime.strptime(notz, form)
            break
        except ValueError:
            continue

    if dnotz is None :
        return None

    iso = dnotz.isoformat()

    # Only want numeric timezone values.  The candidate is validated before
    # it replaces the default so that a zone name never leaks into the result.
    tz = "+0000"
    if len(pieces) > 4:
        raw = pieces[4]
        well_formed = (
            len(raw) == 5
            and raw[0] in "+-"
            and all(ch in "0123456789" for ch in raw[1:])
        )
        if well_formed:
            if raw == '-0000':
                raw = '+0000'
            # +HHMM -> +HH:MM
            tz = raw[:3] + ":" + raw[3:]

    return iso + tz

View File

@ -0,0 +1,110 @@
# https://www.pg4e.com/code/elasticbook.py
# Download a book
# wget http://www.gutenberg.org/cache/epub/18866/pg18866.txt
# wget http://www.gutenberg.org/cache/epub/14091/pg14091.txt
# wget https://www.gutenberg.org/files/2591/2591-0.txt
# wget https://www.gutenberg.org/files/11/11-0.txt
# (If needed)
# https://www.pg4e.com/code/hidden-dist.py
# copy hidden-dist.py to hidden.py
# edit hidden.py and put in your credentials
# python3 elasticbook.py
from elasticsearch import Elasticsearch
from elasticsearch import RequestsHttpConnection
import time
import copy
import hidden
import uuid
import json
import hashlib
# Load a plain-text book into Elasticsearch, one document per paragraph.
# A paragraph is a run of non-blank lines; blank lines separate paragraphs.
bookfile = input("Enter book file (i.e. pg18866.txt): ")
if bookfile.strip() == '':
    raise Exception("empty string detected, please try again to enter a book file")

# Make sure we can open the file before the existing index is dropped
fhand = open(bookfile)

# Load the secrets
secrets = hidden.elastic()

es = Elasticsearch(
    [ secrets['host'] ],
    http_auth=(secrets['user'], secrets['pass']),
    url_prefix = secrets['prefix'],
    scheme=secrets['scheme'],
    port=secrets['port'],
    connection_class=RequestsHttpConnection,
)

# set indexname equal to elasticsearch username
indexname = secrets['user']

# Start fresh
# https://elasticsearch-py.readthedocs.io/en/master/api.html#indices
res = es.indices.delete(index=indexname, ignore=[400, 404])
print("Dropped index", indexname)
print(res)

res = es.indices.create(index=indexname)
print("Created the index...")
print(res)


def _index_paragraph(offset, content):
    """Index one paragraph and return the Elasticsearch response.

    The primary key is the SHA256 of the JSON-encoded document.  Because the
    key is based on the document contents, re-indexing an unchanged document
    overwrites it in place (in effect INSERT ON CONFLICT UPDATE) rather than
    creating a duplicate.  Alternatives would be the paragraph count or a
    GUID (uuid.uuid4()).

    Args:
        offset: 1-based paragraph number within the book.
        content: The paragraph text.
    """
    doc = {
        'offset' : offset,
        'content': content
    }
    m = hashlib.sha256()
    m.update(json.dumps(doc).encode())
    pkey = m.hexdigest()

    result = es.index(index=indexname, id=pkey, body=doc)
    print('Added document', pkey)

    # Pause briefly every 100 paragraphs
    if offset % 100 == 0 :
        print(offset, 'loaded...')
        time.sleep(1)
    return result


para = ''
chars = 0
count = 0
pcount = 0

# "with" closes the file once every line has been read
with fhand:
    for line in fhand:
        count = count + 1
        line = line.strip()
        chars = chars + len(line)

        if line == '':
            # A blank line ends the current paragraph (if there is one)
            if para != '':
                pcount = pcount + 1
                res = _index_paragraph(pcount, para)
                para = ''
            continue

        para = para + ' ' + line

# The file may end without a trailing blank line - do not lose the
# final paragraph in that case
if para != '':
    pcount = pcount + 1
    res = _index_paragraph(pcount, para)
    para = ''

# Tell it to recompute the index
res = es.indices.refresh(index=indexname)
print("Index refreshed", indexname)
print(res)

print(' ')
print('Loaded',pcount,'paragraphs',count,'lines',chars,'characters')

View File

@ -0,0 +1,174 @@
# https://www.pg4e.com/code/elasticmail.py
# https://www.pg4e.com/code/hidden-dist.py
# copy hidden-dist.py to hidden.py (if necessary)
# edit hidden.py and put in your credentials
# http://mbox.dr-chuck.net/sakai.devel/100/101
# python3 elasticmail.py
# Pulls data from the web and puts it into index
import requests
import re
import hidden
import datecompat
import time
import json
import copy
import hidden
import dateutil.parser as parser # If this import fails - just comment it out
from elasticsearch import Elasticsearch
from elasticsearch import RequestsHttpConnection
def parsemaildate(md) :
    """Return the mail date ``md`` as an ISO 8601 string.

    dateutil is tried first.  If it cannot parse the text - or is not
    available at all because its import was commented out, which leaves
    ``parser`` undefined - fall back to ``datecompat.parsemaildate``.

    Args:
        md: The date text taken from a mail header.

    Returns:
        ``isoformat()`` of the parsed date, or whatever
        ``datecompat.parsemaildate(md)`` returns when dateutil fails.
    """
    try:
        # Parse the argument itself rather than relying on a global that the
        # caller happens to have set
        return parser.parse(md).isoformat()
    except Exception:
        # Covers both a parse failure and the NameError raised when the
        # dateutil import has been commented out
        return datecompat.parsemaildate(md)
# Pull mail messages from the web one at a time and index each of them as a
# document holding the sender, the lower-cased headers and the body.
secrets = hidden.elastic()

# Connect to our database
es = Elasticsearch(
    [ secrets['host'] ],
    http_auth=(secrets['user'], secrets['pass']),
    url_prefix = secrets['prefix'],
    scheme=secrets['scheme'],
    port=secrets['port'],
    connection_class=RequestsHttpConnection,
)

# In our test world - we only get one index :(
indexname = secrets['user']

# Start fresh
# https://elasticsearch-py.readthedocs.io/en/master/api.html#indices
res = es.indices.delete(index=indexname, ignore=[400, 404])
print("Dropped index")
print(res)

res = es.indices.create(index=indexname)
print("Created the index...")
print(res)

baseurl = 'http://mbox.dr-chuck.net/sakai.devel/'

# Header patterns, compiled once.  Raw strings keep \S from being treated
# as an (invalid) string escape.
from_bracketed = re.compile(r'\nFrom: .* <(\S+@\S+)>\n')
from_bare = re.compile(r'\nFrom: (\S+@\S+)\n')
date_line = re.compile(r'\nDate: .*, (.*)\n')
header_line = re.compile(r'([^ :]*): (.*)$')

many = 0    # messages still to fetch before prompting again
count = 0   # messages retrieved so far
fail = 0    # consecutive failures; the loop stops after more than 5
start = 0   # number of the message being fetched

while True:
    if ( many < 1 ) :
        sval = input('How many messages:')
        if ( len(sval) < 1 ) : break
        try:
            many = int(sval)
        except ValueError:
            # Ask again instead of crashing on a typo
            print('Please enter a number')
            continue

    start = start + 1
    many = many - 1
    url = baseurl + str(start) + '/' + str(start + 1)

    text = 'None'
    try:
        # Open with a timeout of 30 seconds
        response = requests.get(url, timeout=30)
        text = response.text
        status = response.status_code
        if status != 200 :
            print('Error code=',status, url)
            break
    except KeyboardInterrupt:
        print('')
        print('Program interrupted by user...')
        break
    except Exception as e:
        print('Unable to retrieve or parse page',url)
        print('Error',e)
        fail = fail + 1
        if fail > 5 : break
        continue

    print(url,len(text))
    count = count + 1

    if not text.startswith('From '):
        print(text)
        print('Did not find From ')
        fail = fail + 1
        if fail > 5 : break
        continue

    # Headers and body are separated by the first blank line
    pos = text.find('\n\n')
    if pos > 0 :
        hdr = text[:pos]
        body = text[pos+2:]
    else:
        print(text)
        print('Could not find break between headers and body')
        fail = fail + 1
        if fail > 5 : break
        continue

    # Accept with or without < >
    email = None
    x = from_bracketed.findall(hdr)
    if len(x) != 1 :
        x = from_bare.findall(hdr)
    if len(x) == 1 :
        email = x[0].strip().lower()
        # The bare pattern can capture the brackets themselves - drop both
        email = email.replace('<','').replace('>','')

    # Hack the date
    sent_at = None
    y = date_line.findall(hdr)
    if len(y) == 1 :
        tdate = y[0]
        tdate = tdate[:26]
        try:
            sent_at = parsemaildate(tdate)
        except Exception:
            print(text)
            print('Parse fail',tdate)
            fail = fail + 1
            if fail > 5 : break
            continue

    # Make the headers into a dictionary
    hdrdict = dict()
    for line in hdr.split('\n'):
        # [('From', '"Glenn R. Golden" <ggolden@umich.edu>')]
        y = header_line.findall(line)
        if len(y) != 1 : continue
        tup = y[0]
        if len(tup) != 2 : continue
        key = tup[0].lower()
        value = tup[1].lower()
        hdrdict[key] = value

    # Override the date field
    hdrdict['date'] = sent_at

    # Reset the fail counter
    fail = 0

    doc = {'offset': start, 'sender': email, 'headers' : hdrdict, 'body': body}
    res = es.index(index=indexname, id=str(start), body=doc)

    print(' ',start, email, sent_at)
    print('Added document...')
    print(res['result'])

    # Pause briefly after every 100 retrieved messages
    if count % 100 == 0 : time.sleep(1)

View File

@ -21,8 +21,8 @@ def elastic() :
"prefix" : "elasticsearch",
"port": 443,
"scheme": "https",
"user": "pg4e_86f9be92a2",
"pass": "2008_9d454b1f"}
"user": "pg4e_4cb8bb5508",
"pass": "2404_60cfd956"}
def readonly():
return {"host": "pg.pg4e.com",

View File

@ -0,0 +1,164 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from elasticsearch import Elasticsearch\n",
"from elasticsearch import RequestsHttpConnection\n",
"import time\n",
"import copy\n",
"import hidden\n",
"import uuid\n",
"import json\n",
"import hashlib\n",
"import requests\n",
"import json\n",
"import hidden\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"secrets = hidden.elastic()\n",
"\n",
"es = Elasticsearch(\n",
" [ secrets['host'] ],\n",
" http_auth=(secrets['user'], secrets['pass']),\n",
" url_prefix = secrets['prefix'],\n",
" scheme=secrets['scheme'],\n",
" port=secrets['port'],\n",
" connection_class=RequestsHttpConnection,\n",
")\n",
"indexname = secrets[\"user\"]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dropped index\n",
"{'acknowledged': True}\n"
]
}
],
"source": [
"res = es.indices.delete(index=indexname, ignore=[400, 404])\n",
"print(\"Dropped index\")\n",
"print(res)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Created the index...\n",
"{'acknowledged': True, 'shards_acknowledged': True, 'index': 'pg4e_4cb8bb5508'}\n"
]
}
],
"source": [
"res = es.indices.create(index=indexname)\n",
"print(\"Created the index...\")\n",
"print(res)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"tweet = '''building on your left Your friend is very late and calls you to\n",
"tell you that they are on a farm and walking around behind a barn\n",
"with no sign of a restaurant Then you say did you turn left or\n",
"right at the gas station and they say I followed your directions\n",
"perfectly I have them written down it says turn left and go one'''\n",
"\n",
"doc = {\n",
" \"author\": \"kimchy\",\n",
" \"type\": \"tweet\",\n",
" \"text\": tweet,\n",
" \"timestamp\": datetime.now(),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Added document...\n",
"updated\n"
]
}
],
"source": [
"res = es.index(index=indexname, id=\"abc\", body=doc)\n",
"print(\"Added document...\")\n",
"print(res[\"result\"])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Retrieved document...\n",
"{'_index': 'pg4e_4cb8bb5508', '_type': '_doc', '_id': 'abc', '_version': 2, '_seq_no': 1, '_primary_term': 1, 'found': True, '_source': {'author': 'kimchy', 'type': 'tweet', 'text': 'building on your left Your friend is very late and calls you to\\ntell you that they are on a farm and walking around behind a barn\\nwith no sign of a restaurant Then you say did you turn left or\\nright at the gas station and they say I followed your directions\\nperfectly I have them written down it says turn left and go one', 'timestamp': '2024-01-18T21:42:09.213160'}}\n"
]
}
],
"source": [
"res = es.get(index=indexname, id=\"abc\")\n",
"print(\"Retrieved document...\")\n",
"print(res)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "sql",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}