elastic
This commit is contained in:
parent
51d68caa88
commit
75f6a2ac8e
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,39 @@
|
|||
from datetime import datetime
|
||||
# https://www.pg4e.com/code/datecompat.py
|
||||
|
||||
# Non-dateutil version - we try our best
|
||||
def parsemaildate(md):
    """Parse an RFC-2822-style mail date (weekday already stripped) to ISO-8601 text.

    Expects something like '11 Jan 2008 14:21:26 -0500'.  The first four
    whitespace-separated pieces are treated as the date/time; an optional
    fifth piece is a numeric timezone offset.

    Returns the ISO string (e.g. '2008-01-11T14:21:26-05:00'), or None when
    the date portion cannot be parsed.  A missing or non-numeric timezone
    falls back to '+0000' (appended without a colon, matching historic output).
    """
    pieces = md.split()
    notz = " ".join(pieces[:4]).strip()

    # Try a bunch of format variations - strptime() is *lame*.
    # (Duplicated entries in the original list were removed: a format that
    # failed once can never match on a retry.)
    dnotz = None
    for form in ['%d %b %Y %H:%M:%S', '%d %b %Y %H:%M',
                 '%d %b %y %H:%M:%S', '%d %b %y %H:%M']:
        try:
            dnotz = datetime.strptime(notz, form)
            break
        except ValueError:
            continue

    if dnotz is None:
        # print 'Bad Date:',md
        return None

    iso = dnotz.isoformat()

    tz = "+0000"
    try:
        tz = pieces[4]
        int(tz)  # Only want numeric timezone values
        if tz == '-0000':
            tz = '+0000'
        tzh = tz[:3]
        tzm = tz[3:]
        tz = tzh + ":" + tzm
    except (IndexError, ValueError):
        # Bug fix: the original `except: pass` left a non-numeric token
        # (e.g. 'EST') in tz after the rebind above, so 'EST' got appended
        # to the ISO string.  Reset to the intended default instead.
        tz = "+0000"

    return iso + tz
|
||||
|
|
@ -0,0 +1,110 @@
|
|||
|
||||
# https://www.pg4e.com/code/elasticbook.py
|
||||
|
||||
# Download a book
|
||||
# wget http://www.gutenberg.org/cache/epub/18866/pg18866.txt
|
||||
# wget http://www.gutenberg.org/cache/epub/14091/pg14091.txt
|
||||
# wget https://www.gutenberg.org/files/2591/2591-0.txt
|
||||
# wget https://www.gutenberg.org/files/11/11-0.txt
|
||||
|
||||
# (If needed)
|
||||
# https://www.pg4e.com/code/hidden-dist.py
|
||||
# copy hidden-dist.py to hidden.py
|
||||
# edit hidden.py and put in your credentials
|
||||
|
||||
# python3 elasticbook.py
|
||||
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch import RequestsHttpConnection
|
||||
import time
|
||||
import copy
|
||||
import hidden
|
||||
import uuid
|
||||
import json
|
||||
import hashlib
|
||||
|
||||
# Load a Project Gutenberg text into Elasticsearch, one document per paragraph.
# Prompt for the local file name; an empty answer aborts immediately.
bookfile = input("Enter book file (i.e. pg18866.txt): ")
if bookfile.strip() == '':
    raise Exception("empty string detected, please try again to enter a book file")

# Make sure we can open the file (fails fast before touching the index)
fhand = open(bookfile)

# Load the secrets
secrets = hidden.elastic()

es = Elasticsearch(
    [ secrets['host'] ],
    http_auth=(secrets['user'], secrets['pass']),
    url_prefix = secrets['prefix'],
    scheme=secrets['scheme'],
    port=secrets['port'],
    connection_class=RequestsHttpConnection,
)

# set indexname equal to elasticsearch username
indexname = secrets['user']

# Start fresh: drop any previous index, ignoring "not found" / "bad request"
# https://elasticsearch-py.readthedocs.io/en/master/api.html#indices
res = es.indices.delete(index=indexname, ignore=[400, 404])
print("Dropped index", indexname)
print(res)

res = es.indices.create(index=indexname)
print("Created the index...")
print(res)

# Accumulate lines into paragraphs; a blank line ends the current paragraph.
para = ''    # paragraph text accumulated so far
chars = 0    # total characters read (after strip)
count = 0    # total lines read
pcount = 0   # paragraphs indexed so far
for line in fhand:
    count = count + 1
    line = line.strip()
    chars = chars + len(line)
    # Skip runs of blank lines between paragraphs
    if line == '' and para == '' : continue
    if line == '' :
        # Blank line after accumulated text: index the finished paragraph
        pcount = pcount + 1
        doc = {
            'offset' : pcount,
            'content': para
        }

        # Use the paragraph count as primary key
        # pkey = pcount

        # Use a GUID for the primary key
        # pkey = uuid.uuid4()

        # Compute a SHA256 of the entire document as the primary key.
        # Because the pkey is a based on the document contents
        # the "index" is in effect INSERT ON CONFLICT UPDATE unless
        # the document contents change
        m = hashlib.sha256()
        m.update(json.dumps(doc).encode())
        pkey = m.hexdigest()

        res = es.index(index=indexname, id=pkey, body=doc)

        print('Added document', pkey)
        # print(res['result'])

        # Pause briefly every 100 paragraphs to be gentle on the server
        if pcount % 100 == 0 :
            print(pcount, 'loaded...')
            time.sleep(1)

        para = ''
        continue

    # Non-blank line: append it to the paragraph being accumulated.
    # NOTE(review): a final paragraph not followed by a blank line is
    # never indexed - confirm the input files end with a blank line.
    para = para + ' ' + line

# Tell it to recompute the index
res = es.indices.refresh(index=indexname)
print("Index refreshed", indexname)
print(res)

print(' ')
print('Loaded',pcount,'paragraphs',count,'lines',chars,'characters')
|
||||
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
|
||||
# https://www.pg4e.com/code/elasticmail.py
|
||||
|
||||
# https://www.pg4e.com/code/hidden-dist.py
|
||||
# copy hidden-dist.py to hidden.py (if necessary)
|
||||
# edit hidden.py and put in your credentials
|
||||
|
||||
# http://mbox.dr-chuck.net/sakai.devel/100/101
|
||||
|
||||
# python3 elasticmail.py
|
||||
# Pulls data from the web and puts it into index
|
||||
|
||||
import requests
|
||||
import re
|
||||
import hidden
|
||||
import datecompat
|
||||
import time
|
||||
import json
|
||||
import copy
|
||||
import hidden
|
||||
|
||||
import dateutil.parser as parser # If this import fails - just comment it out
|
||||
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch import RequestsHttpConnection
|
||||
|
||||
def parsemaildate(md):
    """Parse a mail Date header value into an ISO-8601 string.

    Tries the flexible dateutil parser first; if dateutil is unavailable
    or the value does not parse, falls back to the best-effort
    datecompat.parsemaildate() (which may return None).
    """
    try:
        # Bug fix: the original called parser.parse(tdate), silently reading
        # the *global* tdate from the crawl loop instead of the md argument.
        pdate = parser.parse(md)
        return pdate.isoformat()
    except Exception:
        # dateutil import may have been commented out, or the value is odd;
        # use the strptime-based fallback.
        return datecompat.parsemaildate(md)
|
||||
|
||||
# Crawl the sakai.devel mbox archive message-by-message and index each
# message into Elasticsearch (one document per message, keyed by offset).
secrets = hidden.elastic()

# Connect to our database
es = Elasticsearch(
    [ secrets['host'] ],
    http_auth=(secrets['user'], secrets['pass']),
    url_prefix = secrets['prefix'],
    scheme=secrets['scheme'],
    port=secrets['port'],
    connection_class=RequestsHttpConnection,
)

# In our test world - we only get one index :(
indexname = secrets['user']

# Start fresh
# https://elasticsearch-py.readthedocs.io/en/master/api.html#indices
res = es.indices.delete(index=indexname, ignore=[400, 404])
print("Dropped index")
print(res)

res = es.indices.create(index=indexname)
print("Created the index...")
print(res)

baseurl = 'http://mbox.dr-chuck.net/sakai.devel/'

many = 0    # messages remaining in the current user-requested batch
count = 0   # messages successfully retrieved
fail = 0    # consecutive failures; > 5 aborts the run
start = 0   # current message offset in the archive
while True:
    # When the current batch is exhausted, ask for the next batch size;
    # an empty answer ends the program.
    if ( many < 1 ) :
        sval = input('How many messages:')
        if ( len(sval) < 1 ) : break
        many = int(sval)

    start = start + 1

    many = many - 1
    # URL form: <base>/<offset>/<offset+1> returns a single raw message
    url = baseurl + str(start) + '/' + str(start + 1)

    text = 'None'
    try:
        # Open with a timeout of 30 seconds
        response = requests.get(url)
        text = response.text
        status = response.status_code
        if status != 200 :
            print('Error code=',status, url)
            break
    except KeyboardInterrupt:
        print('')
        print('Program interrupted by user...')
        break
    except Exception as e:
        print('Unable to retrieve or parse page',url)
        print('Error',e)
        fail = fail + 1
        if fail > 5 : break
        continue

    print(url,len(text))
    count = count + 1

    # Sanity check: a raw mbox message must start with the 'From ' envelope
    if not text.startswith('From '):
        print(text)
        print('Did not find From ')
        fail = fail + 1
        if fail > 5 : break
        continue

    # Split headers from body at the first blank line
    pos = text.find('\n\n')
    if pos > 0 :
        hdr = text[:pos]
        body = text[pos+2:]
    else:
        print(text)
        print('Could not find break between headers and body')
        fail = fail + 1
        if fail > 5 : break
        continue

    # Extract the sender address. Accept with or without < >
    email = None
    x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr)
    if len(x) == 1 :
        email = x[0]
        email = email.strip().lower()
        email = email.replace('<','')
    else:
        x = re.findall('\nFrom: (\S+@\S+)\n', hdr)
        if len(x) == 1 :
            email = x[0]
            email = email.strip().lower()
            email = email.replace('<','')

    # Hack the date: take the part after 'Weekday, ', truncate to 26 chars,
    # then parse it to ISO text (None stays None if the header is absent)
    sent_at = None
    y = re.findall('\nDate: .*, (.*)\n', hdr)
    if len(y) == 1 :
        tdate = y[0]
        tdate = tdate[:26]
        try:
            sent_at = parsemaildate(tdate)
        except:
            print(text)
            print('Parse fail',tdate)
            fail = fail + 1
            if fail > 5 : break
            continue

    # Make the headers into a dictionary (lower-cased keys and values)
    hdrlines = hdr.split('\n')
    hdrdict = dict()
    for line in hdrlines:
        # [('From', '"Glenn R. Golden" <ggolden@umich.edu>')]
        y = re.findall('([^ :]*): (.*)$', line)
        if len(y) != 1 : continue
        tup = y[0]
        if len(tup) != 2 : continue
        # print(tup)
        key = tup[0].lower()
        value = tup[1].lower()
        hdrdict[key] = value

    # Override the date field with the normalized ISO value
    hdrdict['date'] = sent_at

    # Reset the fail counter - only *consecutive* failures abort
    fail = 0
    doc = {'offset': start, 'sender': email, 'headers' : hdrdict, 'body': body}
    res = es.index(index=indexname, id=str(start), body=doc)
    print(' ',start, email, sent_at)

    print('Added document...')
    print(res['result'])

    # Pause briefly every 100 messages to be gentle on the server
    if count % 100 == 0 : time.sleep(1)
|
||||
|
|
@ -21,8 +21,8 @@ def elastic() :
|
|||
"prefix" : "elasticsearch",
|
||||
"port": 443,
|
||||
"scheme": "https",
|
||||
"user": "pg4e_86f9be92a2",
|
||||
"pass": "2008_9d454b1f"}
|
||||
"user": "pg4e_4cb8bb5508",
|
||||
"pass": "2404_60cfd956"}
|
||||
|
||||
def readonly():
|
||||
return {"host": "pg.pg4e.com",
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,164 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from elasticsearch import Elasticsearch\n",
|
||||
"from elasticsearch import RequestsHttpConnection\n",
|
||||
"import time\n",
|
||||
"import copy\n",
|
||||
"import hidden\n",
|
||||
"import uuid\n",
|
||||
"import json\n",
|
||||
"import hashlib\n",
|
||||
"import requests\n",
|
||||
"import json\n",
|
||||
"import hidden\n",
|
||||
"from datetime import datetime"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"secrets = hidden.elastic()\n",
|
||||
"\n",
|
||||
"es = Elasticsearch(\n",
|
||||
" [ secrets['host'] ],\n",
|
||||
" http_auth=(secrets['user'], secrets['pass']),\n",
|
||||
" url_prefix = secrets['prefix'],\n",
|
||||
" scheme=secrets['scheme'],\n",
|
||||
" port=secrets['port'],\n",
|
||||
" connection_class=RequestsHttpConnection,\n",
|
||||
")\n",
|
||||
"indexname = secrets[\"user\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Dropped index\n",
|
||||
"{'acknowledged': True}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"res = es.indices.delete(index=indexname, ignore=[400, 404])\n",
|
||||
"print(\"Dropped index\")\n",
|
||||
"print(res)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Created the index...\n",
|
||||
"{'acknowledged': True, 'shards_acknowledged': True, 'index': 'pg4e_4cb8bb5508'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"res = es.indices.create(index=indexname)\n",
|
||||
"print(\"Created the index...\")\n",
|
||||
"print(res)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tweet = '''building on your left Your friend is very late and calls you to\n",
|
||||
"tell you that they are on a farm and walking around behind a barn\n",
|
||||
"with no sign of a restaurant Then you say did you turn left or\n",
|
||||
"right at the gas station and they say I followed your directions\n",
|
||||
"perfectly I have them written down it says turn left and go one'''\n",
|
||||
"\n",
|
||||
"doc = {\n",
|
||||
" \"author\": \"kimchy\",\n",
|
||||
" \"type\": \"tweet\",\n",
|
||||
" \"text\": tweet,\n",
|
||||
" \"timestamp\": datetime.now(),\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Added document...\n",
|
||||
"updated\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"res = es.index(index=indexname, id=\"abc\", body=doc)\n",
|
||||
"print(\"Added document...\")\n",
|
||||
"print(res[\"result\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Retrieved document...\n",
|
||||
"{'_index': 'pg4e_4cb8bb5508', '_type': '_doc', '_id': 'abc', '_version': 2, '_seq_no': 1, '_primary_term': 1, 'found': True, '_source': {'author': 'kimchy', 'type': 'tweet', 'text': 'building on your left Your friend is very late and calls you to\\ntell you that they are on a farm and walking around behind a barn\\nwith no sign of a restaurant Then you say did you turn left or\\nright at the gas station and they say I followed your directions\\nperfectly I have them written down it says turn left and go one', 'timestamp': '2024-01-18T21:42:09.213160'}}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"res = es.get(index=indexname, id=\"abc\")\n",
|
||||
"print(\"Retrieved document...\")\n",
|
||||
"print(res)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "sql",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Loading…
Reference in New Issue