This commit is contained in:
John 2024-01-17 21:20:22 +01:00
parent f187d49780
commit 51d68caa88
7 changed files with 333 additions and 0 deletions

View File

@ -0,0 +1,148 @@
# Some Python utility code for elasticsearch.
# uses the requests library (low level) rather than the Python elasticsearch wrapper
# https://www.pg4e.com/code/elastictool.py
# (If needed)
# https://www.pg4e.com/code/hidden-dist.py
# copy hidden-dist.py to hidden.py
# edit hidden.py and put in your credentials
import requests
import json
import hidden
import warnings
# Ignore warnings whose message starts with "Unverified HTTPS request";
# several requests further down are sent with verify=False.
warnings.filterwarnings("ignore", message="Unverified HTTPS request")

# Connection settings for Elasticsearch, read from hidden.py. The code below
# uses the keys "scheme", "user", "pass", "host", "port" and (optionally)
# "prefix".
secrets = hidden.elastic()

# from lesson in course
# NOTE(review): this sample request is sent once at start-up, before the
# command loop, and its URL carries "*" where the password would go -
# confirm it is still wanted here.
queryurl = "http://pg4e_86f9:*@es.py4e.com:9210/prefx/testindex/_search?pretty"
# The Query DSL key is "match_all" (with an underscore), the same spelling the
# match_all command further down uses; "match all" is not a query type.
body = json.dumps({"query": {"match_all": {}}})  # match everything
hdict = {"Content-type": "application/json; charset=UTF-8"}
response = requests.post(queryurl, headers=hdict, data=body)
text = response.text
status = response.status_code
js = json.loads(text)
# Status codes:
# 200: OK
# 404: not found
# 500: error at server side
# end from lesson in course
# Assemble the base URL used by every command below:
#   scheme://user:pass@host:port[/prefix]/user
url = "https://" if secrets["scheme"] == "https" else "http://"
url = "".join(
    [
        url,
        secrets["user"],
        ":",
        secrets["pass"],
        "@",
        secrets["host"],
        ":",
        str(secrets["port"]),
    ]
)
# The path prefix is optional: it is skipped when missing or empty.
if secrets.get("prefix"):
    url = "/".join([url, secrets["prefix"]])
# The final path segment is the user name.
url = "/".join([url, secrets["user"]])
# Interactive command loop: read one command per iteration, send the matching
# request to Elasticsearch and print the status code and response text.
# Every URL is printed with the password replaced by "*****".
while True:
    print()
    try:
        cmd = input("Enter command: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input (Ctrl-D) or Ctrl-C at the prompt ends the session.
        # Only these two are caught, so unrelated errors are no longer
        # silently swallowed by a bare "except:".
        print()
        break

    # Any input that begins with "quit" leaves the loop.
    if cmd.startswith("quit"):
        break

    pieces = cmd.split()

    # delete: send DELETE to the base URL
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-delete-index.html
    if len(pieces) == 1 and pieces[0] == "delete":
        prurl = url.replace(secrets["pass"], "*****")
        print(prurl)
        # NOTE(review): unlike the other requests, this one does not pass
        # verify=False - confirm that is intended.
        response = requests.delete(url)
        text = response.text
        status = response.status_code
        print("Status:", status)
        print(text)
        continue

    # match_all: POST a match_all query to <url>/_search
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-all-query.html
    if len(pieces) == 1 and pieces[0] == "match_all":
        queryurl = url + "/_search"
        prurl = queryurl.replace(secrets["pass"], "*****")
        print(prurl)
        body = json.dumps({"query": {"match_all": {}}})
        hdict = {"Content-type": "application/json; charset=UTF-8"}
        response = requests.post(
            queryurl, verify=False, headers=hdict, data=body
        )
        text = response.text
        status = response.status_code
        print(status)
        print(text)
        continue

    # get <id>: GET <url>/_doc/<id>
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-get.html
    if len(pieces) == 2 and pieces[0] == "get":
        queryurl = url + "/_doc/" + pieces[1] + "?pretty"
        prurl = queryurl.replace(secrets["pass"], "*****")
        print(prurl)
        response = requests.get(queryurl, verify=False)
        text = response.text
        status = response.status_code
        print(status)
        print(text)
        continue

    # search <string>: POST a query_string query to <url>/_search
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
    if len(pieces) == 2 and pieces[0] == "search":
        queryurl = url + "/_search?pretty"
        prurl = queryurl.replace(secrets["pass"], "*****")
        print(prurl)
        body = json.dumps({"query": {"query_string": {"query": pieces[1]}}})
        # {"query": {"query_string": { "query": search, "default_field": "content" }}}
        print(body)
        hdict = {"Content-type": "application/json; charset=UTF-8"}
        response = requests.post(
            queryurl, verify=False, headers=hdict, data=body
        )
        text = response.text
        status = response.status_code
        if status == 200:
            # On success the JSON response is re-serialised with a
            # two-space indent before printing.
            print(status)
            print(json.dumps(json.loads(text), indent=2))
        else:
            print(text)
            print()
            print("Error, status=", status)
        continue

    # Nothing matched: show the list of supported commands.
    print()
    print("Invalid command, please try:")
    print("")
    print(" quit")
    print(" get id")
    print(" search string")
    print(" match_all")
    print(" delete")

View File

@ -0,0 +1,82 @@
# https://www.pg4e.com/code/elastictweet.py
# Example from:
# https://elasticsearch-py.readthedocs.io/en/master/
# pip install 'elasticsearch<7.14.0'
# (If needed)
# https://www.pg4e.com/code/hidden-dist.py
# copy hidden-dist.py to hidden.py
# edit hidden.py and put in your credentials
from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch import RequestsHttpConnection
import hidden
# Connection settings for Elasticsearch are read from hidden.py.
secrets = hidden.elastic()

# Build the client from those settings; credentials, URL prefix, scheme and
# port are all taken from the settings dict, and RequestsHttpConnection is
# passed as the connection class.
hosts = [secrets["host"]]
client_options = {
    "http_auth": (secrets["user"], secrets["pass"]),
    "url_prefix": secrets["prefix"],
    "scheme": secrets["scheme"],
    "port": secrets["port"],
    "connection_class": RequestsHttpConnection,
}
es = Elasticsearch(hosts, **client_options)

# The user name doubles as the index name for every call below.
indexname = secrets["user"]
# Start fresh
# https://elasticsearch-py.readthedocs.io/en/master/api.html#indices
# Delete the index, passing ignore=[400, 404] with the delete call,
# then create it again.
dropped = es.indices.delete(index=indexname, ignore=[400, 404])
print("Dropped index", dropped, sep="\n")

created = es.indices.create(index=indexname)
print("Created the index...", created, sep="\n")
# The sample document stored under the id "abc".
doc = {}
doc["author"] = "kimchy"
doc["type"] = "tweet"
doc["text"] = "Elasticsearch: cool. bonsai cool."
doc["timestamp"] = datetime.now()

# Note - you can't change the key type after you start indexing documents
indexed = es.index(index=indexname, id="abc", body=doc)
print("Added document...", indexed["result"], sep="\n")

# Read the same document back by its id.
fetched = es.get(index=indexname, id="abc")
print("Retrieved document...", fetched, sep="\n")
# Tell it to recompute the index - normally it would take up to 30 seconds
# Refresh can be costly - we do it here for demo purposes
# https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-refresh.html
refreshed = es.indices.refresh(index=indexname)
print("Index refreshed", refreshed, sep="\n")

# Read the documents with a search term
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-filter-context.html
# "must" matches the text against "bonsai"; "filter" restricts the
# results to documents whose type is "tweet".
must_clause = {"match": {"text": "bonsai"}}
filter_clause = {"match": {"type": "tweet"}}
x = {"query": {"bool": {"must": must_clause, "filter": filter_clause}}}

res = es.search(index=indexname, body=x)
print("Search results...", res, sep="\n")
print()

hits = res["hits"]["hits"]
print(f"Got {len(hits):d} Hits:")
for hit in hits:
    source = hit["_source"]
    print(f'{source["timestamp"]} {source["author"]}: {source["text"]}')

View File

@ -0,0 +1,58 @@
# Keep this file separate
# https://www.pg4e.com/code/hidden-dist.py
# psql -h pg.pg4e.com -p 5432 -U pg4e_be9e729093 pg4e_be9e729093
# %load_ext sql
# %config SqlMagic.autocommit=False
# %sql postgresql://pg4e_be9e729093:pg4e_p_d5fab7440699124@pg.pg4e.com:5432/pg4e_be9e729093
# %sql SELECT 1 as "Test"
def secrets():
    """Return the PostgreSQL connection settings as a new dict.

    Keys: ``host``, ``port``, ``database``, ``user`` and ``pass``.
    """
    # NOTE(review): real credentials are stored in this file - keep it out
    # of shared repositories.
    account = "pg4e_be9e729093"  # serves as both database name and user name
    settings = {
        "host": "pg.pg4e.com",
        "port": 5432,
        "database": account,
        "user": account,
        "pass": "pg4e_p_d5fab7440699124",
    }
    return settings
def elastic():
    """Return the Elasticsearch connection settings as a new dict.

    Keys: ``host``, ``prefix``, ``port``, ``scheme``, ``user`` and ``pass``.
    """
    # NOTE(review): real credentials are stored in this file - keep it out
    # of shared repositories.
    settings = {}
    settings["host"] = "www.pg4e.com"
    settings["prefix"] = "elasticsearch"
    settings["port"] = 443
    settings["scheme"] = "https"
    settings["user"] = "pg4e_86f9be92a2"
    settings["pass"] = "2008_9d454b1f"
    return settings
def readonly():
    """Return the connection settings of the ``readonly`` account as a new dict.

    Keys: ``host``, ``port``, ``database``, ``user`` and ``pass``.
    """
    fields = ("host", "port", "database", "user", "pass")
    values = ("pg.pg4e.com", 5432, "readonly", "readonly", "readonly_password")
    return dict(zip(fields, values))
# Return a psycopg2 connection string
# import hidden
# secrets = hidden.readonly()
# sql_string = hidden.psycopg2(hidden.readonly())
# 'dbname=readonly user=readonly password=readonly_password host=pg.pg4e.com port=5432'
def psycopg2(secrets):
    """Return a libpq keyword/value connection string built from *secrets*.

    Args:
        secrets: mapping with the keys ``database``, ``user``, ``pass``,
            ``host`` and ``port`` (for example the result of ``readonly()``).

    Returns:
        A string of the form
        ``dbname=... user=... password=... host=... port=...``.

    Raises:
        KeyError: if one of the required keys is missing.
    """

    def _quote(value):
        # Quote one value for a keyword/value connection string.
        text = str(value)
        needs_quotes = not text or any(
            ch.isspace() or ch in "'\\" for ch in text
        )
        if not needs_quotes:
            # Plain values are emitted unchanged, exactly as before.
            return text
        # Empty values and values containing whitespace, quotes or
        # backslashes must be single-quoted, with backslashes and quotes
        # escaped by a backslash; otherwise the string is split or parsed
        # at the wrong place.
        return "'" + text.replace("\\", "\\\\").replace("'", "\\'") + "'"

    pairs = (
        ("dbname", secrets["database"]),
        ("user", secrets["user"]),
        ("password", secrets["pass"]),
        ("host", secrets["host"]),
        ("port", secrets["port"]),
    )
    return " ".join(keyword + "=" + _quote(value) for keyword, value in pairs)
# Return an SQLAlchemy string
# import hidden
# secrets = hidden.readonly()
# sql_string = hidden.alchemy(hidden.readonly())
# postgresql://readonly:readonly_password@pg.pg4e.com:5432/readonly
def alchemy(secrets):
    """Return a ``postgresql://`` database URL built from *secrets*.

    Args:
        secrets: mapping with the keys ``user``, ``pass``, ``host``,
            ``port`` and ``database`` (for example the result of
            ``readonly()``).

    Returns:
        A string of the form ``postgresql://user:pass@host:port/database``.

    Raises:
        KeyError: if one of the required keys is missing.
    """
    from urllib.parse import quote

    # Percent-encode the credentials: a raw "@", ":", "/" or space in the
    # user name or password would otherwise change how the URL is split.
    # Letters, digits and "_.-~" are left as they are, so ordinary values
    # produce the same URL as before.
    user = quote(str(secrets['user']), safe='')
    password = quote(str(secrets['pass']), safe='')
    return ('postgresql://' + user + ':' + password + '@' + secrets['host'] +
            ':' + str(secrets['port']) + '/' + secrets['database'])

View File

@ -0,0 +1,37 @@
# Application: ELK Stack
- Elasticsearch - distributed NoSQL database
- Logstash - ingests streams of activity data
- Kibana - Visualisation / Dashboard
# Fundamental concepts
[Source: architecture](https://codersite.dev/hot-warm-architecture-elasticsearch/)
The act of storing data in Elasticsearch is called **indexing**.
An index is a collection of documents and each document is a collection of fields, which are the **key-value pairs** that contain your data. Every index has some properties like mappings, settings, and aliases.
In Elasticsearch, a document belongs to a type, and those types live inside an index. We can draw a parallel to a traditional relational database:
Relational DB ⇒ Databases ⇒ Tables ⇒ Rows ⇒ Columns
Elasticsearch ⇒ Indices ⇒ Types ⇒ Documents ⇒ Fields
In Elasticsearch, the term **document** has a specific meaning. It refers to the **top-level**, or root object that is serialized into JSON and stored in Elasticsearch under a unique ID.
Elasticsearch lets you insert documents without a predefined schema (in RDBMS you need to define tables in advance).
## Inverted index
Relational databases add an index, such as a B-tree index, to specific columns in order to improve the speed of data retrieval. Elasticsearch uses a structure called an **inverted index** for exactly the same purpose.
By default, **every field in a document is indexed** (has an inverted index) and is therefore searchable with **full-text search**. A field without an inverted index is not searchable.
An inverted index consists of a list of all the unique words that appear in any document, and for each word, a list of the documents in which it appears.
# Summary
- Elasticsearch gives us Google-like features
- Scalable ingest / data size / search performance
- Accessible through a "REST API"
- Can be used as a full-text "search engine"
- Can be used as a scalable NoSQL database

View File

@ -24,6 +24,14 @@ GET /items/_search
}
}
GET /_search
{
"query": {
"match_all": {}
}
}
GET /shakespeare/_search
GET /shakespeare/_search