diff --git a/Courses/PostgreSQL-for-Everybody-Specialization/Database Architecture, Scale, and NoSQL with Elasticsearch/elastictool.py b/Courses/PostgreSQL-for-Everybody-Specialization/Database Architecture, Scale, and NoSQL with Elasticsearch/elastictool.py
new file mode 100644
index 0000000..e391459
--- /dev/null
+++ b/Courses/PostgreSQL-for-Everybody-Specialization/Database Architecture, Scale, and NoSQL with Elasticsearch/elastictool.py
@@ -0,0 +1,162 @@
+# Some Python utility code for elasticsearch.
+# uses the requests library (low level) rather than the Python elasticsearch wrapper
+
+# https://www.pg4e.com/code/elastictool.py
+
+# (If needed)
+# https://www.pg4e.com/code/hidden-dist.py
+# copy hidden-dist.py to hidden.py
+# edit hidden.py and put in your credentials
+
+import requests
+import json
+import hidden
+
+import warnings
+
+# Several requests below pass verify=False (TLS certificate checking off);
+# silence the warning that would otherwise be emitted for each of them.
+warnings.filterwarnings("ignore", message="Unverified HTTPS request")
+
+secrets = hidden.elastic()
+
+
+# from lesson in course
+queryurl = "http://pg4e_86f9:*@es.py4e.com:9210/prefx/testindex/_search?pretty"
+
+body = json.dumps({"query": {"match_all": {}}})  # match everything
+
+hdict = {"Content-type": "application/json; charset=UTF-8"}
+
+# The lesson URL appears to carry a masked password ("*"), so this request may
+# well fail; report the failure instead of aborting before the prompt starts.
+try:
+    response = requests.post(queryurl, headers=hdict, data=body)
+    text = response.text
+
+    status = response.status_code
+    js = json.loads(text)
+except (requests.RequestException, ValueError) as err:
+    # ValueError covers a non-JSON response body (json.JSONDecodeError).
+    print("Lesson request failed:", err)
+    js = None
+
+# Status codes:
+# 200: OK
+# 404: not found
+# 500: error at server side
+
+
+# end from lesson in course
+
+# Base URL of this user's index:
+# scheme://user:pass@host:port[/prefix]/user
+url = "http://"
+if secrets["scheme"] == "https":
+    url = "https://"
+url = (
+    url
+    + secrets["user"]
+    + ":"
+    + secrets["pass"]
+    + "@"
+    + secrets["host"]
+    + ":"
+    + str(secrets["port"])
+)
+if secrets.get("prefix"):
+    url = url + "/" + secrets["prefix"]
+url = url + "/" + secrets["user"]
+
+# Interactive loop: read a command, run the matching request, print the result.
+while True:
+    print()
+    try:
+        cmd = input("Enter command: ").strip()
+    except (EOFError, KeyboardInterrupt):
+        # End of input or Ctrl-C at the prompt ends the session cleanly.
+        print()
+        break
+
+    if cmd.startswith("quit"):
+        break
+
+    pieces = cmd.split()
+
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-delete-index.html
+    if len(pieces) == 1 and pieces[0] == "delete":
+        # Never echo the real password; print a masked copy of the URL.
+        prurl = url.replace(secrets["pass"], "*****")
+        print(prurl)
+        response = requests.delete(url)
+        text = response.text
+        status = response.status_code
+        print("Status:", status)
+        print(text)
+        continue
+
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-all-query.html
+    if len(pieces) == 1 and pieces[0] == "match_all":
+        queryurl = url + "/_search"
+        prurl = queryurl.replace(secrets["pass"], "*****")
+        print(prurl)
+
+        body = json.dumps({"query": {"match_all": {}}})
+
+        hdict = {"Content-type": "application/json; charset=UTF-8"}
+        response = requests.post(
+            queryurl, verify=False, headers=hdict, data=body
+        )
+        text = response.text
+        status = response.status_code
+        print(status)
+        print(text)
+        continue
+
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-get.html
+    if len(pieces) == 2 and pieces[0] == "get":
+        queryurl = url + "/_doc/" + pieces[1] + "?pretty"
+        prurl = queryurl.replace(secrets["pass"], "*****")
+        print(prurl)
+
+        response = requests.get(queryurl, verify=False)
+        text = response.text
+        status = response.status_code
+        print(status)
+        print(text)
+        continue
+
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
+    if len(pieces) == 2 and pieces[0] == "search":
+        queryurl = url + "/_search?pretty"
+        prurl = queryurl.replace(secrets["pass"], "*****")
+        print(prurl)
+
+        body = json.dumps({"query": {"query_string": {"query": pieces[1]}}})
+
+        # {"query": {"query_string": { "query": search, "default_field": "content" }}}
+        print(body)
+
+        hdict = {"Content-type": "application/json; charset=UTF-8"}
+        response = requests.post(
+            queryurl, verify=False, headers=hdict, data=body
+        )
+        text = response.text
+        status = response.status_code
+        if status == 200:
+            print(status)
+            print(json.dumps(json.loads(text), indent=2))
+        else:
+            print(text)
+            print()
+            print("Error, status=", status)
+        continue
+
+    print()
+    print("Invalid command, please try:")
+    print("")
+    print(" quit")
+    print(" get id")
+    print(" search string")
+    print(" match_all")
+    print(" delete")
diff --git a/Courses/PostgreSQL-for-Everybody-Specialization/Database Architecture, Scale, and NoSQL with Elasticsearch/elastictweet.py b/Courses/PostgreSQL-for-Everybody-Specialization/Database Architecture, Scale, and NoSQL with Elasticsearch/elastictweet.py
new file mode 100644
index 0000000..a5bd04a
--- /dev/null
+++ b/Courses/PostgreSQL-for-Everybody-Specialization/Database Architecture, Scale, and NoSQL with Elasticsearch/elastictweet.py
@@ -0,0 +1,82 @@
+# https://www.pg4e.com/code/elastictweet.py
+
+# Example from:
+# https://elasticsearch-py.readthedocs.io/en/master/
+
+# pip install 'elasticsearch<7.14.0'
+
+# (If needed)
+# https://www.pg4e.com/code/hidden-dist.py
+# copy hidden-dist.py to hidden.py
+# edit hidden.py and put in your credentials
+
+from datetime import datetime
+from elasticsearch import Elasticsearch
+from elasticsearch import RequestsHttpConnection
+
+import hidden
+
+secrets = hidden.elastic()
+
+es = Elasticsearch(
+    [secrets["host"]],
+    http_auth=(secrets["user"], secrets["pass"]),
+    url_prefix=secrets["prefix"],
+    scheme=secrets["scheme"],
+    port=secrets["port"],
+    connection_class=RequestsHttpConnection,
+)
+indexname = secrets["user"]
+
+# Start fresh
+# https://elasticsearch-py.readthedocs.io/en/master/api.html#indices
+res = es.indices.delete(index=indexname, ignore=[400, 404])
+print("Dropped index")
+print(res)
+
+res = es.indices.create(index=indexname)
+print("Created the index...")
+print(res)
+
+doc = {
+    "author": "kimchy",
+    "type": "tweet",
+    "text": "Elasticsearch: cool. 
bonsai cool.",
+    "timestamp": datetime.now(),
+}
+
+# Note - you can't change the key type after you start indexing documents
+res = es.index(index=indexname, id="abc", body=doc)
+print("Added document...")
+print(res["result"])
+
+res = es.get(index=indexname, id="abc")
+print("Retrieved document...")
+print(res)
+
+# Tell it to recompute the index - normally it would take up to 30 seconds
+# Refresh can be costly - we do it here for demo purposes
+# https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-refresh.html
+res = es.indices.refresh(index=indexname)
+print("Index refreshed")
+print(res)
+
+# Read the documents with a search term
+# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-filter-context.html
+x = {
+    "query": {
+        "bool": {
+            "must": {"match": {"text": "bonsai"}},
+            "filter": {"match": {"type": "tweet"}},
+        }
+    }
+}
+
+res = es.search(index=indexname, body=x)
+print("Search results...")
+print(res)
+print()
+print("Got %d Hits:" % len(res["hits"]["hits"]))
+for hit in res["hits"]["hits"]:
+    s = hit["_source"]
+    print(f"{s['timestamp']} {s['author']}: {s['text']}")
diff --git a/Courses/PostgreSQL-for-Everybody-Specialization/Database Architecture, Scale, and NoSQL with Elasticsearch/hidden.py b/Courses/PostgreSQL-for-Everybody-Specialization/Database Architecture, Scale, and NoSQL with Elasticsearch/hidden.py
new file mode 100644
index 0000000..410bdf8
--- /dev/null
+++ b/Courses/PostgreSQL-for-Everybody-Specialization/Database Architecture, Scale, and NoSQL with Elasticsearch/hidden.py
@@ -0,0 +1,76 @@
+# Keep this file separate
+
+# NOTE(review): this file holds real-looking credentials; it should stay out of
+# version control and the passwords below should be rotated if they are live.
+
+# https://www.pg4e.com/code/hidden-dist.py
+
+# psql -h pg.pg4e.com -p 5432 -U pg4e_be9e729093 pg4e_be9e729093
+
+# %load_ext sql
+# %config SqlMagic.autocommit=False
+# %sql postgresql://pg4e_be9e729093:pg4e_p_d5fab7440699124@pg.pg4e.com:5432/pg4e_be9e729093
+# %sql SELECT 1 as "Test"
+
+def secrets():
+    """Return the PostgreSQL connection settings as a dict."""
+    return {"host": "pg.pg4e.com",
+            "port": 5432,
+            "database": "pg4e_be9e729093",
+            "user": "pg4e_be9e729093",
+            "pass": "pg4e_p_d5fab7440699124"}
+
+def elastic() :
+    """Return the Elasticsearch connection settings as a dict."""
+    return {"host": "www.pg4e.com",
+            "prefix" : "elasticsearch",
+            "port": 443,
+            "scheme": "https",
+            "user": "pg4e_86f9be92a2",
+            "pass": "2008_9d454b1f"}
+
+def readonly():
+    """Return the settings for the "readonly" PostgreSQL database and user."""
+    return {"host": "pg.pg4e.com",
+            "port": 5432,
+            "database": "readonly",
+            "user": "readonly",
+            "pass": "readonly_password"}
+
+# Return a psycopg2 connection string
+
+# import hidden
+# secrets = hidden.readonly()
+# sql_string = hidden.psycopg2(hidden.readonly())
+
+# 'dbname=pg4e_data user=pg4e_data_read password=pg4e_p_d5fab7440699124 host=pg.pg4e.com port=5432'
+
+def psycopg2(secrets) :
+    """Build a "key=value" connection string from a settings dict.
+
+    The dict must provide "database", "user", "pass", "host" and "port".
+    """
+    return ('dbname='+secrets['database']+' user='+secrets['user']+
+            ' password='+secrets['pass']+' host='+secrets['host']+
+            ' port='+str(secrets['port']))
+
+# Return an SQLAlchemy string
+
+# import hidden
+# secrets = hidden.readonly()
+# sql_string = hidden.alchemy(hidden.readonly())
+
+# postgresql://pg4e_data_read:pg4e_p_d5fab7440699124@pg.pg4e.com:5432/pg4e_data
+
+def alchemy(secrets) :
+    """Build a postgresql:// URL from a settings dict.
+
+    The user and password are percent-encoded so characters such as "@",
+    ":" or "/" in them cannot be mistaken for URL delimiters.
+    """
+    from urllib.parse import quote_plus
+
+    return ('postgresql://'+quote_plus(secrets['user'])+':'+
+            quote_plus(secrets['pass'])+'@'+secrets['host']+
+            ':'+str(secrets['port'])+'/'+secrets['database'])
+
diff --git a/Databases/ElasticSearch/General.md b/Databases/ElasticSearch/General.md
new file mode 100644
index 0000000..3ab2cc1
--- /dev/null
+++ b/Databases/ElasticSearch/General.md
@@ -0,0 +1,37 @@
+# Application: ELK Stack
+
+- Elasticsearch - distributed NoSQL database
+- Logstash - ingests streams of activity data
+- Kibana - Visualisation / Dashboard
+
+# Fundamentals concepts
+[Source: architecture](https://codersite.dev/hot-warm-architecture-elasticsearch/)
+
+The act of storing data in Elasticsearch is called **indexing**.
+
+An index is a collection of documents and each document is a collection of fields, which are the **key-value pairs** that contain your data. Every index has some properties like mappings, settings, and aliases. 
+ +In Elasticsearch, a document belongs to a type, and those types live inside an index. We can draw a parallel to a traditional relational database: + +Relational DB ⇒ Databases ⇒ Tables ⇒ Rows ⇒ Columns +Elasticsearch ⇒ Indices ⇒ Types ⇒ Documents ⇒ Fields + +In Elasticsearch, the term **document** has a specific meaning. It refers to the **top-level**, or root, object that is serialized into JSON and stored in Elasticsearch under a unique ID. + +Elasticsearch lets you insert documents without a predefined schema (in RDBMS you need to define tables in advance). + +## Inverted index + +Relational databases add an index, such as a B-tree index, to specific columns in order to improve the speed of data retrieval. Elasticsearch uses a structure called an **inverted index** for exactly the same purpose. + +By default, **every field in a document is indexed** (has an inverted index) and thus is searchable – **FullText search**. A field without an inverted index is not searchable. + +An inverted index consists of a list of all the unique words that appear in any document, and for each word, a list of the documents in which it appears. 
+ +# Summary + +- Elasticsearch gives us Google-like features + - Scalable ingest / data size / search performance + - Accessible through a "REST API" +- Can be used as a full-text "search engine" +- Can be used as a scalable NoSQL database \ No newline at end of file diff --git a/Databases/ElasticSearch/examples.md b/Databases/ElasticSearch/examples.md index 94521ef..21b139a 100644 --- a/Databases/ElasticSearch/examples.md +++ b/Databases/ElasticSearch/examples.md @@ -24,6 +24,14 @@ GET /items/_search } } +GET /_search +{ + "query": { + "match_all": {} + } +} + + GET /shakespeare/_search GET /shakespeare/_search diff --git a/Databases/ Hive/Select.md b/Databases/Hive/Select.md similarity index 100% rename from Databases/ Hive/Select.md rename to Databases/Hive/Select.md diff --git a/Databases/ Hive/Timestamp.md b/Databases/Hive/Timestamp.md similarity index 100% rename from Databases/ Hive/Timestamp.md rename to Databases/Hive/Timestamp.md