# ZeroNet/src/Db/Db.py


import sqlite3
import json
import time
import logging
import re
import os
import atexit
import threading
import sys
import weakref
import errno

import gevent

from Debug import Debug
from .DbCursor import DbCursor
from util import SafeRe
from util import helper
from util import ThreadPool
from Config import config

thread_pool_db = ThreadPool.ThreadPool(config.threads_db)

next_db_id = 0
opened_dbs = []


# Close idle databases to save some memory
def dbCleanup():
    while 1:
        time.sleep(60 * 5)
        for db in opened_dbs[:]:
            idle = time.time() - db.last_query_time
            if idle > 60 * 5 and db.close_idle:
                db.close("Cleanup")
def dbCommitCheck():
    while 1:
        time.sleep(5)
        for db in opened_dbs[:]:
            if not db.need_commit:
                continue

            success = db.commit("Interval")
            if success:
                db.need_commit = False

            time.sleep(0.1)
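
# Close every open database (registered as an atexit handler below)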
def dbCloseAll():
    for db in opened_dbs[:]:
        db.close("Close all")


gevent.spawn(dbCleanup)
gevent.spawn(dbCommitCheck)
atexit.register(dbCloseAll)


class DbTableError(Exception):
    def __init__(self, message, table):
        super().__init__(message)
        self.table = table
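

# Wrapper around a single sqlite3 database file.
# The expected layout of the `schema` dict is inferred from how checkTables()
# and updateJson() consume it below; a minimal sketch:
#   {
#       "db_name": "example",            # used for the logger name
#       "version": 2,                    # layout of the json table (1-3)
#       "maps": {"data.json": {"to_table": ["topic"]}},
#       "tables": {
#           "topic": {
#               "cols": [["topic_id", "INTEGER"], ["title", "TEXT"]],
#               "indexes": ["CREATE INDEX topic_id ON topic (topic_id)"],
#               "schema_changed": 1
#           }
#       }
#   }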
class Db(object):

    def __init__(self, schema, db_path, close_idle=False):
        global next_db_id
        self.db_path = db_path
        self.db_dir = os.path.dirname(db_path) + "/"
        self.schema = schema
        self.schema["version"] = self.schema.get("version", 1)
        self.conn = None
        self.cur = None
        self.cursors = weakref.WeakSet()
        self.id = next_db_id
        next_db_id += 1
        self.progress_sleeping = False
        self.commiting = False
        self.log = logging.getLogger("Db#%s:%s" % (self.id, schema["db_name"]))
        self.table_names = None
        self.collect_stats = False
        self.foreign_keys = False
        self.need_commit = False
        self.query_stats = {}
        self.db_keyvalues = {}
        self.delayed_queue = []
        self.delayed_queue_thread = None
        self.close_idle = close_idle
        self.last_query_time = time.time()
        self.last_sleep_time = time.time()
        self.num_execute_since_sleep = 0
        self.lock = ThreadPool.Lock()
        self.connect_lock = ThreadPool.Lock()

    def __repr__(self):
        return "<Db#%s:%s close_idle:%s>" % (id(self), self.db_path, self.close_idle)
    def connect(self):
        self.connect_lock.acquire(True)
        try:
            if self.conn:
                self.log.debug("Already connected, connection ignored")
                return

            if self not in opened_dbs:
                opened_dbs.append(self)
            s = time.time()
            try:  # The directory may not exist yet
                os.makedirs(self.db_dir)
                self.log.debug("Created Db path: %s" % self.db_dir)
            except OSError as err:
                if err.errno != errno.EEXIST:
                    raise err
            if not os.path.isfile(self.db_path):
                self.log.debug("Db file does not exist yet: %s" % self.db_path)
            self.conn = sqlite3.connect(self.db_path, isolation_level="DEFERRED", check_same_thread=False)
            self.conn.row_factory = sqlite3.Row
            self.conn.set_progress_handler(self.progress, 5000000)
            self.conn.execute('PRAGMA journal_mode=WAL')
            if self.foreign_keys:
                self.conn.execute("PRAGMA foreign_keys = ON")
            self.cur = self.getCursor()

            self.log.debug(
                "Connected to %s in %.3fs (opened: %s, sqlite version: %s)..." %
                (self.db_path, time.time() - s, len(opened_dbs), sqlite3.version)
            )
            self.log.debug("Connect by thread: %s" % threading.current_thread().ident)
            self.log.debug("Connect called by %s" % Debug.formatStack())
        finally:
            self.connect_lock.release()
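
    # Return the connection, connecting first if needed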
    def getConn(self):
        if not self.conn:
            self.connect()
        return self.conn
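
    # sqlite progress handler (registered in connect): briefly sleep so other
    # greenlets can run during long queries; commit() skips while this is set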
    def progress(self, *args, **kwargs):
        self.progress_sleeping = True
        time.sleep(0.001)
        self.progress_sleeping = False

    # Execute query using dbcursor
    def execute(self, query, params=None):
        if not self.conn:
            self.connect()
        return self.cur.execute(query, params)
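
    # Commit the current transaction on the db thread pool
    # Return: True on success, False if the commit was skipped or failed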
    @thread_pool_db.wrap
    def commit(self, reason="Unknown"):
        if self.progress_sleeping:
            self.log.debug("Commit ignored: Progress sleeping")
            return False

        if not self.conn:
            self.log.debug("Commit ignored: No connection")
            return False

        if self.commiting:
            self.log.debug("Commit ignored: Already committing")
            return False

        try:
            s = time.time()
            self.commiting = True
            self.conn.commit()
            self.log.debug("Committed in %.3fs (reason: %s)" % (time.time() - s, reason))
            return True
        except Exception as err:
            if "SQL statements in progress" in str(err):
                self.log.warning("Commit delayed: %s (reason: %s)" % (Debug.formatException(err), reason))
            else:
                self.log.error("Commit error: %s (reason: %s)" % (Debug.formatException(err), reason))
            return False
        finally:
            self.commiting = False
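
    # Insert or update a row using the shared cursor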
    def insertOrUpdate(self, *args, **kwargs):
        if not self.conn:
            self.connect()
        return self.cur.insertOrUpdate(*args, **kwargs)
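
    # Queue a query to be executed about a second later, batched with other delayed queries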
    def executeDelayed(self, *args, **kwargs):
        if not self.delayed_queue_thread:
            self.delayed_queue_thread = gevent.spawn_later(1, self.processDelayed)
        self.delayed_queue.append(("execute", (args, kwargs)))

    def insertOrUpdateDelayed(self, *args, **kwargs):
        if not self.delayed_queue:
            gevent.spawn_later(1, self.processDelayed)
        self.delayed_queue.append(("insertOrUpdate", (args, kwargs)))
    def processDelayed(self):
        if not self.delayed_queue:
            self.log.debug("processDelayed aborted")
            return
        if not self.conn:
            self.connect()

        s = time.time()
        cur = self.getCursor()
        for command, params in self.delayed_queue:
            if command == "insertOrUpdate":
                cur.insertOrUpdate(*params[0], **params[1])
            else:
                cur.execute(*params[0], **params[1])

        if len(self.delayed_queue) > 10:
            self.log.debug("Processed %s delayed queue in %.3fs" % (len(self.delayed_queue), time.time() - s))
        self.delayed_queue = []
        self.delayed_queue_thread = None
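
    # Commit pending changes and close the connection, waiting briefly for open cursors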
    def close(self, reason="Unknown"):
        if not self.conn:
            return False
        self.connect_lock.acquire()
        s = time.time()
        if self.delayed_queue:
            self.processDelayed()
        if self in opened_dbs:
            opened_dbs.remove(self)
        self.need_commit = False
        self.commit("Closing: %s" % reason)
        self.log.debug("Close called by %s" % Debug.formatStack())
        for i in range(5):
            if len(self.cursors) == 0:
                break
            self.log.debug("Pending cursors: %s" % len(self.cursors))
            time.sleep(0.1 * i)
        if len(self.cursors):
            self.log.debug("Killing cursors: %s" % len(self.cursors))
            self.conn.interrupt()

        if self.cur:
            self.cur.close()
        if self.conn:
            ThreadPool.main_loop.call(self.conn.close)
        self.conn = None
        self.cur = None
        self.log.debug("%s closed (reason: %s) in %.3fs, opened: %s" % (self.db_path, reason, time.time() - s, len(opened_dbs)))
        self.connect_lock.release()
        return True

    # Gets a cursor object to database
    # Return: Cursor class
    def getCursor(self):
        if not self.conn:
            self.connect()

        cur = DbCursor(self)
        return cur
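
    # Return the cursor shared by the whole Db object (created at connect time)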
    def getSharedCursor(self):
        if not self.conn:
            self.connect()
        return self.cur

    # Get the table version
    # Return: Table version, 0 if not set, or False on query error
    def getTableVersion(self, table_name):
        if not self.db_keyvalues:  # Get db keyvalues
            try:
                res = self.execute("SELECT * FROM keyvalue WHERE json_id=0")  # json_id = 0 is internal keyvalues
            except sqlite3.OperationalError as err:  # Table does not exist yet
                self.log.debug("Query table version error: %s" % err)
                return False

            for row in res:
                self.db_keyvalues[row["key"]] = row["value"]

        return self.db_keyvalues.get("table.%s.version" % table_name, 0)

    # Check Db tables
    # Return: <list> Changed table names
    def checkTables(self):
        s = time.time()
        changed_tables = []

        cur = self.getSharedCursor()

        # Check internal tables
        # Check keyvalue table
        changed = cur.needTable("keyvalue", [
            ["keyvalue_id", "INTEGER PRIMARY KEY AUTOINCREMENT"],
            ["key", "TEXT"],
            ["value", "INTEGER"],
            ["json_id", "INTEGER"],
        ], [
            "CREATE UNIQUE INDEX key_id ON keyvalue(json_id, key)"
        ], version=self.schema["version"])
        if changed:
            changed_tables.append("keyvalue")

        # Create json table if no custom one defined
        if "json" not in self.schema.get("tables", {}):
            if self.schema["version"] == 1:
                changed = cur.needTable("json", [
                    ["json_id", "INTEGER PRIMARY KEY AUTOINCREMENT"],
                    ["path", "VARCHAR(255)"]
                ], [
                    "CREATE UNIQUE INDEX path ON json(path)"
                ], version=self.schema["version"])
            elif self.schema["version"] == 2:
                changed = cur.needTable("json", [
                    ["json_id", "INTEGER PRIMARY KEY AUTOINCREMENT"],
                    ["directory", "VARCHAR(255)"],
                    ["file_name", "VARCHAR(255)"]
                ], [
                    "CREATE UNIQUE INDEX path ON json(directory, file_name)"
                ], version=self.schema["version"])
            elif self.schema["version"] == 3:
                changed = cur.needTable("json", [
                    ["json_id", "INTEGER PRIMARY KEY AUTOINCREMENT"],
                    ["site", "VARCHAR(255)"],
                    ["directory", "VARCHAR(255)"],
                    ["file_name", "VARCHAR(255)"]
                ], [
                    "CREATE UNIQUE INDEX path ON json(directory, site, file_name)"
                ], version=self.schema["version"])
            if changed:
                changed_tables.append("json")

        # Check schema tables
        for table_name, table_settings in self.schema.get("tables", {}).items():
            try:
                indexes = table_settings.get("indexes", [])
                version = table_settings.get("schema_changed", 0)
                changed = cur.needTable(
                    table_name, table_settings["cols"],
                    indexes, version=version
                )
                if changed:
                    changed_tables.append(table_name)
            except Exception as err:
                self.log.error("Error creating table %s: %s" % (table_name, Debug.formatException(err)))
                raise DbTableError(err, table_name)

        self.log.debug("Db check done in %.3fs, changed tables: %s" % (time.time() - s, changed_tables))
        if changed_tables:
            self.db_keyvalues = {}  # Refresh table version cache

        return changed_tables

    # Update json file to db
    # Return: True if matched
    def updateJson(self, file_path, file=None, cur=None):
        if not file_path.startswith(self.db_dir):
            return False  # Not from the db dir: skipping

        relative_path = file_path[len(self.db_dir):]  # File path relative to the db file

        # Check if the file name matches any of the maps in the schema
        matched_maps = []
        for match, map_settings in self.schema["maps"].items():
            try:
                if SafeRe.match(match, relative_path):
                    matched_maps.append(map_settings)
            except SafeRe.UnsafePatternError as err:
                self.log.error(err)

        # No match found for the file
        if not matched_maps:
            return False

        # Load the json file
        try:
            if file is None:  # Open the file if no file object was passed
                file = open(file_path, "rb")

            if file is False:  # File deleted
                data = {}
            else:
                if file_path.endswith("json.gz"):
                    file = helper.limitedGzipFile(fileobj=file)

                if sys.version_info.major == 3 and sys.version_info.minor < 6:
                    data = json.loads(file.read().decode("utf8"))
                else:
                    data = json.load(file)
        except Exception as err:
            self.log.debug("Json file %s load error: %s" % (file_path, err))
            data = {}

        # No cursor specified
        if not cur:
            cur = self.getSharedCursor()
            cur.logging = False

        # Row for the current json file, if required
        if not data or [dbmap for dbmap in matched_maps if "to_keyvalue" in dbmap or "to_table" in dbmap]:
            json_row = cur.getJsonRow(relative_path)

        # Check matched mappings in the schema
        for dbmap in matched_maps:
            # Insert non-relational key values
            if dbmap.get("to_keyvalue"):
                # Get current values
                res = cur.execute("SELECT * FROM keyvalue WHERE json_id = ?", (json_row["json_id"],))
                current_keyvalue = {}
                current_keyvalue_id = {}
                for row in res:
                    current_keyvalue[row["key"]] = row["value"]
                    current_keyvalue_id[row["key"]] = row["keyvalue_id"]

                for key in dbmap["to_keyvalue"]:
                    if key not in current_keyvalue:  # Keyvalue does not exist yet in the db
                        cur.execute(
                            "INSERT INTO keyvalue ?",
                            {"key": key, "value": data.get(key), "json_id": json_row["json_id"]}
                        )
                    elif data.get(key) != current_keyvalue[key]:  # Keyvalue has a different value
                        cur.execute(
                            "UPDATE keyvalue SET value = ? WHERE keyvalue_id = ?",
                            (data.get(key), current_keyvalue_id[key])
                        )

            # Insert data to json table for easier joins
            if dbmap.get("to_json_table"):
                directory, file_name = re.match("^(.*?)/*([^/]*)$", relative_path).groups()
                data_json_row = dict(cur.getJsonRow(directory + "/" + dbmap.get("file_name", file_name)))
                changed = False
                for key in dbmap["to_json_table"]:
                    if data.get(key) != data_json_row.get(key):
                        changed = True
                if changed:
                    # Add the custom col values
                    data_json_row.update({key: val for key, val in data.items() if key in dbmap["to_json_table"]})
                    cur.execute("INSERT OR REPLACE INTO json ?", data_json_row)

            # Insert data to tables
            for table_settings in dbmap.get("to_table", []):
                if isinstance(table_settings, dict):  # Custom settings
                    table_name = table_settings["table"]  # Table name to insert the data into
                    node = table_settings.get("node", table_name)  # Node key name in the data json file
                    key_col = table_settings.get("key_col")  # Map dict keys to this col
                    val_col = table_settings.get("val_col")  # Map dict values to this col
                    import_cols = table_settings.get("import_cols")
                    replaces = table_settings.get("replaces")
                else:  # Simple settings
                    table_name = table_settings
                    node = table_settings
                    key_col = None
                    val_col = None
                    import_cols = None
                    replaces = None

                # Fill import cols from table cols
                if not import_cols:
                    import_cols = set([item[0] for item in self.schema["tables"][table_name]["cols"]])

                cur.execute("DELETE FROM %s WHERE json_id = ?" % table_name, (json_row["json_id"],))

                if node not in data:
                    continue

                if key_col:  # Map as dict
                    for key, val in data[node].items():
                        if val_col:  # Single value
                            cur.execute(
                                "INSERT OR REPLACE INTO %s ?" % table_name,
                                {key_col: key, val_col: val, "json_id": json_row["json_id"]}
                            )
                        else:  # Multi value
                            if type(val) is dict:  # Single row
                                row = val
                                if import_cols:
                                    row = {key: row[key] for key in row if key in import_cols}  # Filter row by import_cols
                                row[key_col] = key

                                # Replace in value if necessary
                                if replaces:
                                    for replace_key, replace in replaces.items():
                                        if replace_key in row:
                                            for replace_from, replace_to in replace.items():
                                                row[replace_key] = row[replace_key].replace(replace_from, replace_to)

                                row["json_id"] = json_row["json_id"]
                                cur.execute("INSERT OR REPLACE INTO %s ?" % table_name, row)
                            elif type(val) is list:  # Multi row
                                for row in val:
                                    row[key_col] = key
                                    row["json_id"] = json_row["json_id"]
                                    cur.execute("INSERT OR REPLACE INTO %s ?" % table_name, row)
                else:  # Map as list
                    for row in data[node]:
                        row["json_id"] = json_row["json_id"]
                        if import_cols:
                            row = {key: row[key] for key in row if key in import_cols}  # Filter row by import_cols
                        cur.execute("INSERT OR REPLACE INTO %s ?" % table_name, row)

        # Cleanup json row
        if not data:
            self.log.debug("Cleanup json row for %s" % file_path)
            cur.execute("DELETE FROM json WHERE json_id = %s" % json_row["json_id"])

        return True
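

# Manual smoke test: rebuilds the example zerotalk db from local json files
# (the "zerotalk.schema.json" and data/users fixtures are assumed to exist)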
if __name__ == "__main__":
    s = time.time()
    console_log = logging.StreamHandler()
    logging.getLogger('').setLevel(logging.DEBUG)
    logging.getLogger('').addHandler(console_log)
    console_log.setLevel(logging.DEBUG)
    dbjson = Db(json.load(open("zerotalk.schema.json")), "data/users/zerotalk.db")
    dbjson.collect_stats = True
    dbjson.checkTables()
    cur = dbjson.getCursor()
    cur.logging = False
    dbjson.updateJson("data/users/content.json", cur=cur)
    for user_dir in os.listdir("data/users"):
        if os.path.isdir("data/users/%s" % user_dir):
            dbjson.updateJson("data/users/%s/data.json" % user_dir, cur=cur)
            # print ".",
    cur.logging = True
    print("Done in %.3fs" % (time.time() - s))
    for query, stats in sorted(dbjson.query_stats.items()):
        print("-", query, stats)