# ZeroNet/src/Site/SiteStorage.py

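"""Storage layer for a ZeroNet site: resolves inner paths to the site's data
directory, provides safe file read/write helpers, verifies files against
content.json, and maintains the optional SQLite cache described by
dbschema.json."""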

import os
import re
import shutil
import json
import time
import sys
from collections import defaultdict
import sqlite3
import gevent.event

import util
from util import SafeRe
from Db import Db
from Debug import Debug
from Config import config
from util import helper

from Plugin import PluginManager
from Translate import translate as _


@PluginManager.acceptPlugins
class SiteStorage(object):
    def __init__(self, site, allow_create=True):
        self.site = site
        self.directory = u"%s/%s" % (config.data_dir, self.site.address)  # Site data directory
        self.allowed_dir = os.path.abspath(self.directory)  # Only serve files within this dir
        self.log = site.log
        self.db = None  # Db class
        self.db_checked = False  # Checked db tables since startup
        self.event_db_busy = None  # Gevent AsyncResult if db is working on rebuild
        self.has_db = self.isFile("dbschema.json")  # The site has a schema
        if not os.path.isdir(self.directory):
            if allow_create:
                os.mkdir(self.directory)  # Create directory if not found
            else:
                raise Exception("Directory does not exist: %s" % self.directory)

    def getDbFile(self):
        if self.isFile("dbschema.json"):
            schema = self.loadJson("dbschema.json")
            return schema["db_file"]
        else:
            return False
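
    # The SQLite cache is described by a dbschema.json in the site root. Only
    # "db_file" is read directly in this class; the rest is interpreted by the
    # Db class. An illustrative sketch (field layout is an assumption here,
    # not authoritative):
    #
    #   {
    #     "db_name": "Example",
    #     "db_file": "data/example.db",
    #     "version": 2,
    #     "maps": {...},     # which json files feed which tables
    #     "tables": {...}    # column and index definitions
    #   }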

    # Create new database object with the site's schema
    def openDb(self, close_idle=False):
        schema = self.getDbSchema()
        db_path = self.getPath(schema["db_file"])
        return Db(schema, db_path, close_idle=close_idle)

    def closeDb(self):
        if self.db:
            self.db.close()
        self.event_db_busy = None
        self.db = None

    def getDbSchema(self):
        try:
            schema = self.loadJson("dbschema.json")
        except Exception, err:
            raise Exception("dbschema.json is not a valid JSON: %s" % err)
        return schema

    # Return db class
    def getDb(self):
        if not self.db:
            self.log.debug("No database, waiting for dbschema.json...")
            self.site.needFile("dbschema.json", priority=3)
            self.has_db = self.isFile("dbschema.json")  # Recheck if dbschema exists
            if self.has_db:
                schema = self.getDbSchema()
                db_path = self.getPath(schema["db_file"])
                if not os.path.isfile(db_path) or os.path.getsize(db_path) == 0:
                    self.rebuildDb()
                if self.db:
                    self.db.close()
                self.db = self.openDb(close_idle=True)
                changed_tables = self.db.checkTables()
                if changed_tables:
                    self.rebuildDb(delete_db=False)  # TODO: only update the changed tables' data
        return self.db

    def updateDbFile(self, inner_path, file=None, cur=None):
        path = self.getPath(inner_path)
        return self.getDb().updateJson(path, file, cur)
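
    # Note: getDbFiles() below is a generator yielding (inner_path, file_path)
    # pairs for each content.json and every file it lists whose name ends with
    # .json or json.gz; only these files feed the SQLite cache.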

    # Return possible db files for the site
    def getDbFiles(self):
        found = 0
        for content_inner_path, content in self.site.content_manager.contents.iteritems():
            # content.json file itself
            if self.isFile(content_inner_path):
                yield content_inner_path, self.getPath(content_inner_path)
            else:
                self.log.error("[MISSING] %s" % content_inner_path)
            # Data files in content.json
            content_inner_path_dir = helper.getDirname(content_inner_path)  # Content.json dir relative to site
            for file_relative_path in content.get("files", {}).keys() + content.get("files_optional", {}).keys():
                if not file_relative_path.endswith(".json") and not file_relative_path.endswith("json.gz"):
                    continue  # We are only interested in json files
                file_inner_path = content_inner_path_dir + file_relative_path  # File relative to site dir
                file_inner_path = file_inner_path.strip("/")  # Strip leading /
                if self.isFile(file_inner_path):
                    yield file_inner_path, self.getPath(file_inner_path)
                else:
                    self.log.error("[MISSING] %s" % file_inner_path)
                found += 1
                if found % 100 == 0:
                    time.sleep(0.000001)  # Context switch to avoid UI block
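
    # Note: rebuildDb() is decorated with @util.Noparallel(), so overlapping
    # calls share a single running rebuild rather than starting another one.
    # While it runs, event_db_busy makes query() block until the rebuild ends.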

    # Rebuild sql cache
    @util.Noparallel()
    def rebuildDb(self, delete_db=True):
        self.has_db = self.isFile("dbschema.json")
        if not self.has_db:
            return False
        self.event_db_busy = gevent.event.AsyncResult()
        schema = self.loadJson("dbschema.json")
        db_path = self.getPath(schema["db_file"])
        if os.path.isfile(db_path) and delete_db:
            if self.db:
                self.db.close()  # Close db if open
                time.sleep(0.5)
            self.log.info("Deleting %s" % db_path)
            try:
                os.unlink(db_path)
            except Exception, err:
                self.log.error("Delete error: %s" % err)
        db = self.openDb()
        self.log.info("Creating tables...")
        db.checkTables()
        cur = db.getCursor()
        cur.execute("BEGIN")
        cur.logging = False
        found = 0
        s = time.time()
        self.log.info("Getting db files...")
        db_files = list(self.getDbFiles())
        self.log.info("Importing data...")
        try:
            if len(db_files) > 100:
                self.site.messageWebsocket(_["Database rebuilding...<br>Imported {0} of {1} files..."].format("0000", len(db_files)), "rebuild", 0)
            for file_inner_path, file_path in db_files:
                try:
                    if self.updateDbFile(file_inner_path, file=open(file_path, "rb"), cur=cur):
                        found += 1
                except Exception, err:
                    self.log.error("Error importing %s: %s" % (file_inner_path, Debug.formatException(err)))
                if found and found % 100 == 0:
                    self.site.messageWebsocket(
                        _["Database rebuilding...<br>Imported {0} of {1} files..."].format(found, len(db_files)),
                        "rebuild",
                        int(float(found) / len(db_files) * 100)
                    )
                    time.sleep(0.000001)  # Context switch to avoid UI block
        finally:
            cur.execute("END")
            cur.close()
            db.close()
            self.log.info("Closing Db: %s" % db)
        if len(db_files) > 100:
            self.site.messageWebsocket(_["Database rebuilding...<br>Imported {0} of {1} files..."].format(found, len(db_files)), "rebuild", 100)
        self.log.info("Imported %s data files in %ss" % (found, time.time() - s))
        self.event_db_busy.set(True)  # Event done, notify waiters
        self.event_db_busy = None  # Clear event
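
    # Note: query() transparently rebuilds the whole SQLite cache if the
    # database file turns out to be corrupted, then retries the query once.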

    # Execute sql query or rebuild on dberror
    def query(self, query, params=None):
        if self.event_db_busy:  # Db not ready for queries
            self.log.debug("Waiting for db...")
            self.event_db_busy.get()  # Wait for event
        try:
            res = self.getDb().execute(query, params)
        except sqlite3.DatabaseError, err:
            if err.__class__.__name__ == "DatabaseError":
                self.log.error("Database error: %s, query: %s, trying to rebuild it..." % (err, query))
                self.rebuildDb()
                res = self.db.cur.execute(query, params)
            else:
                raise err
        return res

    # Open file object
    def open(self, inner_path, mode="rb", create_dirs=False):
        file_path = self.getPath(inner_path)
        if create_dirs:
            file_dir = os.path.dirname(file_path)
            if not os.path.isdir(file_dir):
                os.makedirs(file_dir)
        return open(file_path, mode)

    # Read file content
    def read(self, inner_path, mode="r"):
        return open(self.getPath(inner_path), mode).read()
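
    # Note: write() uses helper.atomicWrite for an existing content.json
    # (write to a temporary file, then rename into place), so a crash
    # mid-write cannot leave the site's manifest half-written.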

    # Write content to file
    def write(self, inner_path, content):
        file_path = self.getPath(inner_path)
        # Create dir if not exist
        file_dir = os.path.dirname(file_path)
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        # Write file
        if hasattr(content, 'read'):  # File-like object
            with open(file_path, "wb") as file:
                shutil.copyfileobj(content, file)  # Write buff to disk
        else:  # Simple string
            if inner_path == "content.json" and os.path.isfile(file_path):
                helper.atomicWrite(file_path, content)
            else:
                with open(file_path, "wb") as file:
                    file.write(content)
        del content
        self.onUpdated(inner_path)

    # Remove file from filesystem
    def delete(self, inner_path):
        file_path = self.getPath(inner_path)
        os.unlink(file_path)
        self.onUpdated(inner_path, file=False)

    def deleteDir(self, inner_path):
        dir_path = self.getPath(inner_path)
        os.rmdir(dir_path)

    def rename(self, inner_path_before, inner_path_after):
        for retry in range(3):
            # To work around the "The process cannot access the file because it is being used by another process." error
            try:
                os.rename(self.getPath(inner_path_before), self.getPath(inner_path_after))
                err = None
                break
            except Exception, err:
                self.log.error("%s rename error: %s (retry #%s)" % (inner_path_before, err, retry))
                time.sleep(0.1 + retry)
        if err:
            raise err
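
    # Note: walk() prunes ignored directories in place (dirs[:] = ...) so
    # os.walk never descends into them; the ignore argument is a regular
    # expression matched against site-relative paths via SafeRe.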

    # List files from a directory
    def walk(self, dir_inner_path, ignore=None):
        directory = self.getPath(dir_inner_path)
        for root, dirs, files in os.walk(directory):
            root = root.replace("\\", "/")
            root_relative_path = re.sub("^%s" % re.escape(directory), "", root).lstrip("/")
            for file_name in files:
                if root_relative_path:  # Not root dir
                    file_relative_path = root_relative_path + "/" + file_name
                else:
                    file_relative_path = file_name
                if ignore and SafeRe.match(ignore, file_relative_path):
                    continue
                yield file_relative_path
            # Don't scan directories that match the ignore pattern
            if ignore:
                dirs_filtered = []
                for dir_name in dirs:
                    if root_relative_path:
                        dir_relative_path = root_relative_path + "/" + dir_name
                    else:
                        dir_relative_path = dir_name
                    if ignore == ".*" or re.match(".*([|(]|^)%s([|)]|$)" % re.escape(dir_relative_path + "/.*"), ignore):
                        continue
                    dirs_filtered.append(dir_name)
                dirs[:] = dirs_filtered

    # List files and directories in a directory
    def list(self, dir_inner_path):
        directory = self.getPath(dir_inner_path)
        return os.listdir(directory)

    # Site content updated
    def onUpdated(self, inner_path, file=None):
        # Update Sql cache
        if inner_path == "dbschema.json":
            self.has_db = self.isFile("dbschema.json")
            # Reopen DB to check changes
            if self.has_db:
                self.closeDb()
                self.getDb()
        elif not config.disable_db and (inner_path.endswith(".json") or inner_path.endswith(".json.gz")) and self.has_db:  # Load json file to db
            if config.verbose:
                self.log.debug("Loading json file to db: %s (file: %s)" % (inner_path, file))
            try:
                self.updateDbFile(inner_path, file)
            except Exception, err:
                self.log.error("Json %s load error: %s" % (inner_path, Debug.formatException(err)))
                self.closeDb()

    # Load and parse json file
    def loadJson(self, inner_path):
        with self.open(inner_path) as file:
            return json.load(file)
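
    # Note: formatJson() pretty-prints with indent=1 and sorted keys, then the
    # two re.sub passes below collapse short, simple dicts and lists back onto
    # a single line, keeping signed json files compact but still diffable.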

    def formatJson(self, data):
        content = json.dumps(data, indent=1, sort_keys=True)

        # Make it a little more compact by removing unnecessary white space
        def compact_dict(match):
            if "\n" in match.group(0):
                return match.group(0).replace(match.group(1), match.group(1).strip())
            else:
                return match.group(0)

        content = re.sub("\{(\n[^,\[\{]{10,100}?)\}[, ]{0,2}\n", compact_dict, content, flags=re.DOTALL)

        def compact_list(match):
            if "\n" in match.group(0):
                stripped_lines = re.sub("\n[ ]*", "", match.group(1))
                return match.group(0).replace(match.group(1), stripped_lines)
            else:
                return match.group(0)

        content = re.sub("\[([^\[\{]{2,300}?)\][, ]{0,2}\n", compact_list, content, flags=re.DOTALL)

        # Remove end of line whitespace
        content = re.sub("(?m)[ ]+$", "", content)
        return content

    # Write formatted json file
    def writeJson(self, inner_path, data):
        # Write to disk
        self.write(inner_path, self.formatJson(data))

    # Get file size
    def getSize(self, inner_path):
        path = self.getPath(inner_path)
        try:
            return os.path.getsize(path)
        except Exception:
            return 0

    # File exists
    def isFile(self, inner_path):
        return os.path.isfile(self.getPath(inner_path))

    # File or directory exists
    def isExists(self, inner_path):
        return os.path.exists(self.getPath(inner_path))

    # Dir exists
    def isDir(self, inner_path):
        return os.path.isdir(self.getPath(inner_path))
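
    # Note: getPath() is the single choke point for mapping an inner path to a
    # real filesystem path; rejecting anything containing ".." is what blocks
    # directory-traversal attempts such as "../../users.json".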

    # Security check and return path of site's file
    def getPath(self, inner_path):
        inner_path = inner_path.replace("\\", "/")  # Windows separator fix
        if not inner_path:
            return self.directory

        if ".." in inner_path:
            raise Exception(u"File not allowed: %s" % inner_path)

        return u"%s/%s" % (self.directory, inner_path)

    # Get site dir relative path
    def getInnerPath(self, path):
        if path == self.directory:
            inner_path = ""
        else:
            if path.startswith(self.directory):
                inner_path = path[len(self.directory) + 1:]
            else:
                raise Exception(u"File not allowed: %s" % path)
        return inner_path
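
    # Note: with quick_check=True, verifyFiles() only compares file sizes
    # against content.json, which is fast but cannot detect same-size
    # corruption; a full check re-hashes every file through verifyFile().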

    # Verify all files sha512sum using content.json
    def verifyFiles(self, quick_check=False, add_optional=False, add_changed=True):
        bad_files = []
        back = defaultdict(int)
        back["bad_files"] = bad_files
        i = 0
        self.log.debug("Verifying files...")

        if not self.site.content_manager.contents.get("content.json"):  # No content.json, download it first
            self.log.debug("VerifyFiles: content.json does not exist")
            self.site.needFile("content.json", update=True)  # Force update to fix corrupt file
            self.site.content_manager.loadContent()  # Reload content.json

        for content_inner_path, content in self.site.content_manager.contents.items():
            back["num_content"] += 1
            i += 1
            if i % 50 == 0:
                time.sleep(0.0001)  # Context switch to avoid gevent hangs
            if not os.path.isfile(self.getPath(content_inner_path)):  # Missing content.json file
                back["num_content_missing"] += 1
                self.log.debug("[MISSING] %s" % content_inner_path)
                bad_files.append(content_inner_path)

            for file_relative_path in content.get("files", {}).keys():
                back["num_file"] += 1
                file_inner_path = helper.getDirname(content_inner_path) + file_relative_path  # Relative to site dir
                file_inner_path = file_inner_path.strip("/")  # Strip leading /
                file_path = self.getPath(file_inner_path)
                if not os.path.isfile(file_path):
                    back["num_file_missing"] += 1
                    self.log.debug("[MISSING] %s" % file_inner_path)
                    bad_files.append(file_inner_path)
                    continue

                if quick_check:
                    ok = os.path.getsize(file_path) == content["files"][file_relative_path]["size"]
                    if not ok:
                        err = "Invalid size"
                else:
                    try:
                        ok = self.site.content_manager.verifyFile(file_inner_path, open(file_path, "rb"))
                    except Exception, err:
                        ok = False

                if not ok:
                    back["num_file_invalid"] += 1
                    self.log.debug("[INVALID] %s: %s" % (file_inner_path, err))
                    if add_changed or content.get("cert_user_id"):  # If updating own site only add changed user files
                        bad_files.append(file_inner_path)

            # Optional files
            optional_added = 0
            optional_removed = 0
            for file_relative_path in content.get("files_optional", {}).keys():
                back["num_optional"] += 1
                file_node = content["files_optional"][file_relative_path]
                file_inner_path = helper.getDirname(content_inner_path) + file_relative_path  # Relative to site dir
                file_inner_path = file_inner_path.strip("/")  # Strip leading /
                file_path = self.getPath(file_inner_path)
                hash_id = self.site.content_manager.hashfield.getHashId(file_node["sha512"])
                if not os.path.isfile(file_path):
                    if self.site.content_manager.isDownloaded(file_inner_path, hash_id):
                        back["num_optional_removed"] += 1
                        self.log.debug("[OPTIONAL REMOVED] %s" % file_inner_path)
                        self.site.content_manager.optionalRemoved(file_inner_path, hash_id, file_node["size"])
                    if add_optional:
                        bad_files.append(file_inner_path)
                    continue

                if quick_check:
                    ok = os.path.getsize(file_path) == content["files_optional"][file_relative_path]["size"]
                else:
                    try:
                        ok = self.site.content_manager.verifyFile(file_inner_path, open(file_path, "rb"))
                    except Exception, err:
                        ok = False

                if ok:
                    if not self.site.content_manager.isDownloaded(file_inner_path, hash_id):
                        back["num_optional_added"] += 1
                        self.site.content_manager.optionalDownloaded(file_inner_path, hash_id, file_node["size"])
                        optional_added += 1
                        self.log.debug("[OPTIONAL FOUND] %s" % file_inner_path)
                else:
                    if self.site.content_manager.isDownloaded(file_inner_path, hash_id):
                        back["num_optional_removed"] += 1
                        self.site.content_manager.optionalRemoved(file_inner_path, hash_id, file_node["size"])
                        optional_removed += 1
                    bad_files.append(file_inner_path)
                    self.log.debug("[OPTIONAL CHANGED] %s" % file_inner_path)

            if config.verbose:
                self.log.debug(
                    "%s verified: %s, quick: %s, optionals: +%s -%s" %
                    (content_inner_path, len(content["files"]), quick_check, optional_added, optional_removed)
                )

        self.site.content_manager.contents.db.processDelayed()
        time.sleep(0.0001)  # Context switch to avoid gevent hangs
        return back
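
    # Note: bad_files entries are seeded with 1 below; the value is treated as
    # a retry counter by the site's re-download logic (an assumption about the
    # surrounding Site class, not visible in this file).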

    # Check and try to fix site files integrity
    def updateBadFiles(self, quick_check=True):
        s = time.time()
        res = self.verifyFiles(
            quick_check,
            add_optional=self.site.isDownloadable(""),
            add_changed=not self.site.settings.get("own")  # Don't overwrite changed files if site owned
        )
        bad_files = res["bad_files"]
        self.site.bad_files = {}
        if bad_files:
            for bad_file in bad_files:
                self.site.bad_files[bad_file] = 1
        self.log.debug("Checked files in %.2fs... Found bad files: %s, Quick: %s" % (time.time() - s, len(bad_files), quick_check))

    # Delete all the site's files
    def deleteFiles(self):
        self.log.debug("Deleting files from content.json...")
        files = []  # Get filenames
        for content_inner_path in self.site.content_manager.contents.keys():
            content = self.site.content_manager.contents.get(content_inner_path, {})
            files.append(content_inner_path)
            # Add normal files
            for file_relative_path in content.get("files", {}).keys():
                file_inner_path = helper.getDirname(content_inner_path) + file_relative_path  # Relative to site dir
                files.append(file_inner_path)
            # Add optional files
            for file_relative_path in content.get("files_optional", {}).keys():
                file_inner_path = helper.getDirname(content_inner_path) + file_relative_path  # Relative to site dir
                files.append(file_inner_path)

        if self.isFile("dbschema.json"):
            self.log.debug("Deleting db file...")
            self.closeDb()
            self.has_db = False
            try:
                schema = self.loadJson("dbschema.json")
                db_path = self.getPath(schema["db_file"])
                if os.path.isfile(db_path):
                    os.unlink(db_path)
            except Exception, err:
                self.log.error("Db file delete error: %s" % err)

        for inner_path in files:
            path = self.getPath(inner_path)
            if os.path.isfile(path):
                for retry in range(5):
                    try:
                        os.unlink(path)
                        break
                    except Exception, err:
                        self.log.error(u"Error removing %s: %s, try #%s" % (inner_path, err, retry))
                        time.sleep(float(retry) / 10)
            self.onUpdated(inner_path, False)

        self.log.debug("Deleting empty dirs...")
        for root, dirs, files in os.walk(self.directory, topdown=False):
            for dir in dirs:
                path = os.path.join(root, dir)
                if os.path.isdir(path) and os.listdir(path) == []:
                    os.rmdir(path)
                    self.log.debug("Removing %s" % path)
        if os.path.isdir(self.directory) and os.listdir(self.directory) == []:
            os.rmdir(self.directory)  # Remove sites directory if empty

        if os.path.isdir(self.directory):
            self.log.debug("Some unknown file remained in site data dir: %s..." % self.directory)
            return False  # Some files not deleted
        else:
            self.log.debug("Site data directory deleted: %s..." % self.directory)
            return True  # All clean