From 361f015bb74f24a62d188ce99f0405f4b773feaf Mon Sep 17 00:00:00 2001 From: Albert Cervera i Areny Date: Mon, 2 Feb 2015 01:54:02 +0100 Subject: [PATCH] Add the following checks: - process count - uptime - process open ports - load - raid - ntp status - apt - disk writable --- monitoring.py | 242 +++++++++++++++++++++++++++++++++++++++++++++++-- monitoring.xml | 119 ++++++++++++++++++++++++ 2 files changed, 352 insertions(+), 9 deletions(-) diff --git a/monitoring.py b/monitoring.py index d755d9c..dabf245 100644 --- a/monitoring.py +++ b/monitoring.py @@ -1,12 +1,39 @@ # The COPYRIGHT file at the top level of this repository contains the full # copyright notices and license terms. -from trytond.pool import PoolMeta +import os import psutil +import json +import tempfile +import socket +import subprocess +from datetime import datetime +from trytond.pool import PoolMeta __all__ = ['CheckPlan'] __metaclass__ = PoolMeta +def check_output(*args): + process = subprocess.Popen(args, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + process.wait() + data = process.stdout.read() + return data + + +def to_float(text): + try: + return float(text) + except ValueError: + return None + + +PROTOCOLS = { + socket.SOCK_STREAM: 'TCP', + socket.SOCK_DGRAM: 'UDP', + } + + class CheckPlan: __name__ = 'monitoring.check.plan' @@ -86,10 +113,24 @@ class CheckPlan: }) return res + def check_process_count(self): + return [{ + 'result': 'process_count', + 'float_value': len(psutil.pids()), + }] + + def check_uptime(self): + boot_time = datetime.fromtimestamp(psutil.boot_time()) + uptime = (datetime.now() - boot_time).total_seconds() + return [{ + 'result': 'uptime', + 'float_value': uptime, + }] + def check_process_cpu_percent(self): processes = self.get_attribute('processes') if processes: - processes = processes.split(',') + processes = [x.strip() for x in processes.split(',')] res = [] for process in psutil.process_iter(): try: @@ -97,7 +138,7 @@ class CheckPlan: if processes and name not in processes: continue cpu = process.cpu_percent() - except psutil.NoSuchProcess: + except (psutil.NoSuchProcess, psutil.AccessDenied): continue res.append({ 'result': 'process_cpu_percent', @@ -109,7 +150,7 @@ class CheckPlan: def check_process_open_files_count(self): processes = self.get_attribute('processes') if processes: - processes = processes.split(',') + processes = [x.strip() for x in processes.split(',')] res = [] for process in psutil.process_iter(): try: @@ -117,7 +158,7 @@ class CheckPlan: if processes and name not in processes: continue files = process.num_fds() - except psutil.NoSuchProcess: + except (psutil.NoSuchProcess, psutil.AccessDenied): continue res.append({ 'result': 'process_open_files_count', @@ -129,7 +170,7 @@ class CheckPlan: def check_process_memory_percent(self): processes = self.get_attribute('processes') if processes: - processes = processes.split(',') + processes = [x.strip() for x in processes.split(',')] res = [] for process in psutil.process_iter(): try: @@ -137,7 +178,7 @@ class CheckPlan: if processes and name not in processes: continue memory = process.memory_percent() - except psutil.NoSuchProcess: + except (psutil.NoSuchProcess, psutil.AccessDenied): continue res.append({ 'result': 'process_memory_percent', @@ -149,7 +190,7 @@ class CheckPlan: def check_process_io_counters(self): processes = self.get_attribute('processes') if processes: - processes = processes.split(',') + processes = [x.strip() for x in processes.split(',')] res = [] for process in psutil.process_iter(): try: @@ -157,7 +198,7 @@ class CheckPlan: if processes and name not in processes: continue counters = process.io_counters() - except psutil.NoSuchProcess: + except (psutil.NoSuchProcess, psutil.AccessDenied): continue for name in ('read_count', 'write_count', 'read_bytes', 'write_bytes'): @@ -167,3 +208,186 @@ class CheckPlan: 'float_value': getattr(counters, name), }) return res + + def check_process_open_ports(self): + ''' + Expected structure in ports attribute: + + protocol:ip:port + + Example: + + TCP:*:22 + TCP:*:8000 + ''' + valid_entries = set() + entries = [x.strip() for x in + self.get_attribute('process_open_ports').split()] + for entry in entries: + if len(entry.split(':')) != 3: + continue + protocol, ip, port = entry.split(':') + if '*' in entry: + valid_entries.add((protocol, entry.replace('*', '0.0.0.0'), + port)) + valid_entries.add((protocol, entry.replace('*', '::'), port)) + else: + valid_entries.add((protocol, ip, port)) + invalids = [] + value = 'OK' + for process in psutil.process_iter(): + try: + connections = process.get_connections() + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + for connection in connections: + if connection.status != 'LISTEN': + continue + if connection.type not in PROTOCOLS: + continue + protocol = PROTOCOLS[connection.type] + ip = connection.laddr[0] + port = connection.laddr[1] + entry = (protocol, ip, port) + if entry not in valid_entries: + invalids.append(entry) + value = 'Error' + continue + return [{ + 'result': 'process_open_ports_status', + 'char_value': value, + 'payload': json.dumps({ + 'invalid_ports': invalids, + }), + }] + + def check_load(self): + one, five, fifteen = os.getloadavg() + res = [] + res.append({ + 'result': 'load_1', + 'float_value': one, + }) + res.append({ + 'result': 'load_5', + 'float_value': five, + }) + res.append({ + 'result': 'load_15', + 'float_value': fifteen, + }) + return res + + def check_raid(self): + """ + Expected values in raid_devices: + md0, md1 + """ + devices = self.get_attribute('raid_devices') + if devices: + devices = [x.strip() for x in devices.split(',')] + lines = open('/proc/mdstat', 'r').readlines() + current_device = None + current_payload = '' + res = [] + for line in lines: + if line.startswith('md'): + current_device = line.split()[0] + current_payload += line + continue + if current_device: + if devices and current_device not in devices: + current_device = None + current_payload = '' + continue + current_payload += line + if '[UU]' in line: + state = 'OK' + else: + state = 'Error' + res.append({ + 'result': 'raid_status', + 'label': current_device, + 'char_value': state, + 'payload': json.dumps({ + 'output': current_payload, + }), + }) + current_device = None + current_payload = '' + return res + + def check_ntp_status(self): + output = check_output('/usr/sbin/ntpdate', '-q', 'pool.ntp.org') + line = output.splitlines()[-1] + sec = line.split()[-1] + text = line.split()[-2] + offset = line.split()[-3] + res = [] + value = 999999 + if sec == 'sec' and offset == 'offset': + try: + value = float(text) + except ValueError: + pass + res.append({ + 'result': 'ntp_offset', + 'float_value': value, + }) + return res + + def check_apt(self): + output = check_output('apt-get', '-s', 'upgrade') + upgrades = 0 + security_upgrades = 0 + errors = False + for line in output.splitlines(): + if not line.startswith('Inst'): + continue + + upgrades += 1 + + items = line.split() + if len(items) != 5: + errors = True + continue + + release = items[3] + if 'security' in release.lower(): + security_upgrades += 1 + + res = [] + res.append({ + 'result': 'apt_status', + 'char_value': 'Error' if errors else 'OK', + }) + res.append({ + 'result': 'apt_upgrades', + 'float_value': upgrades, + }) + res.append({ + 'result': 'apt_security_upgrades', + 'float_value': security_upgrades, + }) + return res + + def check_disk_writable(self): + path = self.get_attribute('writable_path') + path = path.strip() + if not path.endswith('/'): + path += '/' + try: + with tempfile.TemporaryFile(prefix=path): + pass + except Exception, e: + return [{ + 'result': 'disk_writable', + 'label': path, + 'char_value': 'Error', + 'payload': str(e), + }] + return [{ + 'result': 'disk_writable', + 'label': path, + 'char_value': 'OK', + }] diff --git a/monitoring.xml b/monitoring.xml index e5b4e19..36ca149 100644 --- a/monitoring.xml +++ b/monitoring.xml @@ -293,6 +293,28 @@ + + Process Count + check_process_count + + + Process Count + process_count + float + + + + + Uptime + check_uptime + + + Uptime + uptime + float + + + Process CPU Percent check_process_cpu_percent @@ -354,5 +376,102 @@ float + + + Process Open Ports + check_process_open_ports + + + Process Open Ports Status + process_open_ports_status + char + + + + Load + + + Load + + + + + + + Load + check_load + + + Load 1 + load_1 + float + + + + Load 5 + load_5 + float + + + + Load 15 + load_15 + float + + + + + RAID Status + check_raid + + + RAID Status + raid_status + char + + + + NTP Status + check_ntp_status + + + NTP Status + ntp_offset + float + + + + + APT + check_apt + + + APT Status + apt_status + char + + + APT Upgrades + apt_upgrades + float + + + + APT Security Upgrades + apt_security_upgrades + float + + + + + Disk Writable + check_disk_writable + + + Disk Writable + disk_writable + char + +