Add the following checks:

- process count
- uptime
- process open ports
- load
- raid
- ntp status
- apt
- disk writable
This commit is contained in:
Albert Cervera i Areny 2015-02-02 01:54:02 +01:00
parent ded01235db
commit 361f015bb7
2 changed files with 352 additions and 9 deletions

View File

@ -1,12 +1,39 @@
# The COPYRIGHT file at the top level of this repository contains the full
# copyright notices and license terms.
from trytond.pool import PoolMeta
import os
import psutil
import json
import tempfile
import socket
import subprocess
from datetime import datetime
from trytond.pool import PoolMeta
# Names exported by ``from <module> import *``.
__all__ = ['CheckPlan']
# Python 2 module-level metaclass: classes declared below without explicit
# bases (such as CheckPlan) are created through PoolMeta.
__metaclass__ = PoolMeta
def check_output(*args):
    """Run a command and return everything it wrote to standard output.

    ``args`` is the program followed by its arguments. They are handed to
    ``subprocess.Popen`` as a sequence, so no shell is involved. Standard
    error and the exit status are discarded.
    """
    process = subprocess.Popen(args, stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    # communicate() drains both pipes while waiting for the child. Calling
    # wait() before reading deadlocks as soon as the child fills a pipe
    # buffer that nobody is reading.
    data, _ = process.communicate()
    return data
def to_float(text):
    """Convert ``text`` to a float, or return None when it is not numeric.

    Anything ``float()`` rejects yields None, including None itself and
    other non-string, non-numeric objects.
    """
    try:
        return float(text)
    except (TypeError, ValueError):
        # ValueError: unparsable string. TypeError: None or another object
        # float() cannot convert at all.
        return None
# Human-readable protocol name for each socket type. check_process_open_ports
# uses it to label sockets and ignores any socket type not listed here.
PROTOCOLS = {
socket.SOCK_STREAM: 'TCP',
socket.SOCK_DGRAM: 'UDP',
}
class CheckPlan:
__name__ = 'monitoring.check.plan'
@ -86,10 +113,24 @@ class CheckPlan:
})
return res
def check_process_count(self):
    """Report how many process ids psutil currently lists.

    Returns a one-element list holding the ``process_count`` result.
    """
    pids = psutil.pids()
    result = {
        'result': 'process_count',
        'float_value': len(pids),
    }
    return [result]
def check_uptime(self):
    """Report the number of seconds elapsed since the host booted.

    Returns a one-element list holding the ``uptime`` result.
    """
    # Compare both instants in UTC. Naive local datetimes would be off by
    # the DST shift whenever a clock change happened since boot.
    boot_time = datetime.utcfromtimestamp(psutil.boot_time())
    uptime = (datetime.utcnow() - boot_time).total_seconds()
    return [{
        'result': 'uptime',
        'float_value': uptime,
    }]
def check_process_cpu_percent(self):
processes = self.get_attribute('processes')
if processes:
processes = processes.split(',')
processes = [x.strip() for x in processes.split(',')]
res = []
for process in psutil.process_iter():
try:
@ -97,7 +138,7 @@ class CheckPlan:
if processes and name not in processes:
continue
cpu = process.cpu_percent()
except psutil.NoSuchProcess:
except (psutil.NoSuchProcess, psutil.AccessDenied):
continue
res.append({
'result': 'process_cpu_percent',
@ -109,7 +150,7 @@ class CheckPlan:
def check_process_open_files_count(self):
processes = self.get_attribute('processes')
if processes:
processes = processes.split(',')
processes = [x.strip() for x in processes.split(',')]
res = []
for process in psutil.process_iter():
try:
@ -117,7 +158,7 @@ class CheckPlan:
if processes and name not in processes:
continue
files = process.num_fds()
except psutil.NoSuchProcess:
except (psutil.NoSuchProcess, psutil.AccessDenied):
continue
res.append({
'result': 'process_open_files_count',
@ -129,7 +170,7 @@ class CheckPlan:
def check_process_memory_percent(self):
processes = self.get_attribute('processes')
if processes:
processes = processes.split(',')
processes = [x.strip() for x in processes.split(',')]
res = []
for process in psutil.process_iter():
try:
@ -137,7 +178,7 @@ class CheckPlan:
if processes and name not in processes:
continue
memory = process.memory_percent()
except psutil.NoSuchProcess:
except (psutil.NoSuchProcess, psutil.AccessDenied):
continue
res.append({
'result': 'process_memory_percent',
@ -149,7 +190,7 @@ class CheckPlan:
def check_process_io_counters(self):
processes = self.get_attribute('processes')
if processes:
processes = processes.split(',')
processes = [x.strip() for x in processes.split(',')]
res = []
for process in psutil.process_iter():
try:
@ -157,7 +198,7 @@ class CheckPlan:
if processes and name not in processes:
continue
counters = process.io_counters()
except psutil.NoSuchProcess:
except (psutil.NoSuchProcess, psutil.AccessDenied):
continue
for name in ('read_count', 'write_count', 'read_bytes',
'write_bytes'):
@ -167,3 +208,186 @@ class CheckPlan:
'float_value': getattr(counters, name),
})
return res
def check_process_open_ports(self):
    '''
    Report whether every listening socket is on the list of allowed ports.

    Expected structure in the process_open_ports attribute, one entry per
    whitespace-separated token:

    protocol:ip:port

    Use * as ip to allow the port on every address (0.0.0.0 and ::).

    Example:

    TCP:*:22
    TCP:*:8000

    Entries that cannot be parsed are ignored. Returns a one-element list
    holding the process_open_ports_status result: char_value is 'OK' or
    'Error' and the JSON payload lists each unexpected
    (protocol, ip, port) socket once.
    '''
    valid_entries = set()
    allowed = self.get_attribute('process_open_ports') or ''
    for entry in allowed.split():
        # An IPv6 address contains ':' itself, so peel the protocol off the
        # front and the port off the back instead of splitting in three.
        protocol, _, rest = entry.partition(':')
        ip, _, port = rest.rpartition(':')
        if not (protocol and ip and port):
            continue
        try:
            # psutil reports ports as integers; a string would never match.
            port = int(port)
        except ValueError:
            continue
        protocol = protocol.upper()
        if ip == '*':
            valid_entries.add((protocol, '0.0.0.0', port))
            valid_entries.add((protocol, '::', port))
        else:
            valid_entries.add((protocol, ip, port))

    invalids = []
    for process in psutil.process_iter():
        try:
            # psutil 2.0 renamed get_connections() to connections(); keep
            # the old name only as a fallback for older releases.
            if hasattr(process, 'connections'):
                connections = process.connections()
            else:
                connections = process.get_connections()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
        for connection in connections:
            if connection.status != 'LISTEN':
                continue
            if connection.type not in PROTOCOLS:
                continue
            entry = (PROTOCOLS[connection.type], connection.laddr[0],
                connection.laddr[1])
            # Several processes may share one listening socket; report
            # each offending socket only once.
            if entry not in valid_entries and entry not in invalids:
                invalids.append(entry)

    return [{
        'result': 'process_open_ports_status',
        'char_value': 'Error' if invalids else 'OK',
        'payload': json.dumps({
            'invalid_ports': invalids,
        }),
    }]
def check_load(self):
    """Report the system load averages returned by ``os.getloadavg()``.

    Returns three results, in order: ``load_1``, ``load_5`` and
    ``load_15``.
    """
    names = ('load_1', 'load_5', 'load_15')
    return [
        {'result': name, 'float_value': value}
        for name, value in zip(names, os.getloadavg())
    ]
def check_raid(self):
    """
    Report the state of the md arrays listed in /proc/mdstat.

    Expected values in raid_devices:

    md0, md1

    When raid_devices is empty every array found is reported. For each
    array the line following its "mdX : ..." header is inspected: the array
    is 'OK' when that line carries a member-status token such as [UU] or
    [UUUU] made only of U, and 'Error' otherwise (a _ marks a missing
    member). The payload holds the two lines that were inspected.
    """
    devices = self.get_attribute('raid_devices')
    if devices:
        devices = [x.strip() for x in devices.split(',')]

    # Close the file deterministically instead of leaking the handle.
    with open('/proc/mdstat', 'r') as mdstat:
        lines = mdstat.readlines()

    current_device = None
    current_payload = ''
    res = []
    for line in lines:
        if line.startswith('md'):
            current_device = line.split()[0]
            current_payload = line
            continue
        if not current_device:
            continue
        if devices and current_device not in devices:
            current_device = None
            current_payload = ''
            continue
        current_payload += line
        # Member-status tokens look like [UU] or [U_U]; counters such as
        # [2/2] are excluded because they contain other characters.
        flags = [token[1:-1] for token in line.split()
            if token.startswith('[') and token.endswith(']')]
        flags = [flag for flag in flags if flag and set(flag) <= set('U_')]
        if flags and all('_' not in flag for flag in flags):
            state = 'OK'
        else:
            state = 'Error'
        res.append({
            'result': 'raid_status',
            'label': current_device,
            'char_value': state,
            'payload': json.dumps({
                'output': current_payload,
            }),
        })
        current_device = None
        current_payload = ''
    return res
def check_ntp_status(self):
    """Report the clock offset printed by ``ntpdate -q pool.ntp.org``.

    The last output line is expected to end in "offset <number> sec".
    Returns a one-element list holding the ``ntp_offset`` result. When the
    output is empty or does not have that shape, the value is the sentinel
    999999.
    """
    output = check_output('/usr/sbin/ntpdate', '-q', 'pool.ntp.org')
    lines = output.splitlines()
    # No output at all (e.g. the query failed) leaves no line to parse.
    tokens = lines[-1].split() if lines else []
    value = 999999
    if len(tokens) >= 3 and tokens[-1] == 'sec' and tokens[-3] == 'offset':
        try:
            value = float(tokens[-2])
        except ValueError:
            # Keep the sentinel when the offset is not a number.
            pass
    return [{
        'result': 'ntp_offset',
        'float_value': value,
    }]
def check_apt(self):
    """Summarise the pending upgrades listed by ``apt-get -s upgrade``.

    Every output line starting with "Inst" counts as one upgrade. Returns
    three results: ``apt_status`` ('Error' if any such line does not have
    exactly five whitespace-separated fields, else 'OK'), ``apt_upgrades``
    and ``apt_security_upgrades`` (five-field lines whose fourth field
    contains "security", case-insensitively).

    NOTE(review): the five-field layout and the position of the release
    field are assumptions about the apt-get output format; confirm them
    against the apt versions in use.
    """
    output = check_output('apt-get', '-s', 'upgrade')
    inst_lines = [row for row in output.splitlines()
        if row.startswith('Inst')]

    errors = False
    security_upgrades = 0
    for row in inst_lines:
        fields = row.split()
        if len(fields) != 5:
            errors = True
        elif 'security' in fields[3].lower():
            security_upgrades += 1

    return [
        {
            'result': 'apt_status',
            'char_value': 'Error' if errors else 'OK',
        },
        {
            'result': 'apt_upgrades',
            'float_value': len(inst_lines),
        },
        {
            'result': 'apt_security_upgrades',
            'float_value': security_upgrades,
        },
    ]
def check_disk_writable(self):
    """Check that a temporary file can be created under writable_path.

    Returns a one-element list holding the ``disk_writable`` result. The
    label is the path with a trailing slash, char_value is 'OK' or
    'Error', and on error the payload carries the exception text.
    """
    path = self.get_attribute('writable_path')
    path = path.strip()
    if not path.endswith('/'):
        path += '/'
    try:
        # dir= creates the file directly inside path. Passing the path as
        # prefix= only worked because an absolute prefix overrides the
        # default temp directory, which still had to be resolvable first.
        with tempfile.TemporaryFile(dir=path):
            pass
    except Exception as e:
        # Any failure means the path is not usable; report it as a result
        # instead of letting the check itself crash.
        return [{
            'result': 'disk_writable',
            'label': path,
            'char_value': 'Error',
            'payload': str(e),
        }]
    return [{
        'result': 'disk_writable',
        'label': path,
        'char_value': 'OK',
    }]

View File

@ -293,6 +293,28 @@
<field name="uom" ref="product.uom_unit"/>
</record>
<record model="monitoring.check.type" id="check_process_count">
<field name="name">Process Count</field>
<field name="internal_name">check_process_count</field>
</record>
<record model="monitoring.result.type" id="result_process_count">
<field name="name">Process Count</field>
<field name="internal_name">process_count</field>
<field name="type">float</field>
<field name="uom" ref="product.uom_unit"/>
</record>
<record model="monitoring.check.type" id="check_uptime">
<field name="name">Uptime</field>
<field name="internal_name">check_uptime</field>
</record>
<record model="monitoring.result.type" id="result_uptime">
<field name="name">Uptime</field>
<field name="internal_name">uptime</field>
<field name="type">float</field>
<field name="uom" ref="product.uom_second"/>
</record>
<record model="monitoring.check.type" id="check_process_cpu_percent">
<field name="name">Process CPU Percent</field>
<field name="internal_name">check_process_cpu_percent</field>
@ -354,5 +376,102 @@
<field name="type">float</field>
<field name="uom" ref="monitoring.uom_byte"/>
</record>
<record model="monitoring.check.type" id="check_process_open_ports">
<field name="name">Process Open Ports</field>
<field name="internal_name">check_process_open_ports</field>
</record>
<record model="monitoring.result.type" id="result_process_open_ports_status">
<field name="name">Process Open Ports Status</field>
<field name="internal_name">process_open_ports_status</field>
<field name="type">char</field>
</record>
<record model="product.uom.category" id="uom_cat_load">
<field name="name">Load</field>
</record>
<record model="product.uom" id="uom_load">
<field name="name">Load</field>
<field name="symbol"> </field>
<field name="category" ref="uom_cat_load"/>
<field name="factor" eval="1"/>
<field name="rate" eval="1"/>
</record>
<record model="monitoring.check.type" id="check_load">
<field name="name">Load</field>
<field name="internal_name">check_load</field>
</record>
<record model="monitoring.result.type" id="result_load_1">
<field name="name">Load 1</field>
<field name="internal_name">load_1</field>
<field name="type">float</field>
<field name="uom" ref="uom_load"/>
</record>
<record model="monitoring.result.type" id="result_load_5">
<field name="name">Load 5</field>
<field name="internal_name">load_5</field>
<field name="type">float</field>
<field name="uom" ref="uom_load"/>
</record>
<record model="monitoring.result.type" id="result_load_15">
<field name="name">Load 15</field>
<field name="internal_name">load_15</field>
<field name="type">float</field>
<field name="uom" ref="uom_load"/>
</record>
<record model="monitoring.check.type" id="check_raid">
<field name="name">RAID Status</field>
<field name="internal_name">check_raid</field>
</record>
<record model="monitoring.result.type" id="result_raid_status">
<field name="name">RAID Status</field>
<field name="internal_name">raid_status</field>
<field name="type">char</field>
</record>
<record model="monitoring.check.type" id="check_ntp_status">
<field name="name">NTP Status</field>
<field name="internal_name">check_ntp_status</field>
</record>
<record model="monitoring.result.type" id="result_ntp_status">
<field name="name">NTP Status</field>
<field name="internal_name">ntp_offset</field>
<field name="type">float</field>
<field name="uom" ref="product.uom_second"/>
</record>
<record model="monitoring.check.type" id="check_apt">
<field name="name">APT</field>
<field name="internal_name">check_apt</field>
</record>
<record model="monitoring.result.type" id="result_apt_status">
<field name="name">APT Status</field>
<field name="internal_name">apt_status</field>
<field name="type">char</field>
</record>
<record model="monitoring.result.type" id="result_apt_upgrades">
<field name="name">APT Upgrades</field>
<field name="internal_name">apt_upgrades</field>
<field name="type">float</field>
<field name="uom" ref="product.uom_unit"/>
</record>
<record model="monitoring.result.type" id="result_apt_security_upgrades">
<field name="name">APT Security Upgrades</field>
<field name="internal_name">apt_security_upgrades</field>
<field name="type">float</field>
<field name="uom" ref="product.uom_unit"/>
</record>
<record model="monitoring.check.type" id="check_disk_writable">
<field name="name">Disk Writable</field>
<field name="internal_name">check_disk_writable</field>
</record>
<record model="monitoring.result.type" id="result_disk_writable">
<field name="name">Disk Writable</field>
<field name="internal_name">disk_writable</field>
<field name="type">char</field>
</record>
</data>
</tryton>