Add smart monitoring using a textcollector

Collects the SMART data using smartctl and writes it to the
textcollector directory. This expects smartd to be configured to run
self-tests at a regular interval to detect if a disk is broken.
This commit is contained in:
Jelle van der Waa 2021-02-26 22:25:39 +01:00
parent 14f2a83aa9
commit bf5a165303
No known key found for this signature in database
GPG key ID: C06086337C50773E
6 changed files with 131 additions and 0 deletions

View file

@ -73,3 +73,7 @@ The Nginx access logs/systemd logs are indexed by loki. For non webserver hosts
### AUR monitoring
Some fun statistics are scraped from aur.archlinux.org using `curl` and `hq`, as there is no proper AUR Prometheus endpoint as of yet. The statistics cover the AUR packages and users and are retrieved every 5 minutes.
### Smart
TODO:

7
hosts
View file

@ -182,3 +182,10 @@ america.mirror.pkgbuild.com
europe.mirror.pkgbuild.com
repro2.pkgbuild.com
runner1.archlinux.org
# Hosts in this group get smartmontools and the SMART textcollector
# service/timer installed (the tasks are conditioned on
# 'dedicated_servers' in group_names).
[dedicated_servers]
gemini.archlinux.org
build.archlinux.org
runner1.archlinux.org
runner2.archlinux.org
secure-runner1.archlinux.org

View file

@ -0,0 +1,63 @@
#!/bin/bash
#
# smart-textcollector.sh - export SMART data for the Prometheus textfile
# collector.
#
# Usage: smart-textcollector.sh <textfile-collector-directory>
#
# Enumerates devices with `smartctl --scan-open --json`, queries each one with
# `smartctl -a --json` and writes the result to <directory>/smart.prom.
# The output is assembled under a temporary name in the same directory and
# renamed into place at the end, so a reader never sees a half-written file.
#
# Exported metrics:
#   smart_device_smart_healthy  1 if .smart_status.passed is true, else 0
#   smart_device_self_test      1 if .ata_smart_data.self_test.status.passed
#                               is true, else 0 (a missing field counts as 0)
#   smart_temperature_celsius   .temperature.current, only when it is a number
#   smart_device_info           constant 1 with type/model/serial labels
#
# Requires: smartctl (smartmontools) and jq.

set -o errexit
set -o nounset

if (( $# != 1 )); then
  echo "Missing textcollector directory argument" >&2
  exit 1
fi

TEXTFILE_COLLECTOR_DIR=${1}
PROM_FILE=$TEXTFILE_COLLECTOR_DIR/smart.prom
TMP_FILE=$PROM_FILE.$$

# Start from a clean temporary file and make sure it never outlives the script.
rm -f -- "$TMP_FILE"
trap 'rm -f -- "$TMP_FILE"' EXIT

# Escape a string for use as a Prometheus label value: backslash, double quote
# and newline are the characters the text exposition format requires escaping.
escape_label() {
  local value=$1
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  printf '%s' "$value"
}

# Query every device and print the complete metrics document on stdout.
# Samples are buffered per metric family so that each family is emitted as one
# contiguous group right after its HELP/TYPE lines.
collect_metrics() {
  local devices devices_total i
  local disk dev_type info rc status value
  local disk_label model_family model_name serial_number
  local healthy='' self_test='' temperature='' device_info=''

  devices=$(smartctl --scan-open --json)
  devices_total=$(jq '.devices | length' <<<"$devices")

  for ((i = 0; i < devices_total; i++)); do
    disk=$(jq -r ".devices[${i}].name" <<<"$devices")
    dev_type=$(jq -r ".devices[${i}].type" <<<"$devices")
    disk_label=$(escape_label "$disk")

    # smartctl's exit status is a bitmask. Bits 0 and 1 mean the command line
    # was rejected or the device could not be opened; the higher bits report
    # SMART conditions (failing disk, logged errors, failed self-test).
    # Those are exactly the cases that must be exported, so only bits 0/1
    # are treated as fatal instead of letting errexit abort on any non-zero.
    rc=0
    info=$(smartctl -a --json "$disk") || rc=$?
    if (( rc & 3 )); then
      echo "smartctl failed for ${disk} (exit status ${rc})" >&2
      exit "$rc"
    fi

    status=$(jq '.smart_status.passed' <<<"$info")
    if [[ "$status" == "true" ]]; then value=1; else value=0; fi
    healthy+="smart_device_smart_healthy{disk=\"${disk_label}\"} ${value}"$'\n'

    status=$(jq '.ata_smart_data.self_test.status.passed' <<<"$info")
    if [[ "$status" == "true" ]]; then value=1; else value=0; fi
    self_test+="smart_device_self_test{disk=\"${disk_label}\"} ${value}"$'\n'

    # `numbers` yields nothing for null/missing values; printing "null" as a
    # sample value would make the whole file unparseable.
    value=$(jq '.temperature.current | numbers' <<<"$info")
    if [[ -n "$value" ]]; then
      temperature+="smart_temperature_celsius{disk=\"${disk_label}\"} ${value}"$'\n'
    fi

    # disk information
    model_family=$(escape_label "$(jq -r '.model_family' <<<"$info")")
    model_name=$(escape_label "$(jq -r '.model_name' <<<"$info")")
    serial_number=$(escape_label "$(jq -r '.serial_number' <<<"$info")")
    device_info+="smart_device_info{disk=\"${disk_label}\",type=\"$(escape_label "$dev_type")\",model_family=\"${model_family}\",model_name=\"${model_name}\",serial_number=\"${serial_number}\"} 1"$'\n'
  done

  printf '%s\n' \
    "# HELP smart_device_smart_healthy SMART metric device_smart_healthy" \
    "# TYPE smart_device_smart_healthy gauge"
  printf '%s' "$healthy"

  printf '%s\n' \
    "# HELP smart_temperature_celsius SMART metric temperature_celsius" \
    "# TYPE smart_temperature_celsius gauge"
  printf '%s' "$temperature"

  printf '%s\n' \
    "# HELP smart_device_info Device information, family/model name" \
    "# TYPE smart_device_info gauge"
  printf '%s' "$device_info"

  printf '%s\n' \
    "# HELP smart_device_self_test Self test status" \
    "# TYPE smart_device_self_test gauge"
  printf '%s' "$self_test"
}

collect_metrics > "$TMP_FILE"

# Rename within the same directory so the update is atomic for readers.
mv -f -- "$TMP_FILE" "$PROM_FILE"

View file

@ -7,6 +7,10 @@
pacman: name=prometheus-blackbox-exporter state=present
when: "'prometheus' in group_names"
# smartmontools provides smartctl, which smart-textcollector.sh invokes.
# Only hosts in the dedicated_servers inventory group get the package.
- name: install smartmontools for dedicated servers
pacman: name=smartmontools state=present
when: "'dedicated_servers' in group_names"
- name: install prometheus-memcached-exporter
pacman: name=prometheus-memcached-exporter state=present
when: "'memcached' in group_names"
@ -61,6 +65,7 @@
- btrfs-textcollector.sh
- aur-textcollector.sh
- fail2ban-textcollector.sh
- smart-textcollector.sh
- name: install arch textcollector service
template: src=prometheus-arch-textcollector.service.j2 dest=/etc/systemd/system/prometheus-arch-textcollector.service owner=root group=root mode=644
@ -85,6 +90,18 @@
- { name: borg-offsite, service: borg-backup-offsite }
when: "'borg_clients' in group_names"
# SMART textcollector deployment. Every step is limited to hosts in the
# dedicated_servers inventory group.
# Render the service unit that runs the collector script.
- name: install smart textcollector service
template: src=prometheus-smart-textcollector.service.j2 dest=/etc/systemd/system/prometheus-smart-textcollector.service owner=root group=root mode=644
when: "'dedicated_servers' in group_names"
# Render the timer unit that schedules the service.
- name: install smart textcollector timer
template: src=prometheus-smart-textcollector.timer.j2 dest=/etc/systemd/system/prometheus-smart-textcollector.timer owner=root group=root mode=644
when: "'dedicated_servers' in group_names"
# daemon_reload=yes has systemd re-read its unit files, so the units installed
# above are known before the timer is enabled and started.
- name: enable and start prometheus smart textcollector timer
systemd: name=prometheus-smart-textcollector.timer enabled=yes daemon_reload=yes state=started
when: "'dedicated_servers' in group_names"
- name: install hetzner textcollector service
template: src=prometheus-hetzner-textcollector.service.j2 dest=/etc/systemd/system/prometheus-hetzner-textcollector.service owner=root group=root mode=644
when: "inventory_hostname == 'monitoring.archlinux.org'"

View file

@ -0,0 +1,30 @@
# Runs smart-textcollector.sh once per activation; scheduling is handled by
# the prometheus-smart-textcollector.timer unit.
[Unit]
Description=Prometheus Smart Exporter TextCollector
[Service]
# One-shot job: the script runs to completion and then exits.
Type=oneshot
# The single argument is the directory the script writes smart.prom into.
ExecStart=/usr/local/bin/smart-textcollector.sh {{ prometheus_textfile_dir }}
# Sandboxing options. With ProtectSystem=strict the file system is read-only
# for this service; the textfile directory is re-opened for writing through
# ReadWritePaths because the script creates its output there.
NoNewPrivileges=true
LockPersonality=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=read-only
ReadWritePaths={{ prometheus_textfile_dir }}
MemoryDenyWriteExecute=true
RemoveIPC=true
RestrictRealtime=true
RestrictNamespaces=true
RestrictSUIDSGID=true
PrivateNetwork=true
ProtectHostname=true
ProtectControlGroups=true
ProtectKernelLogs=true
ProtectKernelTunables=true
ProtectKernelModules=true
# NOTE(review): ProtectClock is deliberately left disabled; presumably it
# interferes with smartctl's access to the disk devices - confirm before
# enabling it.
#ProtectClock=true
SystemCallArchitectures=native

View file

@ -0,0 +1,10 @@
# Timer for the SMART textcollector. No Unit= is set, so it activates the
# service unit sharing this timer's name.
[Unit]
Description=Prometheus Smart Exporter TextCollector Timer
[Timer]
# Run again one hour after the service was last activated.
OnUnitActiveSec=1h
# First run 15 minutes after boot.
OnBootSec=15min
# Add a random delay of up to one minute to each activation.
RandomizedDelaySec=1min
[Install]
WantedBy=timers.target