Add smart monitoring using a textcollector
Collects the SMART data using smartctl and writes it to the textcollector directory. This expects smartd to be configured to run self-tests at a regular interval to detect whether a disk is broken.
This commit is contained in:
parent
14f2a83aa9
commit
bf5a165303
6 changed files with 131 additions and 0 deletions
|
@ -73,3 +73,7 @@ The Nginx access logs/systemd logs are indexed by loki. For non webserver hosts
|
|||
### AUR monitoring
|
||||
|
||||
Some fun statistics are scraped from aur.archlinux.org using `curl` and `hq` as there is no proper AUR prometheus endpoint as of yet. The statistics cover the AUR packages and users and are retrieved every 5 minutes.
|
||||
|
||||
### Smart
|
||||
|
||||
TODO:
|
||||
|
|
7
hosts
7
hosts
|
@ -182,3 +182,10 @@ america.mirror.pkgbuild.com
|
|||
europe.mirror.pkgbuild.com
|
||||
repro2.pkgbuild.com
|
||||
runner1.archlinux.org
|
||||
|
||||
[dedicated_servers]
|
||||
gemini.archlinux.org
|
||||
build.archlinux.org
|
||||
runner1.archlinux.org
|
||||
runner2.archlinux.org
|
||||
secure-runner1.archlinux.org
|
||||
|
|
63
roles/prometheus_exporters/files/smart-textcollector.sh
Executable file
63
roles/prometheus_exporters/files/smart-textcollector.sh
Executable file
|
@ -0,0 +1,63 @@
|
|||
#!/bin/bash
#
# Export SMART data as Prometheus metrics for a textfile collector.
#
# Usage: smart-textcollector.sh <textcollector directory>
#
# Devices are discovered with `smartctl --scan-open --json`, each one is
# queried with `smartctl -a --json`, and the resulting metrics are written to
# <textcollector directory>/smart.prom. The output is assembled under a
# temporary name and moved into place at the very end, so a half-written file
# never appears under the final name.
#
# Exits 1 when the directory argument is missing.

set -o errexit
set -o nounset

if (( $# != 1 )); then
    echo "Missing textcollector directory argument" >&2
    exit 1
fi

TEXTFILE_COLLECTOR_DIR=${1}
PROM_FILE=$TEXTFILE_COLLECTOR_DIR/smart.prom

TMP_FILE=$PROM_FILE.$$
rm -f -- "$TMP_FILE"

# Single quotes: expand TMP_FILE when the trap fires, and keep it quoted so a
# path containing whitespace is removed as one file.
trap 'rm -f -- "$TMP_FILE"' EXIT

# Metric types
{
    echo "# HELP smart_device_smart_healthy SMART metric device_smart_healthy"
    echo "# TYPE smart_device_smart_healthy gauge"

    echo "# HELP smart_temperature_celsius SMART metric temperature_celsius"
    echo "# TYPE smart_temperature_celsius gauge"

    echo "# HELP smart_device_info Device information, family/model name"
    echo "# TYPE smart_device_info gauge"

    echo "# HELP smart_device_self_test Self test status"
    echo "# TYPE smart_device_self_test gauge"
} >> "$TMP_FILE"

devices="$(smartctl --scan-open --json)"
devices_total="$(jq '.devices | length' <<<"$devices")"

for ((i = 0; i < devices_total; i++)); do
    disk=$(jq -r ".devices[${i}].name" <<<"$devices")
    type=$(jq -r ".devices[${i}].type" <<<"$devices")

    # smartctl's exit status is a bit mask that is non-zero when the disk is
    # failing or has logged errors. Under errexit that would abort the run
    # exactly when the metrics matter most, so tolerate it here and rely on
    # the JSON document instead.
    info=$(smartctl -a --json "$disk") \
        || echo "smartctl exited with status $? for ${disk}" >&2

    # Anything other than an explicit "true" is reported as unhealthy.
    status=$(jq '.smart_status.passed' <<<"$info")
    if [[ "$status" == "true" ]]; then
        echo "smart_device_smart_healthy{disk=\"${disk}\"} 1" >> "$TMP_FILE"
    else
        echo "smart_device_smart_healthy{disk=\"${disk}\"} 0" >> "$TMP_FILE"
    fi

    status=$(jq '.ata_smart_data.self_test.status.passed' <<<"$info")
    if [[ "$status" == "true" ]]; then
        echo "smart_device_self_test{disk=\"${disk}\"} 1" >> "$TMP_FILE"
    else
        echo "smart_device_self_test{disk=\"${disk}\"} 0" >> "$TMP_FILE"
    fi

    # Only emit a temperature sample when smartctl reported a number; a
    # literal "null" is not a valid sample value and would make the whole
    # .prom file unparsable.
    temperature=$(jq '.temperature.current | numbers' <<<"$info")
    if [[ -n "$temperature" ]]; then
        echo "smart_temperature_celsius{disk=\"${disk}\"} ${temperature}" >> "$TMP_FILE"
    fi

    # disk information
    model_family=$(jq -r '.model_family' <<<"$info")
    model_name=$(jq -r '.model_name' <<<"$info")
    serial_number=$(jq -r '.serial_number' <<<"$info")
    echo "smart_device_info{disk=\"$disk\",type=\"$type\",model_family=\"$model_family\",model_name=\"$model_name\",serial_number=\"$serial_number\"} 1" >> "$TMP_FILE"
done

mv -f -- "$TMP_FILE" "$PROM_FILE"
|
|
@ -7,6 +7,10 @@
|
|||
pacman: name=prometheus-blackbox-exporter state=present
|
||||
when: "'prometheus' in group_names"
|
||||
|
||||
- name: install smartmontools for dedicated servers
|
||||
pacman: name=smartmontools state=present
|
||||
when: "'dedicated_servers' in group_names"
|
||||
|
||||
- name: install prometheus-memcached-exporter
|
||||
pacman: name=prometheus-memcached-exporter state=present
|
||||
when: "'memcached' in group_names"
|
||||
|
@ -61,6 +65,7 @@
|
|||
- btrfs-textcollector.sh
|
||||
- aur-textcollector.sh
|
||||
- fail2ban-textcollector.sh
|
||||
- smart-textcollector.sh
|
||||
|
||||
- name: install arch textcollector service
|
||||
template: src=prometheus-arch-textcollector.service.j2 dest=/etc/systemd/system/prometheus-arch-textcollector.service owner=root group=root mode=644
|
||||
|
@ -85,6 +90,18 @@
|
|||
- { name: borg-offsite, service: borg-backup-offsite }
|
||||
when: "'borg_clients' in group_names"
|
||||
|
||||
- name: install smart textcollector service
|
||||
template: src=prometheus-smart-textcollector.service.j2 dest=/etc/systemd/system/prometheus-smart-textcollector.service owner=root group=root mode=644
|
||||
when: "'dedicated_servers' in group_names"
|
||||
|
||||
- name: install smart textcollector timer
|
||||
template: src=prometheus-smart-textcollector.timer.j2 dest=/etc/systemd/system/prometheus-smart-textcollector.timer owner=root group=root mode=644
|
||||
when: "'dedicated_servers' in group_names"
|
||||
|
||||
- name: enable and start prometheus smart textcollector timer
|
||||
systemd: name=prometheus-smart-textcollector.timer enabled=yes daemon_reload=yes state=started
|
||||
when: "'dedicated_servers' in group_names"
|
||||
|
||||
- name: install hetzner textcollector service
|
||||
template: src=prometheus-hetzner-textcollector.service.j2 dest=/etc/systemd/system/prometheus-hetzner-textcollector.service owner=root group=root mode=644
|
||||
when: "inventory_hostname == 'monitoring.archlinux.org'"
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
[Unit]
|
||||
Description=Prometheus Smart Exporter TextCollector
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/smart-textcollector.sh {{ prometheus_textfile_dir }}
|
||||
|
||||
NoNewPrivileges=true
|
||||
LockPersonality=true
|
||||
|
||||
PrivateTmp=true
|
||||
ProtectSystem=strict
|
||||
ProtectHome=read-only
|
||||
ReadWritePaths={{ prometheus_textfile_dir }}
|
||||
|
||||
MemoryDenyWriteExecute=true
|
||||
RemoveIPC=true
|
||||
RestrictRealtime=true
|
||||
RestrictNamespaces=true
|
||||
RestrictSUIDSGID=true
|
||||
|
||||
PrivateNetwork=true
|
||||
ProtectHostname=true
|
||||
ProtectControlGroups=true
|
||||
ProtectKernelLogs=true
|
||||
ProtectKernelTunables=true
|
||||
ProtectKernelModules=true
|
||||
#ProtectClock=true
|
||||
|
||||
SystemCallArchitectures=native
|
|
@ -0,0 +1,10 @@
|
|||
[Unit]
|
||||
Description=Prometheus Smart Exporter TextCollector Timer
|
||||
|
||||
[Timer]
|
||||
OnUnitActiveSec=1h
|
||||
OnBootSec=15min
|
||||
RandomizedDelaySec=1min
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
Loading…
Reference in a new issue