maintenance/hydra/berlin.scm

701 lines
29 KiB
Scheme
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

;; OS configuration for "berlin", the frontend of the compile farm
;; hosted at the MDC.
;; Copyright © 2016-2023 Ludovic Courtès <ludo@gnu.org>
;; Copyright © 2017, 2018, 2019, 2020, 2021, 2022 Ricardo Wurmus <rekado@elephly.net>
;; Copyright © 2019, 2021 Julien Lepiller <julien@lepiller.eu>
;; Copyright © 2020, 2021 Florian Pelz <pelzflorian@pelzflorian.de>
;; Copyright © 2020, 2021 Mathieu Othacehe <othacehe@gnu.org>
;; Copyright © 2021 Tobias Geerinckx-Rice <me@tobias.gr>
;; Copyright © 2022, 2023 Maxim Cournoyer <maxim.cournoyer@gmail.com>
;; Copyright © 2023 Andreas Enge <andreas@enge.fr>
;; Copyright © 2023 Arun Isaac <arunisaac@systemreboot.net>
;; Released under the GNU GPLv3 or any later version.
(use-modules (gnu) (guix) (sysadmin services) (sysadmin people) (sysadmin dns)
(sysadmin web)
(guix git-download)
(guix modules)
((guix utils) #:select (current-source-directory))
((guix build utils) #:select (find-files))
(srfi srfi-1)
(ice-9 match))
(use-service-modules avahi base databases dns linux
mcron monitoring networking admin
rsync shepherd ssh vpn web)
(use-package-modules admin base certs databases disk emacs linux mail monitoring
screen ssh tls tor vim package-management
version-control
web wget ci rsync
guile-xyz)
(define %sysadmins
;; The sysadmins of this machine.  Each record gives the account name,
;; the person's full name, and the SSH public key file (a literal file
;; name under keys/ssh/).  The list is passed to 'frontend-services'
;; in the 'operating-system' declaration at the end of this file.
(list (sysadmin (name "ludo")
(full-name "Ludovic Courtès")
(ssh-public-key (local-file "keys/ssh/ludo.pub")))
(sysadmin (name "rekado")
(full-name "Ricardo Wurmus")
(ssh-public-key (local-file "keys/ssh/rekado.pub")))
(sysadmin (name "andreas")
(full-name "Andreas Enge")
(ssh-public-key (local-file "keys/ssh/andreas.pub")))
(sysadmin (name "mbakke")
(full-name "Marius Bakke")
(ssh-public-key (local-file "keys/ssh/mbakke.pub")))
(sysadmin (name "nckx")
(full-name "Tobias Geerinckx-Rice")
(ssh-public-key (local-file "keys/ssh/nckx.pub")))
(sysadmin (name "mathieu")
(full-name "Mathieu Othacehe")
(ssh-public-key (local-file "keys/ssh/mathieu.pub")))
(sysadmin (name "pimi")
(full-name "Mădălin Patrascu")
(ssh-public-key (local-file "keys/ssh/pimi.pub")))
(sysadmin (name "janneke")
(full-name "Jan (janneke) Nieuwenhuizen")
(ssh-public-key (local-file "keys/ssh/janneke.pub")))
(sysadmin (name "cbaines")
(full-name "Christopher Baines")
(ssh-public-key (local-file "keys/ssh/cbaines.pub")))
(sysadmin (name "lfam")
(full-name "Leo Famulari")
(ssh-public-key (local-file "keys/ssh/lfam.pub")))
(sysadmin (name "maxim")
(full-name "Maxim Cournoyer")
(ssh-public-key (local-file "keys/ssh/maxim.pub")))
(sysadmin (name "arunisaac")
(full-name "Arun Isaac")
(ssh-public-key (local-file "keys/ssh/arunisaac.pub")))
(sysadmin (name "efraim")
(full-name "Efraim Flashner")
(ssh-public-key (local-file "keys/ssh/efraim.pub")))))
(include "nginx/berlin.scm")
(include "website.scm")
;;;
;;; Operating system.
;;;
(define %motd
;; Message of the day: a plain file named "motd" whose contents are the
;; logo and the list of best practices below.  It is handed to
;; 'frontend-services' through its #:motd argument.
(plain-file "motd"
"\
░░░ ░░░
░░▒▒░░░░░░░░░ ░░░░░░░░░▒▒░░
░░▒▒▒▒▒░░░░░░░ ░░░░░░░▒▒▒▒▒░
░▒▒▒░░▒▒▒▒▒ ░░░░░░░▒▒░
░▒▒▒▒░ ░░░░░░
▒▒▒▒▒ ░░░░░░
▒▒▒▒▒ ░░░░░
░▒▒▒▒▒ ░░░░░ Welcome to berlin!
▒▒▒▒▒ ░░░░░
▒▒▒▒▒ ░░░░░
░▒▒▒▒▒░░░░░
▒▒▒▒▒▒░░░
▒▒▒▒▒▒░
Best practices:
1. Store everything in guix-maintenance.git.
2. To reconfigure, use the latest Guix available and your personal,
up-to-date copy of guix-maintenance, e.g. '~/src/guix-maintenance'.
Use 'sudo' to reconfigure to leave traces.
3. Notify guix-sysadmin@gnu.org when reconfiguring.
4. Notify guix-sysadmin@gnu.org when something goes wrong.
5. Notify ricardo.wurmus@mdc-berlin.de or rekado@elephly.net when the
machine doesn't respond. Only Ricardo has access to the serial console
to reset the machine.
Happy hacking!\n"))
(define %multipath.conf
;; Contents of the multipath configuration file; it is installed as
;; "multipath.conf" through 'etc-service-type' in the 'operating-system'
;; declaration below.  The configuration blacklists every device and
;; then makes an exception for COMPELNT "Compellent Vol" volumes.
(plain-file "multipath.conf"
"\
defaults {
user_friendly_names \"yes\"
find_multipaths \"yes\"
}
blacklist {
devnode \"!^(sd[a-z]|dasd[a-z]|nvme[0-9])\"
device {
vendor \".*\"
product \".*\"
}
}
# allow only Dell Compelent volumes
blacklist_exceptions {
device {
vendor \"COMPELNT\"
product \"Compellent Vol\"
}
}
devices {
device {
vendor \"COMPELNT\"
product \"Compellent Vol\"
path_grouping_policy \"group_by_prio\"
failback \"immediate\"
no_path_retry \"queue\"
}
}
"))
(define %copy-kernel-and-initrd
  ;; Activation snippet.  GRUB cannot see the storage device that holds
  ;; the root file system, so the directories containing the current
  ;; kernel and initrd are mirrored under /boot/@root/, where GRUB can
  ;; find them.
  (with-imported-modules '((guix build utils))
    #~(begin
        (use-modules (guix build utils))
        (let ((mirror-to-boot
               ;; Copy DIRECTORY, an absolute file name, to the same
               ;; path below /boot/@root/.
               (lambda (directory)
                 (let ((destination (string-append "/boot/@root/"
                                                   directory)))
                   (format #t "copying '~a' to /boot/@root/~%" directory)
                   (mkdir-p (dirname destination))
                   (copy-recursively directory destination))))
              ;; /run/current-system/kernel is a profile.  Resolving the
              ;; symlinks yields the actual directory name, which is
              ;; what 'grub.cfg' refers to.
              (kernel-directory
               (dirname
                (canonicalize-path "/run/current-system/kernel/bzImage")))
              (initrd-directory
               (dirname
                (canonicalize-path "/run/current-system/initrd"))))
          (mirror-to-boot kernel-directory)
          (mirror-to-boot initrd-directory)))))
(define %build-node-key-directory
  ;; Directory holding the signing keys of the build nodes, located
  ;; relative to the directory of this source file.
  (let ((here (current-source-directory)))
    (string-append here "/keys/guix/berlin")))
(define %build-node-keys
  ;; Signing keys of the build nodes: one 'local-file' object per "*.pub"
  ;; file found under %BUILD-NODE-KEY-DIRECTORY.  The signing key of the
  ;; head node should be available so that it can use cached substitutes
  ;; that no longer exist in its store.
  (let* ((colon->dash (lambda (chr)
                        (if (char=? chr #\:) #\- chr)))
         ;; Name given to the 'local-file' object: the base name of FILE
         ;; with every colon replaced by a dash.
         (key-name (lambda (file)
                     (string-map colon->dash (basename file))))
         (key-files (find-files %build-node-key-directory "\\.pub$")))
    (map (lambda (file)
           (local-file file (key-name file)))
         key-files)))
;;;
;;; Backups.
;;;
(define %rsync-modules-for-backup
  ;; Rsync modules exporting the directories that get backed up on
  ;; another machine on the project's VPN.  Each entry of the table below
  ;; is a (MODULE-NAME . DIRECTORY) pair.
  (map (match-lambda
         ((module-name . directory)
          (rsync-module
           (name module-name)
           (file-name directory))))
       '(("web-pdf"            . "/srv/guix-pdfs")
         ("web-video"          . "/srv/videos")
         ("web-audio"          . "/srv/audio")
         ("web-cuirass"        . "/srv/cuirass-releases")
         ("web-cuirass-manual" . "/srv/cuirass-manual")
         ("disarchive"         . "/gnu/disarchive")
         ("substitutes"        . "/var/cache/guix/publish"))))
;;;
;;; Btrfs pools.
;;;
;;; Large Btrfs partition on the MDC-provided SAN storage (100 TiB).
(define %btrfs-san-uuid
  ;; UUID of the Btrfs file system; the default device for the Btrfs
  ;; mounts declared in this file.
  "d5d1a040-7f2a-4c38-9a89-82f08866f6ec")
(define %common-btrfs-options
  ;; Mount options, as an alist, shared by every Btrfs mount declared in
  ;; this file.
  (list (cons "compress" "zstd")
        (cons "space_cache" "v2")))
(define %btrfs-pool-san
  ;; Mount of the whole Btrfs pool (subvolid=5) at /mnt/btrfs-pool-san,
  ;; using the common Btrfs mount options.
  (let ((pool-options (acons "subvolid" "5" %common-btrfs-options)))
    (file-system
      (device (uuid %btrfs-san-uuid))
      (mount-point "/mnt/btrfs-pool-san")
      (type "btrfs")
      (options (alist->file-system-options pool-options)))))
(define* (btrfs-subvolume-mount name mount-point
                                #:key (device-uuid %btrfs-san-uuid))
  "Return a file system object that mounts the Btrfs subvolume NAME at
MOUNT-POINT, creating the mount point if needed.  The subvolume is looked up
on the Btrfs file system identified by DEVICE-UUID, which defaults to the one
hosted on the SAN storage."
  (let* ((subvolume-option (cons "subvol" name))
         (subvolume-options (cons subvolume-option %common-btrfs-options)))
    (file-system
      (device (uuid device-uuid))
      (mount-point mount-point)
      (create-mount-point? #t)
      (type "btrfs")
      (options (alist->file-system-options subvolume-options)))))
(define btrfs-balance-job
  ;; mcron job named "btrfs-balance".  It re-allocates chunks which are
  ;; using less than 5% of their chunk space, to regain Btrfs
  ;; 'unallocated' space.  The usage threshold is kept low (5%) to
  ;; minimize wear on the SSD.  Runs at 5 AM every 3 days.
  #~(job '(next-hour-from (next-day (range 1 31 3)) '(5))
         (lambda ()
           (let ((btrfs #$(file-append btrfs-progs "/bin/btrfs")))
             (system* btrfs "balance" "start" "-dusage=5" "/")))
         "btrfs-balance"))
(define btrfs-send-job
  ;; mcron job that runs every 10 minutes.  It takes a read-only snapshot
  ;; of the substitutes subvolume and prunes older snapshots.  The goal is
  ;; to send the snapshot to hydra-guix-129, but the sending step is still
  ;; a stub (see 'send-snapshot' below).
  #~(job '(next-minute (range 0 60 10))
         #$(program-file
            "btrfs-send-publish"
            (with-imported-modules (source-module-closure
                                    '((guix build utils)))
              #~(begin
                  (use-modules (guix build utils)
                               (ice-9 ftw)
                               (ice-9 exceptions)
                               (ice-9 match)
                               (rnrs io simple)
                               (srfi srfi-1)
                               (srfi srfi-19)
                               (srfi srfi-26)
                               (srfi srfi-71))

                  (define %lock-file "/var/lock/mcron-btrfs-send-job.lock")
                  (define btrfs #$(file-append btrfs-progs "/bin/btrfs"))
                  (define %subvolume "/mnt/btrfs-pool-san/@publish")
                  (define %snapshots-dir "/mnt/btrfs-pool-san/snapshots/")

                  ;; TODO: Add non-overlapping job support to mcron
                  ;; itself, instead of this ad-hoc advisory lock
                  ;; based solution.
                  (define (call-with-advisory-lock file thunk)
                    "Call THUNK while holding an exclusive advisory lock on
FILE.  The lock is requested in non-blocking mode, so 'flock' raises a
'system-error' instead of waiting when the lock is already held."
                    (call-with-port (open-file file "r")
                      (lambda (lock)
                        (flock lock (logior LOCK_EX LOCK_NB))
                        (thunk))))

                  (define (snapshot-subvolume subvolume dest)
                    "Create a new read-only snapshot of SUBVOLUME in DEST.
The snapshot is named after the base name of SUBVOLUME, followed by a dot and
the current date formatted with the SRFI-19 \"~5\" directive."
                    (let* ((subvolume-name (basename subvolume))
                           (timestamp (date->string
                                       (time-utc->date (current-time)) "~5"))
                           (snapshot-name (string-append subvolume-name "."
                                                         timestamp)))
                      (mkdir-p dest)
                      (with-directory-excursion dest
                        (invoke btrfs "subvolume" "snapshot" "-r"
                                subvolume snapshot-name))))

                  (define (prune-snapshots dir prefix preserve-count)
                    "Delete all but the PRESERVE-COUNT newest snapshots found
in DIR whose name starts with PREFIX."
                    (with-directory-excursion dir
                      ;; This relies on 'scandir' returning sorted names,
                      ;; so that the newest snapshots come last.
                      (let* ((snapshots (scandir "." (cut string-prefix?
                                                          prefix <>)))
                             ;; 'drop-right' errors out on lists shorter
                             ;; than PRESERVE-COUNT, hence the check.
                             (old-snapshots (if (> (length snapshots)
                                                   preserve-count)
                                                (drop-right snapshots
                                                            preserve-count)
                                                '())))
                        (for-each (cut invoke btrfs "subvolume" "delete" <>)
                                  old-snapshots))))

                  (define* (get-latest-snapshots dir prefix)
                    "Return two values: the latest snapshot found in DIR
whose name starts with PREFIX, and its parent (the snapshot right before it),
or #f when there is no parent."
                    (with-directory-excursion dir
                      (let ((snapshots (scandir "." (cut string-prefix?
                                                         prefix <>))))
                        (match snapshots
                          ((head ... parent latest)
                           (values latest parent))
                          ((latest)
                           (values latest #f))))))

                  (define* (send-snapshot file #:key parent
                                          pipe)
                    "Send the snapshot FILE, a file name, to the output PIPE.
An incremental send is meant to be attempted if a PARENT snapshot is provided.
This is currently a stub that only prints a message."
                    ;; Send the snapshot to the remote server (hydra).
                    (format #t "TODO: send snapshot to hydra-guix-129~%"))

                  ;; Create the lock file if it doesn't exist.
                  (unless (file-exists? %lock-file)
                    (mkdir-p (dirname %lock-file))
                    (call-with-output-file %lock-file (const #t)))

                  ;; Only the "lock already held" failure of 'flock' is
                  ;; handled here (errno 11 is EAGAIN/EWOULDBLOCK on
                  ;; Linux).  The clause test returns #f for any other
                  ;; exception so that 'guard' re-raises it, rather than
                  ;; hiding it behind a 'match' failure in the handler.
                  (guard (ex ((and (eq? 'system-error (exception-kind ex))
                                   (match (exception-args ex)
                                     (("flock" _ _ (11)) #t)
                                     (_ #f)))
                              (format #t "btrfs-send job already running~%")))
                    (call-with-advisory-lock
                     %lock-file
                     (lambda _
                       (define subvolume-name (basename %subvolume))
                       (snapshot-subvolume %subvolume %snapshots-dir)
                       ;; Keep the two newest snapshots: the latest one
                       ;; and its parent.
                       (prune-snapshots %snapshots-dir subvolume-name 2)
                       (let ((snapshot parent (get-latest-snapshots
                                               %snapshots-dir subvolume-name)))
                         (send-snapshot snapshot #:parent parent
                                        #:pipe #f))))))))))
(define (anonip-service file)
  "Return an Anonip service whose input is /var/run/anonip/FILE and whose
output is /var/log/anonip/FILE."
  (let* ((source (format #false "/var/run/anonip/~a" file))
         (sink (format #false "/var/log/anonip/~a" file))
         (config (anonip-configuration
                  (input source)
                  (output sink))))
    (service anonip-service-type config)))
(define %anonip-log-files
  ;; Base names of the log files handled by Anonip; one Anonip service is
  ;; created for each of them.
  (list "http.access.log"
        "https.access.log"
        "disarchive.access.log"
        "dump-guix-gnu-org.https.access.log"
        "qualif.access.log"
        "bootstrappable.access.log"
        "bootstrappable.https.access.log"
        "workflows-guix-info.access.log"
        "workflows-guix-info.https.access.log"
        "issues-guix-gnu-org.https.access.log"))
(define (log-file->anonip-service-name file)
  "Return, as a symbol, the name of the Anonip service handling FILE, the
base name of a log file.  The symbol is 'anonip-/var/log/anonip/' followed by
FILE."
  (string->symbol (string-append "anonip-/var/log/anonip/" file)))
;;;
;;; mumi.
;;;
(define mumi-latest
;; Variant of the 'mumi' package built from a pinned Git commit.  It is
;; the package used by 'mumi-service-type' in the 'operating-system'
;; declaration below.  When updating COMMIT, bump REVISION and update
;; the sha256 hash accordingly.
(let ((commit "025fc600f1cb4c73042bf920aee3e07d5fb9c53a")
(revision "6"))
(package
(inherit mumi)
(name "mumi")
(version (git-version "0.0.5" revision commit))
(source (origin
(method git-fetch)
(uri (git-reference
(url "https://git.savannah.gnu.org/git/guix/mumi.git/")
(commit commit)))
;; NAME and VERSION refer to the package fields defined above.
(file-name (git-file-name name version))
(sha256
(base32
"1kzq4h3qwjqxr4471i8g2x9ik0ihcca8nm1ryqszljiz6ml91f6w")))))))
;; The operating system declaration for berlin.  It combines the
;; definitions above: the Btrfs mounts, the services specific to this
;; machine, the Anonip and web site services, and the common front-end
;; services returned by 'frontend-services'.
(operating-system
(host-name "berlin.guix.gnu.org")
(timezone "Europe/Berlin")
(locale "en_US.utf8")
(name-service-switch %mdns-host-lookup-nss)
;; Allow access through the serial console at 141.80.167.201; the
;; management interface can only be accessed through selected
;; servers within the MDC campus network.
(kernel-arguments '("console=tty0"
"console=ttyS0,115200"))
;; The Dell server needs these kernel modules for the
;; RAID controller.
(initrd-modules (append (list "megaraid_sas" "scsi_transport_sas"
"mpt3sas" "libsas"
;; Suggested by 'guix system init' for
;; the SAN storage.
"qla2xxx")
%base-initrd-modules))
;; Show the GRUB menu on the serial interface.
(bootloader (bootloader-configuration
(bootloader grub-efi-bootloader)
(targets '("/boot/efi"))
(terminal-inputs '(serial))
(terminal-outputs '(serial))))
;; /boot and /boot/efi are regular partitions; everything else is a
;; subvolume of the Btrfs file system identified by %BTRFS-SAN-UUID.
(file-systems (cons*
(file-system
(mount-point "/boot")
(device (uuid "67498a2f-3e32-4e8c-96a5-8a4844ea229c")) ;/dev/sdg3
(type "ext4"))
(file-system
(mount-point "/boot/efi")
(device (uuid "43AE-6859" 'fat)) ;/dev/sdg2
(type "vfat"))
%btrfs-pool-san ;for convenience
(btrfs-subvolume-mount "@root" "/")
(btrfs-subvolume-mount "@cache" "/var/cache")
(btrfs-subvolume-mount "@home" "/home")
(btrfs-subvolume-mount "@publish" "/var/cache/guix/publish")
%base-file-systems))
;; Local admin account for MDC maintenance.
(users (cons (user-account
(name "bi-admin")
(comment "Local admin")
(group "users")
(supplementary-groups '("wheel"))
(home-directory "/home/bi-admin"))
%base-user-accounts))
(packages (cons* certbot emacs wget iptables
jnettop openssh rsync screen strace
;; This is needed to set GIT_SSL_CAINFO allowing
;; Cuirass to fetch sources via HTTPS.
nss-certs
;; This is for git-receive-pack et al
git-minimal
;; This is for the mumi mailer
msmtp
;; This is for bypassing the firewall...
torsocks
;; This is for a redundant connection to the SAN
multipath-tools
%base-packages))
(services (cons*
;; Run %COPY-KERNEL-AND-INITRD upon activation.
(simple-service 'copy-kernel+initrd-to-/boot
activation-service-type
%copy-kernel-and-initrd)
;; Install %MULTIPATH.CONF as "multipath.conf" under /etc.
(simple-service 'etc-multipath.conf
etc-service-type
(list `("multipath.conf" ,%multipath.conf)))
(service static-networking-service-type
(list (static-networking
(addresses
(list
;; Connection to the DMZ for public access
;; This is a 10G port.
(network-address
(device "eno2")
(value "141.80.181.40/24"))
;; Connection to build nodes
(network-address
(device "eno1")
(value "141.80.167.131/26"))
;; Connection to maintenance network
(network-address
(device "eno4")
(value "141.80.167.253/26"))))
(routes
(list (network-route
(destination "default")
(gateway "141.80.181.1")))))))
;; Allow login over serial console.
(service agetty-service-type
(agetty-configuration
(tty "ttyS0")
(baud-rate "115200")))
;; Discover substitute servers.
(service avahi-service-type
(avahi-configuration (debug? #t)))
;; Don't let greedy processes bring the machine to its knees.
(service earlyoom-service-type)
;; Periodically populate the Disarchive database. Store it
;; under /gnu, which is the big drive; talk directly to the
;; local Cuirass instance.
(service disarchive-service-type
(disarchive-configuration
(directory "/gnu/disarchive")
(cuirass-url "http://localhost:8081")))
;; Rsync service for backup purposes. Listen only on the VPN
;; address, which is the address given to the WireGuard
;; service below.
(service rsync-service-type
(rsync-configuration
(address "10.0.0.1")
(modules %rsync-modules-for-backup)))
;; DNS
(service knot-service-type
(knot-configuration
(zones (list (knot-zone-configuration
(domain "guix.gnu.org")
(master '("bayfront-master"))
(acl '("notify-allow")))))
(acls (list (knot-acl-configuration
(id "notify-allow")
(address (list bayfront-ip4))
(action '(notify)))))
(remotes (list (knot-remote-configuration
(id "bayfront-master")
(address (list bayfront-ip4)))))))
;; Monitoring
(service prometheus-node-exporter-service-type)
(service zabbix-agent-service-type)
(service zabbix-server-service-type
(zabbix-server-configuration
(include-files '("/root/zabbix-pass"))
(extra-options "AlertScriptsPath=/root/zabbix-alert-scripts\n")))
(service zabbix-front-end-service-type
(zabbix-front-end-configuration
(nginx (list %zabbix-nginx-server
%zabbix-nginx-local-server))
(db-secret-file "/root/zabbix-front-end-secrets")))
;; For the Zabbix database. It was created by manually
;; following the instructions here:
;; https://www.zabbix.com/documentation/3.0/manual/appendix/install/db_scripts
(service postgresql-service-type
(postgresql-configuration
(postgresql postgresql-13)
(config-file
(postgresql-config-file
(extra-config
'(("max_connections" 300)))))))
(service postgresql-role-service-type)
(service ntp-service-type)
;; Make SSH and ci.guix available over Tor.
(tor-hidden-service "http"
'((22 "127.0.0.1:22")
(80 "127.0.0.1:80")
(443 "127.0.0.1:443")))
;; Onion service for the web site.
(tor-hidden-service "guix.gnu.org"
'((80 "127.0.0.1:80")
(443 "127.0.0.1:443")))
(service tor-service-type)
(service nginx-service-type
(nginx-configuration
(inherit %nginx-configuration)
;; Have the nginx shepherd service depend on the
;; Anonip services so that the writing end of the
;; logs, handled by Anonip, is ready when nginx starts
;; running.
(shepherd-requirement
(map log-file->anonip-service-name
%anonip-log-files))))
(service mumi-service-type
(mumi-configuration
(mumi mumi-latest)
;; The mailer is broken again. No pretty bug report
;; like <https://issues.guix.gnu.org/49295>, but it's
;; broken.
(mailer? #f)
(sender "issues.guix.gnu.org@elephly.net")
(smtp "sendmail:///var/mumi/mumi-mailer")))
;; For the Mumi mailer queue
(service redis-service-type)
;; Guix CRAN updater
(service guix-cran-service-type)
;; Stop Cuirass when disk space is low.
(service disk-space-watchdog-service-type
(list (* 500 GiB) (* 50 GiB)))
;; VPN connection to the remote build nodes.
(service wireguard-service-type
(wireguard-configuration
(addresses (list "10.0.0.1/32"))
(peers
(list
(wireguard-peer
(name "overdrive1")
(public-key "m2qys8ATAKUTT8YNUM3OmnJnw1lYm5GHpUA42/l1Qz8=")
(allowed-ips '("10.0.0.3/32")))
(wireguard-peer
(name "dover")
(public-key "g7Hx3iojVXZTLNqNyr2MlY7UzL60Pc91jM0TtthM7gg=")
(allowed-ips '("10.0.0.4/32")))
(wireguard-peer
(name "guix-x15")
(public-key "pM7dAWXJ35isIDJA3OpbR2YG1Pk3MI9VTlP5ELAeQkM=")
(allowed-ips '("10.0.0.5/32")))
(wireguard-peer
(name "guix-x15b")
(public-key "cwRqmMc8IPAHxFCGIt1WcnJnoWJcefcTXte2vMNi6Wo=")
(allowed-ips '("10.0.0.6/32")))
(wireguard-peer
(name "guixp9")
(public-key "4jflGVC+6ee1jsXR/6GgBKjxzw4T4WIwYiMhj/lYQTE=")
(allowed-ips '("10.0.0.7/32")))
(wireguard-peer
(name "pankow")
(public-key "BVfQ20Hh+3WSr5esDaXcoM6T7F809iPuGNSpeD1Qd3I=")
(allowed-ips '("10.0.0.8/32")))
(wireguard-peer
(name "kreuzberg")
(public-key "f9WGJTXp8bozJb0KxePjkOclF5pJUy1AomHWJHy80y4=")
(allowed-ips '("10.0.0.9/32")))
(wireguard-peer
(name "grunewald")
(public-key "icqpTshydmh1TW43YDMRS+dpb8ND6iVy6vLlfwtlGSk=")
(allowed-ips '("10.0.0.10/32")))
(wireguard-peer
(name "bayfront")
(public-key "/oydkAV1bep1JRQ/JRO+tEuybwtwczYlidSP97CnDwI=")
(allowed-ips '("10.0.0.11/32")))
(wireguard-peer
(name "jade")
(public-key "FEFR3NX+DfkrsTHpgECvzW/M/0D8V4bVtCEEzQ5naww=")
(allowed-ips '("10.0.0.12/32")))
(wireguard-peer
(name "sjd-p9")
(public-key "JESZIT1RikNQ+xM1a18pXGvZQoZ3vmVkNA+w/qx1Bzs=")
(allowed-ips '("10.0.0.13/32")))
(wireguard-peer
(name "lieserl")
(public-key "CeRd0ZKjlyMDSMbSes1UQ43lADxWX2X8dS/VFo9qej8=")
(allowed-ips '("10.0.0.14/32")))))))
;; The remaining services: one Anonip service per log file, the
;; web site services, and the common front-end services.
(append
(map anonip-service %anonip-log-files)
(website-services)
(modify-services
(frontend-services %sysadmins
#:authorized-keys %build-node-keys
;; Get substitutes from our local 'guix
;; publish' cache.
#:substitute-urls '("http://ci.guix.gnu.org")
;; Make sure we get enough build users.
#:build-accounts-to-max-jobs-ratio 5
#:gc-threshold #f
#:systems '("x86_64-linux" "i686-linux"
"aarch64-linux"
"powerpc64le-linux")
#:motd %motd
#:publish-workers 8
#:max-jobs 20)
;; Add the Btrfs balance and snapshot jobs in front of the mcron
;; jobs already present in the front-end services' configuration.
(mcron-service-type
config => (mcron-configuration
(inherit config)
(jobs (cons* btrfs-balance-job
btrfs-send-job
(mcron-configuration-jobs
config))))))))))