diff --git a/secrets/all/monitoring.htpasswd b/secrets/all/monitoring.htpasswd new file mode 100644 index 0000000..dd52339 --- /dev/null +++ b/secrets/all/monitoring.htpasswd @@ -0,0 +1,40 @@ +{ + "data": "ENC[AES256_GCM,data:/qJNDvVv7ap7CiCHbxtPEs35RepVZ/jSWeqaKtTMu4ePvZbpX+BDMDPztg==,iv:8p1gnTObH883P/h7QLtCYVbCWvhi6nKhkWNlvazy1CM=,tag:ha+5Mniu0On1TCAGTSCf3A==,type:str]", + "sops": { + "kms": null, + "gcp_kms": null, + "azure_kv": null, + "hc_vault": null, + "age": [ + { + "recipient": "age14ysl953378r2vvy7ft3gwce9xp83pr6wypf5lgx2yjwx2lxra5qs6j8eqe", + "enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBJM21QS0VGNXNZV0NJSVU0\nYUxZaEpMQ0hQNkZPNFNzUjFOM3V2c1ZkaUY4CjdqY2tGaXFOL2liTlBxSitzSkx5\nQmMrTW5EYWZ3RWY0dEZ6NldVSkhBN28KLS0tIDEzamtzdEFQam5ZU3pJY1p3Lzh5\nY2wwdmpIWktWMEwyb09mVDh4Ujd2TUEKiIxFNC5fk38+GvBPIP0RygwA4uw49Uht\nfabstjYfQACI5Auc2IUL7nWPqVTT61obbogbGPfgRkOU/lnU7lhvtw==\n-----END AGE ENCRYPTED FILE-----\n" + }, + { + "recipient": "age1w3wqxt5t00hjv43dcxlr5rjec5mvuzz9ajc8k04azq0gfx0ncgysu6mdmm", + "enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBIQzUraTdVYkg3THQzSXlS\neEtqSUNBMEIvSDZQNWVhNXdBaklPenk4aFhRCkU2UDE5UWZSdEFGV0tPYmxteUlh\nd2hZNXdXYSthK3dmalNrdi9MdE9ybm8KLS0tIFRTc3g2RFR3VWtyaDJBb2UvMWU3\nVkw5QW03djdyd2tCUndNNlpWcW1iUDgKF01T2t0pqk6O7gQh9xYn+KqtLXZRr3v3\nIjhMh46UOrm5OYSDqkKKFEU6Tz6PmHii9ClcC9ejClg+5lBaavlAqg==\n-----END AGE ENCRYPTED FILE-----\n" + } + ], + "lastmodified": "2022-12-30T14:49:56Z", + "mac": "ENC[AES256_GCM,data:vCOGAJDFg1HH2+JwWqKWaSUAq0IS1voaTphRQHWPjXzgJ2eeDbXQQPEkp9Rk5Q5/zfPMdGgw8oR1OjWAv+KcaE/+t8PKPOd1B6a50VkFtgHYuHbunRai3RLMfexZMPfC7QvO5TLLNM4/OzrDsdVozPecj4LXpMrrC9rpacBCnWc=,iv:UtQvqNXzMC/kb0l0sG16PCZSIfacO7jEa88x5Y7eDS4=,tag:yXa2ohlBnh+kqJzstUYIWQ==,type:str]", + "pgp": [ + { + "created_at": "2022-12-30T14:49:56Z", + "enc": "-----BEGIN PGP 
MESSAGE-----\n\nhF4DOnsoj685gdcSAQdA7WmJAs7/hgq93M3SphsDJv9m/oOLhf3ftXyngoj4MVkw\n4xKE1ZCN4LErd+/enCdGLeCkzxIFL7lPc1Q1BaCJZwAZjQIhwUFtK2ORWeXK+y65\n0l4Bon20voZRVNjqiU5XuzWTNUQOStiq30qf5MdCYTfyXQI7fpwYsIWoL3T3en41\nNsIHoDYmXgfoIGDHWmkFEW7YRy04/9wUvWnX67eQIEB9WPvX0s1nUnq0t/x7BVem\n=4Zp6\n-----END PGP MESSAGE-----\n", + "fp": "0x6D617FD0A85BAADA" + }, + { + "created_at": "2022-12-30T14:49:56Z", + "enc": "-----BEGIN PGP MESSAGE-----\n\nhF4D6iFd6webPCUSAQdAk5UEGAJw64qu9tVkeaGGDkX4+9awdb3APedVb5EZPXIw\n46V4iFaF6U6mry2BqbPOzBZxE0AzQtmVR/H3SfPlrqJYak98lhYrRMfMjTs+7DAy\n0l4B4WIodnYaPhkhXuoq2gglCCyQ8FN3yUoDfYcIRgmmBtuwp+t3rPe4pJpDkf1t\nNyHhuKWRKtLaYc9YJMq6smpCICFFYV8RohVUmK6ucWGHJVC5wTX5+Cz/UZv+qpwB\n=tQa7\n-----END PGP MESSAGE-----\n", + "fp": "0xE0262A773B824745" + }, + { + "created_at": "2022-12-30T14:49:56Z", + "enc": "-----BEGIN PGP MESSAGE-----\n\nhQIMAxFMPvz46t7rAQ/9FjFVuDmr8uuPAGozef2avvej0G8XfJsKHtuVCCUK9IuC\n+dyGQ+stw5bj5BgwepACk7eeFkt+Dr0Xqmit5Q+HrnkEdKt5l5SJXiBkqsU7PwYP\neKL7LmqTQ6KdWhzu1zcL7jdQeKGJagvgaoT1t5Os8sd8Jl4SvzLWNQdVb4RkbU6p\nnBLAHTlTlH5f8MA/nkGO/ku3zs1Hfjg4laI2rWvbqSv7YouJxU2bpZ5KdGlyNMD7\nnjY0joe0EIm/CkY8s5FcOyzu299Q23fYsMKtHDs7fP7MgD7OriKh9Q3s443B++Fg\nK70Ngz2aZNs5liWG+ce+zrSL6QE2+zVtYKb116kMfFFPlWUxZTa0UcHRWJ5mqQV7\nPgum2yMFE6XYpsqlv2k1fnsSAagU8VceddYw+BZwfFJtIXhWIubvXpa3FV3xHdtD\nqJyuA+hE8rwUM6OPiwLBQmaqkywUhNjS7kEhdxTfEvmHWWQKmqqEC/27rooK2bLX\nPWNmkm3XYZeoNXB4aJY16I+lv9cuWr4zFwNF1GFx2zg3hkLU6v8zqN4NuFxa0l50\nTfTZTGYak+whMtdIbIehbjKnZ7nuKrD3agsvGsH4vd02oCpucBjz5xEhDDuoSWFb\n3QqWiPqyfTGWAuo5Xb1S4JyXkLnt8KfXtlgfYlMfW3YmbB9LcPI3e1Hg6c7YwFzS\nXgFBDmD+OY+CFYMsewrke5324Jco+CDEm6G/VKdJg4YS8RDA3J3/CT0zel8Th63J\nOGfKr8+/ilgNqFXv5uxyW6JuIrMcsBRtmosq56lCDlxp0SaLmeVpX71r3opcUWo=\n=a73T\n-----END PGP MESSAGE-----\n", + "fp": "B1480CFF9BBE8E2648A26A640B2E7C171E3AD6D7" + } + ], + "unencrypted_suffix": "_unencrypted", + "version": "3.7.3" + } +} \ No newline at end of file diff --git a/secrets/all/secrets.yaml b/secrets/all/secrets.yaml index d27bb15..93c4b28 
100644 --- a/secrets/all/secrets.yaml +++ b/secrets/all/secrets.yaml @@ -4,6 +4,8 @@ smtp: user: ENC[AES256_GCM,data:cyxh+oTM7MdF31+umMplmA==,iv:maKuY4iXQCg10O8TTenBUl6tMJKt54AycfKQW6sheVw=,tag:4DAUiBBgO5H9O7DwLYZJtQ==,type:str] pass: ENC[AES256_GCM,data:CkpgvLnQGBzShLkDaHtOHgZ9zwlsEJ+9KFBmHLgBol9Pcadww040clNM0WJgz8NfDIoYF8M=,iv:OIxuwC/XQOBBKdkRb6/5X8pNPLJNKAEI/WCYKhFScPw=,tag:CoAFk7n5AeSue18OXZvjKA==,type:str] host: ENC[AES256_GCM,data:psXf+lg9433jS1GxH6HQQQ==,iv:wyq/VgWLNs4RbzxNHf4Ave8ZDhQeNPpYXJIdjLLMlr4=,tag:pQ4Wmf+PzSBsHO3eigx91Q==,type:str] +alertmanager: + env: ENC[AES256_GCM,data:eYytFrXgsm4NCyS5F3oVNKsXwf5RL+zEFomMszP4fzb0HkLBk659uWZmqi7kIe8ptyUqqr6oAIV31alFo2TwpWuImvh4gbmaD0sfzesBmqhXyeA3cgi3EMGfMxsdLgUpHKGuJhgrRGLY9jDRI6VIwuLng1YmRw==,iv:YQYwC+Xef9I5IdUsBlTg3g7yRxWJe5Pi1LCur9g9/8w=,tag:U/A2lYu79urWjBFraQh16w==,type:str] sops: kms: [] gcp_kms: [] @@ -28,8 +30,8 @@ sops: UXludFJ5UDEvN2pHMEQweWIvN0NYaFEKB7lVKrsB3eX77iKWwFAXp7LVl+fPcGOl 8CkIBa/rkSWMe0xIetew60wIwx2ZVAv5TTDmKIZyQSTayOpbG3zcdg== -----END AGE ENCRYPTED FILE----- - lastmodified: "2022-12-24T21:21:02Z" - mac: ENC[AES256_GCM,data:GbEO+lLUeswXR6Dc9kGlR9YJhXWh7Io6Z/qciolwkrzIwZAA6l6kaWCdw6OJ6gSNBGhWNye+VDdDV2Ow1Ix1hjnZdl3MUlY/LEEmzP1uUjPvNnEjZjxC8ebzNg3rgVqT6W6q9S7D9nPmAzJMgMlzypR/b1ZX5QuzodhotsleVNM=,iv:TJX940f70Vka+hsfDSvm7Jj0klFylRwB+9XbA9pFkRo=,tag:uaJi+5UI/bppLw+hOgyiDg==,type:str] + lastmodified: "2022-12-30T22:26:42Z" + mac: ENC[AES256_GCM,data:KcJVntK3kAMH64ZHYoJmDCA7BroYJZhmMvtgbnjI44h5cwT9TyDOso2HO/x2z5XzTfPQfbW3tfhT1UmbaBHxBTkV7gJGU9fJ1xOrCRDoHuI9RTwGnCHKwQPLIWj56NMiC2CMf00pFpUlAPyDP/VHwRsc/laUPne2Da1MKZp/NOk=,iv:iHONd6xTjfVZNFELQVafAiljWYavu1DOZrS+d7pd8Zw=,tag:1Zh32h2OEtqKPoXy+8pypw==,type:str] pgp: - created_at: "2022-12-23T19:08:38Z" enc: | diff --git a/services/monitoring/default.nix b/services/monitoring/default.nix new file mode 100644 index 0000000..d29233f --- /dev/null +++ b/services/monitoring/default.nix @@ -0,0 +1,274 @@ +{ + config, + lib, + outputs, + ... 
+}: let + inherit (lib) concatStringsSep mapAttrsToList hasAttrByPath getAttrFromPath filterAttrs substring singleton optionalString optional; + inherit (lib) escapeRegex; + inherit (config.networking) fqdn hostName; + + # Basically a manual list of (legacy) hosts not yet migrated to NixOS + # but on which we'd like to have included in the monitoring. + externalTargets = let + host = hostName: { + _module.args.baseDomain = "chaos.jetzt"; + config = { + networking = { + inherit hostName; + domain = "net.chaos.jetzt"; + }; + services.prometheus = { + enable = true; + port = 9090; + alertmanager = { + enable = true; + port = 9093; + }; + }; + }; + }; + in { + hopper = host "hopper"; + lovelace = host "lovelace"; + }; + + monDomain = "mon.${config.networking.domain}"; + + isMe = host: host.config.networking.fqdn == fqdn; + others = filterAttrs (_: !isMe) outputs.nixosConfigurations; + isDev = host: (substring 0 3 host._module.args.baseDomain) == "dev"; + allHosts = outputs.nixosConfigurations // externalTargets; + /* + Right now we only have one non-dev host in our NixOS setup (the ansible hosts don't monitor the NixOS hosts). + That's why we currently add all hosts to our little monitoring "cluster". As soon as we have two or more production hosts, + the dev host can be taken out of the equation + */ + # allTargets = filterAttrs (_: c: (isMe c) || !(isDev c)) allHosts; + allTargets = allHosts; + + # monFqdn = config: "${config.networking.hostName}.${monDomain}"; + hasEnabled = servicePath: config: let + path = servicePath ++ ["enable"]; + in + (hasAttrByPath path config) && (getAttrFromPath path config); + + monTarget = servicePath: config: let + port = toString (getAttrFromPath (servicePath ++ ["port"]) config); + in "${config.networking.hostName}.${monDomain}:${port}"; + + dropMetrics = {wildcard ? 
true}: extraRegexen: let + dropRegexen = [ "go_" "promhttp_metric_handler_requests_" ] ++ extraRegexen; + in + singleton { + # NOTE(review): removed stray 'inherit (regex);' — a no-op inherit from an undefined name + regex = "(${concatStringsSep "|" dropRegexen})${optionalString wildcard ".*"}"; + source_labels = ["__name__"]; + action = "drop"; + }; + + relabelInstance = { + source_labels = ["__address__"]; + regex = "(\\w+)\\.${escapeRegex monDomain}\\:\\d*"; + target_label = "instance"; + }; + + prometheusPath = ["services" "prometheus"]; + alertmanagerPath = ["services" "prometheus" "alertmanager"]; + targetAllHosts = servicePath: + mapAttrsToList + (_: config: monTarget servicePath config.config) + (filterAttrs (_: c: (hasEnabled servicePath c.config)) (outputs.nixosConfigurations // externalTargets)); +in { + /* + Steps to edit the monitoring.htpasswd (aka. adding yourself / updating your password): + + 1. `sops -d secrets/all/monitoring.htpasswd > /tmp/monitoring.htpasswd` + 2. Use `htpasswd` (from the `apacheHttpd` package) to your heart's content + 3. `sops -e /tmp/monitoring.htpasswd > secrets/all/monitoring.htpasswd` + 4. 
`rm /tmp/monitoring.htpasswd` + */ + sops.secrets = { + "monitoring.htpasswd" = { + format = "binary"; + owner = config.services.nginx.user; + sopsFile = ../../secrets/all/monitoring.htpasswd; + }; + "alertmanager/env" = { + format = "yaml"; + sopsFile = ../../secrets/all/secrets.yaml; + }; + }; + + services.nginx.virtualHosts."${fqdn}" = let + monitoring_htpasswd = config.sops.secrets."monitoring.htpasswd".path; + in { + enableACME = true; + forceSSL = true; + locations."/prometheus/" = { + basicAuthFile = monitoring_htpasswd; + proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.port}/"; + }; + locations."/alertmanager/" = { + basicAuthFile = monitoring_htpasswd; + proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.alertmanager.port}/"; + }; + }; + + services.prometheus.exporters.node = { + enable = true; + enabledCollectors = ["systemd"]; + # They either don't apply to us or will provide us with metrics not usefull to us + disabledCollectors = [ + "arp" + "bcache" + "bonding" + "btrfs" + "cpufreq" + "edac" + "entropy" + "infiniband" + "rapl" + "selinux" + "timex" + ]; + }; + + services.prometheus = { + enable = true; + webExternalUrl = "https://${fqdn}/prometheus/"; + extraFlags = [ + "--web.route-prefix=\"/\"" + "--web.enable-admin-api" + ]; + ruleFiles = [ + ./rules.yaml + ]; + retentionTime = "30d"; + + alertmanagers = [{ + static_configs = [{ + targets = [(monTarget alertmanagerPath config)]; + }]; + }]; + + scrapeConfigs = [ + { + job_name = "node"; + static_configs = [{ + targets = [ + # Only scraping to own node-exporter + (monTarget ["services" "prometheus" "exporters" "node"] config) + ]; + }]; + relabel_configs = [relabelInstance]; + metric_relabel_configs = dropMetrics {} []; + } + { + job_name = "alertmanager"; + static_configs = [{ + targets = targetAllHosts alertmanagerPath; + }]; + relabel_configs = [relabelInstance]; + metric_relabel_configs = dropMetrics {} [ + 
"alertmanager_http_(response_size_bytes|request_duration_seconds)_" + "alertmanager_notification_latency_seconds_" + "alertmanager_(nflog|cluster)_" + "alertmanager_silences_(query_duration_seconds|gc)_" + ]; + } + { + job_name = "prometheus"; + static_configs = [{ + targets = targetAllHosts prometheusPath; + }]; + relabel_configs = [relabelInstance]; + metric_relabel_configs = dropMetrics {} [ + "prometheus_(sd|tsdb|target)_" + "prometheus_(engine_query|rule_evaluation)_duration_" + "prometheus_http_(response_size_bytes|request_duration_seconds)_" + "net_conntrack_dialer_conn_" + ]; + } + ]; + }; + + services.prometheus.alertmanager = { + enable = true; + extraFlags = ["--web.route-prefix=\"/\"" "--cluster.listen-address="]; + webExternalUrl = "https://${fqdn}/alertmanager/"; + environmentFile = config.sops.secrets."alertmanager/env".path; + + configuration = { + global = { + smtp_from = "Chaos-Jetzt Monitoring (${hostName}) "; # NOTE(review): from-address looks truncated (an <addr> possibly stripped) — verify + smtp_smarthost = "\${SMTP_HOST}:587"; + smtp_auth_username = "\${SMTP_USER}"; + smtp_auth_password = "\${SMTP_PASS}"; + smtp_hello = config.networking.fqdn; + }; + + receivers = [{ + name = "mail"; + email_configs = [ + { to = "jetzt+mon@e1mo.de"; + send_resolved = true; } + { to = "info@adb.sh"; + send_resolved = true; } + ]; + }]; + + route = { + receiver = "mail"; + repeat_interval = "16h"; + group_wait = "1m"; + group_by = ["alertname" "instance"]; + routes = [ + { + match.serverity = "critical"; # key spelled to match the 'serverity' label set in rules.yaml + receiver = "mail"; + repeat_interval = "6h"; + } + { + match.serverity = "error"; + receiver = "mail"; + repeat_interval = "16h"; + } + { + match.serverity = "warn"; + receiver = "mail"; + repeat_interval = "28h"; + } + { + match.serverity = "info"; + receiver = "mail"; + repeat_interval = "56h"; + } + ]; + }; + + inhibit_rules = [ + { + target_matchers = ["alertname = ReducedAvailableMemory"]; + source_matchers = ["alertname =~ (Very)?LowAvailableMemory"]; + equal = ["instance"]; + } + { + target_matchers = ["alertname = LowAvailableMemory"]; + 
source_matchers = ["alertname = VeryLowAvailableMemory"]; + equal = ["instance"]; + } + { + target_matchers = ["alertname = ElevatedLoad"]; + source_matchers = ["alertname =~ (Very)?HighLoad"]; + equal = ["instance"]; + } + { + target_matchers = ["alertname = HighLoad"]; + source_matchers = ["alertname = VeryHighLoad"]; + equal = ["instance"]; + } + ]; + }; + }; +} diff --git a/services/monitoring/rules.yaml b/services/monitoring/rules.yaml new file mode 100644 index 0000000..7827c86 --- /dev/null +++ b/services/monitoring/rules.yaml @@ -0,0 +1,139 @@ +--- +groups: + - name: General system + rules: + - record: node_memory_MemAvailable_percentage + expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 + - record: node_cpu_count + expr: count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) + + - alert: InstanceDown + expr: up == 0 + for: 5m + labels: + serverity: critical + annotations: + summary: "{{ $labels.instance }} down" + + - alert: SystemdServiceFailed + expr: node_systemd_unit_state{state="failed"} > 0 + for: 5m + labels: + serverity: error + annotations: + summary: "{{ $labels.name }} on {{ $labels.instance }} crashed" + + - alert: OomKill + expr: increase(node_vmstat_oom_kill[5m]) > 0 # Scrape interval of 1 minute + for: 0m + labels: + serverity: error + annotations: + summary: "OOM kill on {{ $labels.instance }}" + + - alert: VeryLowAvailableMemory + # Less than 3% available or less than 100KB available memory + expr: ((node_memory_MemAvailable_percentage < 3) or (node_memory_MemFree_bytes < (1024 * 100))) + for: 2m + labels: + serverity: critical + annotations: + summary: "Very low free memory on {{ $labels.instance }}" + + - alert: LowAvailableMemory + expr: node_memory_MemAvailable_percentage < 10 + for: 5m + labels: + serverity: warn + annotations: + summary: "Low free memory on {{ $labels.instance }}" + + - alert: ReducedAvailableMemory + expr: node_memory_MemAvailable_percentage < 20 + # Detect constant high memory usage 
as a potential sign that the host could maybe need some more memory + for: 1h + labels: + serverity: info + annotations: + summary: "Reduced available memory on {{ $labels.instance }}" + + - alert: HighMemoryPressure + # For at least 5 seconds over the last 2 Minutes, no progress could be made + # due to memory congestion + expr: increase(node_pressure_memory_stalled_seconds_total[2m]) >= 5 + labels: + serverity: error + annotations: + summary: "Memory preassure on {{ $labels.instance }}" + + - alert: VeryHighLoad + expr: ((node_load1 / node_cpu_count) > 1) or ((node_load5 / node_cpu_count) > 0.9) + for: 5m + labels: + serverity: error + annotations: + summary: "Very High load on {{ $labels.instance }}" + + - alert: HighLoad + expr: ((node_load1 / node_cpu_count) > 0.9) or ((node_load5 / node_cpu_count) > 0.8) + for: 10m + labels: + serverity: warn + annotations: + summary: "High load on {{ $labels.instance }}" + + - alert: ElevatedLoad + # Detecting a long-term increased load + expr: (node_load15 / node_cpu_count) > 0.7 + for: 15m + labels: + serverity: info + annotations: + summary: "Elevated load15 on {{ $labels.instance }}" + + - alert: LowFreeDiskSpace + expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 15 + for: 10m + labels: + serverity: info + annotations: + summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% space left" + + - alert: VeryLowFreeDiskSpace + expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5 + for: 3m + labels: + serverity: error + annotations: + summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% space left" + + - name: Prometheus Alertmanager + rules: + - alert: AlertmanagerMissing + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 5m + labels: + serverity: warn + annotations: + summary: "No 
connected alertmanager on {{ $labels.instance }}" + + - alert: DroppedAlertNotifications + expr: increase(prometheus_notifications_dropped_total[5m]) > 0 + labels: + serverity: error + annotations: + summary: "Dropped alert notifications on {{ $labels.instance }}" + + - alert: FailingAlertmanagerNotifications + expr: increase(alertmanager_notifications_failed_total[5m]) > 0 + labels: + serverity: error + annotations: + summary: "Failing notifications via {{ $labels.integration }} on {{ $labels.instance }}" + + - alert: FailingRuleEvaluation + expr: sum by (instance) (increase(prometheus_rule_evaluation_failures_total[5m])) > 0 + labels: + serverity: warn + annotations: + summary: "Failing rule evaluations on {{ $labels.instance }}"