From 3acc1865c058aa825b9b0bad9b31687a6df0ae5f Mon Sep 17 00:00:00 2001 From: Moritz 'e1mo' Fromm Date: Sat, 31 Dec 2022 22:40:07 +0100 Subject: [PATCH] services/monitoring: Setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The goal is to create a monitoring setup where each server monitors itself when it comes failing systemd services, disk or RAM filling up, …. In addition each prometheus will monitor remote prometheus and alertmanager instances for signs of failure (e.g. being unreachable, errors in notification delivery, dropping alerts). A lot of metrics (especially histograms from prometheus or alertmanager) are being dropped before ingestion to disk save on space and memory. Depending on how many servers we may or may not have in the future this could probably use some kind of overhaul since we rightnow have n^2 monitoring peer relationships (not even speaking of possible duplicated alerts). --- secrets/all/monitoring.htpasswd | 40 +++++ secrets/all/secrets.yaml | 6 +- services/monitoring/default.nix | 274 ++++++++++++++++++++++++++++++++ services/monitoring/rules.yaml | 139 ++++++++++++++++ 4 files changed, 457 insertions(+), 2 deletions(-) create mode 100644 secrets/all/monitoring.htpasswd create mode 100644 services/monitoring/default.nix create mode 100644 services/monitoring/rules.yaml diff --git a/secrets/all/monitoring.htpasswd b/secrets/all/monitoring.htpasswd new file mode 100644 index 0000000..dd52339 --- /dev/null +++ b/secrets/all/monitoring.htpasswd @@ -0,0 +1,40 @@ +{ + "data": "ENC[AES256_GCM,data:/qJNDvVv7ap7CiCHbxtPEs35RepVZ/jSWeqaKtTMu4ePvZbpX+BDMDPztg==,iv:8p1gnTObH883P/h7QLtCYVbCWvhi6nKhkWNlvazy1CM=,tag:ha+5Mniu0On1TCAGTSCf3A==,type:str]", + "sops": { + "kms": null, + "gcp_kms": null, + "azure_kv": null, + "hc_vault": null, + "age": [ + { + "recipient": "age14ysl953378r2vvy7ft3gwce9xp83pr6wypf5lgx2yjwx2lxra5qs6j8eqe", + "enc": "-----BEGIN AGE ENCRYPTED 
FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBJM21QS0VGNXNZV0NJSVU0\nYUxZaEpMQ0hQNkZPNFNzUjFOM3V2c1ZkaUY4CjdqY2tGaXFOL2liTlBxSitzSkx5\nQmMrTW5EYWZ3RWY0dEZ6NldVSkhBN28KLS0tIDEzamtzdEFQam5ZU3pJY1p3Lzh5\nY2wwdmpIWktWMEwyb09mVDh4Ujd2TUEKiIxFNC5fk38+GvBPIP0RygwA4uw49Uht\nfabstjYfQACI5Auc2IUL7nWPqVTT61obbogbGPfgRkOU/lnU7lhvtw==\n-----END AGE ENCRYPTED FILE-----\n" + }, + { + "recipient": "age1w3wqxt5t00hjv43dcxlr5rjec5mvuzz9ajc8k04azq0gfx0ncgysu6mdmm", + "enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBIQzUraTdVYkg3THQzSXlS\neEtqSUNBMEIvSDZQNWVhNXdBaklPenk4aFhRCkU2UDE5UWZSdEFGV0tPYmxteUlh\nd2hZNXdXYSthK3dmalNrdi9MdE9ybm8KLS0tIFRTc3g2RFR3VWtyaDJBb2UvMWU3\nVkw5QW03djdyd2tCUndNNlpWcW1iUDgKF01T2t0pqk6O7gQh9xYn+KqtLXZRr3v3\nIjhMh46UOrm5OYSDqkKKFEU6Tz6PmHii9ClcC9ejClg+5lBaavlAqg==\n-----END AGE ENCRYPTED FILE-----\n" + } + ], + "lastmodified": "2022-12-30T14:49:56Z", + "mac": "ENC[AES256_GCM,data:vCOGAJDFg1HH2+JwWqKWaSUAq0IS1voaTphRQHWPjXzgJ2eeDbXQQPEkp9Rk5Q5/zfPMdGgw8oR1OjWAv+KcaE/+t8PKPOd1B6a50VkFtgHYuHbunRai3RLMfexZMPfC7QvO5TLLNM4/OzrDsdVozPecj4LXpMrrC9rpacBCnWc=,iv:UtQvqNXzMC/kb0l0sG16PCZSIfacO7jEa88x5Y7eDS4=,tag:yXa2ohlBnh+kqJzstUYIWQ==,type:str]", + "pgp": [ + { + "created_at": "2022-12-30T14:49:56Z", + "enc": "-----BEGIN PGP MESSAGE-----\n\nhF4DOnsoj685gdcSAQdA7WmJAs7/hgq93M3SphsDJv9m/oOLhf3ftXyngoj4MVkw\n4xKE1ZCN4LErd+/enCdGLeCkzxIFL7lPc1Q1BaCJZwAZjQIhwUFtK2ORWeXK+y65\n0l4Bon20voZRVNjqiU5XuzWTNUQOStiq30qf5MdCYTfyXQI7fpwYsIWoL3T3en41\nNsIHoDYmXgfoIGDHWmkFEW7YRy04/9wUvWnX67eQIEB9WPvX0s1nUnq0t/x7BVem\n=4Zp6\n-----END PGP MESSAGE-----\n", + "fp": "0x6D617FD0A85BAADA" + }, + { + "created_at": "2022-12-30T14:49:56Z", + "enc": "-----BEGIN PGP 
MESSAGE-----\n\nhF4D6iFd6webPCUSAQdAk5UEGAJw64qu9tVkeaGGDkX4+9awdb3APedVb5EZPXIw\n46V4iFaF6U6mry2BqbPOzBZxE0AzQtmVR/H3SfPlrqJYak98lhYrRMfMjTs+7DAy\n0l4B4WIodnYaPhkhXuoq2gglCCyQ8FN3yUoDfYcIRgmmBtuwp+t3rPe4pJpDkf1t\nNyHhuKWRKtLaYc9YJMq6smpCICFFYV8RohVUmK6ucWGHJVC5wTX5+Cz/UZv+qpwB\n=tQa7\n-----END PGP MESSAGE-----\n", + "fp": "0xE0262A773B824745" + }, + { + "created_at": "2022-12-30T14:49:56Z", + "enc": "-----BEGIN PGP MESSAGE-----\n\nhQIMAxFMPvz46t7rAQ/9FjFVuDmr8uuPAGozef2avvej0G8XfJsKHtuVCCUK9IuC\n+dyGQ+stw5bj5BgwepACk7eeFkt+Dr0Xqmit5Q+HrnkEdKt5l5SJXiBkqsU7PwYP\neKL7LmqTQ6KdWhzu1zcL7jdQeKGJagvgaoT1t5Os8sd8Jl4SvzLWNQdVb4RkbU6p\nnBLAHTlTlH5f8MA/nkGO/ku3zs1Hfjg4laI2rWvbqSv7YouJxU2bpZ5KdGlyNMD7\nnjY0joe0EIm/CkY8s5FcOyzu299Q23fYsMKtHDs7fP7MgD7OriKh9Q3s443B++Fg\nK70Ngz2aZNs5liWG+ce+zrSL6QE2+zVtYKb116kMfFFPlWUxZTa0UcHRWJ5mqQV7\nPgum2yMFE6XYpsqlv2k1fnsSAagU8VceddYw+BZwfFJtIXhWIubvXpa3FV3xHdtD\nqJyuA+hE8rwUM6OPiwLBQmaqkywUhNjS7kEhdxTfEvmHWWQKmqqEC/27rooK2bLX\nPWNmkm3XYZeoNXB4aJY16I+lv9cuWr4zFwNF1GFx2zg3hkLU6v8zqN4NuFxa0l50\nTfTZTGYak+whMtdIbIehbjKnZ7nuKrD3agsvGsH4vd02oCpucBjz5xEhDDuoSWFb\n3QqWiPqyfTGWAuo5Xb1S4JyXkLnt8KfXtlgfYlMfW3YmbB9LcPI3e1Hg6c7YwFzS\nXgFBDmD+OY+CFYMsewrke5324Jco+CDEm6G/VKdJg4YS8RDA3J3/CT0zel8Th63J\nOGfKr8+/ilgNqFXv5uxyW6JuIrMcsBRtmosq56lCDlxp0SaLmeVpX71r3opcUWo=\n=a73T\n-----END PGP MESSAGE-----\n", + "fp": "B1480CFF9BBE8E2648A26A640B2E7C171E3AD6D7" + } + ], + "unencrypted_suffix": "_unencrypted", + "version": "3.7.3" + } +} \ No newline at end of file diff --git a/secrets/all/secrets.yaml b/secrets/all/secrets.yaml index d27bb15..93c4b28 100644 --- a/secrets/all/secrets.yaml +++ b/secrets/all/secrets.yaml @@ -4,6 +4,8 @@ smtp: user: ENC[AES256_GCM,data:cyxh+oTM7MdF31+umMplmA==,iv:maKuY4iXQCg10O8TTenBUl6tMJKt54AycfKQW6sheVw=,tag:4DAUiBBgO5H9O7DwLYZJtQ==,type:str] pass: ENC[AES256_GCM,data:CkpgvLnQGBzShLkDaHtOHgZ9zwlsEJ+9KFBmHLgBol9Pcadww040clNM0WJgz8NfDIoYF8M=,iv:OIxuwC/XQOBBKdkRb6/5X8pNPLJNKAEI/WCYKhFScPw=,tag:CoAFk7n5AeSue18OXZvjKA==,type:str] host: 
ENC[AES256_GCM,data:psXf+lg9433jS1GxH6HQQQ==,iv:wyq/VgWLNs4RbzxNHf4Ave8ZDhQeNPpYXJIdjLLMlr4=,tag:pQ4Wmf+PzSBsHO3eigx91Q==,type:str] +alertmanager: + env: ENC[AES256_GCM,data:eYytFrXgsm4NCyS5F3oVNKsXwf5RL+zEFomMszP4fzb0HkLBk659uWZmqi7kIe8ptyUqqr6oAIV31alFo2TwpWuImvh4gbmaD0sfzesBmqhXyeA3cgi3EMGfMxsdLgUpHKGuJhgrRGLY9jDRI6VIwuLng1YmRw==,iv:YQYwC+Xef9I5IdUsBlTg3g7yRxWJe5Pi1LCur9g9/8w=,tag:U/A2lYu79urWjBFraQh16w==,type:str] sops: kms: [] gcp_kms: [] @@ -28,8 +30,8 @@ sops: UXludFJ5UDEvN2pHMEQweWIvN0NYaFEKB7lVKrsB3eX77iKWwFAXp7LVl+fPcGOl 8CkIBa/rkSWMe0xIetew60wIwx2ZVAv5TTDmKIZyQSTayOpbG3zcdg== -----END AGE ENCRYPTED FILE----- - lastmodified: "2022-12-24T21:21:02Z" - mac: ENC[AES256_GCM,data:GbEO+lLUeswXR6Dc9kGlR9YJhXWh7Io6Z/qciolwkrzIwZAA6l6kaWCdw6OJ6gSNBGhWNye+VDdDV2Ow1Ix1hjnZdl3MUlY/LEEmzP1uUjPvNnEjZjxC8ebzNg3rgVqT6W6q9S7D9nPmAzJMgMlzypR/b1ZX5QuzodhotsleVNM=,iv:TJX940f70Vka+hsfDSvm7Jj0klFylRwB+9XbA9pFkRo=,tag:uaJi+5UI/bppLw+hOgyiDg==,type:str] + lastmodified: "2022-12-30T22:26:42Z" + mac: ENC[AES256_GCM,data:KcJVntK3kAMH64ZHYoJmDCA7BroYJZhmMvtgbnjI44h5cwT9TyDOso2HO/x2z5XzTfPQfbW3tfhT1UmbaBHxBTkV7gJGU9fJ1xOrCRDoHuI9RTwGnCHKwQPLIWj56NMiC2CMf00pFpUlAPyDP/VHwRsc/laUPne2Da1MKZp/NOk=,iv:iHONd6xTjfVZNFELQVafAiljWYavu1DOZrS+d7pd8Zw=,tag:1Zh32h2OEtqKPoXy+8pypw==,type:str] pgp: - created_at: "2022-12-23T19:08:38Z" enc: | diff --git a/services/monitoring/default.nix b/services/monitoring/default.nix new file mode 100644 index 0000000..d29233f --- /dev/null +++ b/services/monitoring/default.nix @@ -0,0 +1,274 @@ +{ + config, + lib, + outputs, + ... +}: let + inherit (lib) concatStringsSep mapAttrsToList hasAttrByPath getAttrFromPath filterAttrs substring singleton optionalString optional; + inherit (lib) escapeRegex; + inherit (config.networking) fqdn hostName; + + # Basically a manual list of (legacy) hosts not yet migrated to NixOS + # but on which we'd like to have included in the monitoring. 
+ externalTargets = let + host = hostName: { # minimal stand-in exposing only what hasEnabled/monTarget read: hostName plus enable/port of prometheus and alertmanager + _module.args.baseDomain = "chaos.jetzt"; + config = { + networking = { + inherit hostName; + domain = "net.chaos.jetzt"; + }; + services.prometheus = { + enable = true; + port = 9090; + alertmanager = { + enable = true; + port = 9093; + }; + }; + }; + }; + in { + hopper = host "hopper"; + lovelace = host "lovelace"; + }; + + monDomain = "mon.${config.networking.domain}"; + + isMe = host: host.config.networking.fqdn == fqdn; # true for the configuration whose fqdn equals this host's + others = filterAttrs (_: host: !(isMe host)) outputs.nixosConfigurations; # all NixOS configurations except this host's; filterAttrs predicates take name AND value + isDev = host: (substring 0 3 host._module.args.baseDomain) == "dev"; # baseDomain starts with "dev" + allHosts = outputs.nixosConfigurations // externalTargets; + /* + Right now we only have one non-dev host in our NixOS setup (the ansible hosts don't monitor the NixOS hosts). + That's why we currently add all hosts to our little monitoring "cluster". As soon as we have two or more production hosts, + the dev host can be taken out of the equation + */ + # allTargets = filterAttrs (_: c: (isMe c) || !(isDev c)) allHosts; + allTargets = allHosts; + + # monFqdn = config: "${config.networking.hostName}.${monDomain}"; + hasEnabled = servicePath: config: let # true iff <servicePath>.enable exists in config and evaluates to true + path = servicePath ++ ["enable"]; + in + (hasAttrByPath path config) && (getAttrFromPath path config); + + monTarget = servicePath: config: let # scrape target "<hostName>.<monDomain>:<port of servicePath>" + port = toString (getAttrFromPath (servicePath ++ ["port"]) config); + in "${config.networking.hostName}.${monDomain}:${port}"; + + dropMetrics = {wildcard ? 
true}: extraRegexen: let + dropRegexen = [ "go_" "promhttp_metric_handler_requests_" ] ++ extraRegexen; + in + singleton { + # Relabel regexes are fully anchored, so the trailing ".*" (wildcard = true) turns every entry into a metric-name prefix match + regex = "(${concatStringsSep "|" dropRegexen})${optionalString wildcard ".*"}"; + source_labels = ["__name__"]; + action = "drop"; + }; + + relabelInstance = { # rewrites "<host>.<monDomain>:<port>" addresses to instance="<host>" + source_labels = ["__address__"]; + regex = "(\\w+)\\.${escapeRegex monDomain}\\:\\d*"; + target_label = "instance"; + }; + + prometheusPath = ["services" "prometheus"]; + alertmanagerPath = ["services" "prometheus" "alertmanager"]; + targetAllHosts = servicePath: # scrape targets of every monitored host that has the given service enabled + mapAttrsToList + (_: config: monTarget servicePath config.config) + (filterAttrs (_: c: (hasEnabled servicePath c.config)) allTargets); +in { + /* + Steps to edit the monitoring.htpasswd (i.e. adding yourself / updating your password): + + 1. `sops -d secrets/all/monitoring.htpasswd > /tmp/monitoring.htpasswd` + 2. Use `htpasswd` (from the `apacheHttpd` package) to your heart's content + 3. `sops -e /tmp/monitoring.htpasswd > secrets/all/monitoring.htpasswd` + 4. 
`rm /tmp/monitoring.htpasswd` + */ + sops.secrets = { + "monitoring.htpasswd" = { + format = "binary"; + owner = config.services.nginx.user; + sopsFile = ../../secrets/all/monitoring.htpasswd; + }; + "alertmanager/env" = { + format = "yaml"; + sopsFile = ../../secrets/all/secrets.yaml; + }; + }; + + services.nginx.virtualHosts."${fqdn}" = let + monitoring_htpasswd = config.sops.secrets."monitoring.htpasswd".path; + in { + enableACME = true; + forceSSL = true; + locations."/prometheus/" = { + basicAuthFile = monitoring_htpasswd; + proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.port}/"; + }; + locations."/alertmanager/" = { + basicAuthFile = monitoring_htpasswd; + proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.alertmanager.port}/"; + }; + }; + + services.prometheus.exporters.node = { + enable = true; + enabledCollectors = ["systemd"]; + # These either don't apply to us or would only provide metrics that are not useful to us + disabledCollectors = [ + "arp" + "bcache" + "bonding" + "btrfs" + "cpufreq" + "edac" + "entropy" + "infiniband" + "rapl" + "selinux" + "timex" + ]; + }; + + services.prometheus = { + enable = true; + webExternalUrl = "https://${fqdn}/prometheus/"; + extraFlags = [ + "--web.route-prefix=\"/\"" + "--web.enable-admin-api" # NOTE(review): admin API becomes reachable through the basic-auth protected /prometheus/ proxy - confirm this is intended + ]; + ruleFiles = [ + ./rules.yaml + ]; + retentionTime = "30d"; + + alertmanagers = [{ + static_configs = [{ + targets = [(monTarget alertmanagerPath config)]; + }]; + }]; + + scrapeConfigs = [ + { + job_name = "node"; + static_configs = [{ + targets = [ + # Only scrape our own node-exporter + (monTarget ["services" "prometheus" "exporters" "node"] config) + ]; + }]; + relabel_configs = [relabelInstance]; + metric_relabel_configs = dropMetrics {} []; + } + { + job_name = "alertmanager"; + static_configs = [{ + targets = targetAllHosts alertmanagerPath; + }]; + relabel_configs = [relabelInstance]; + metric_relabel_configs = dropMetrics {} [ + 
"alertmanager_http_(response_size_bytes|request_duration_seconds)_" + "alertmanager_notification_latency_seconds_" + "alertmanager_(nflog|cluster)_" + "alertmanager_silences_(query_duration_seconds|gc)_" + ]; + } + { + job_name = "prometheus"; + static_configs = [{ + targets = targetAllHosts prometheusPath; + }]; + relabel_configs = [relabelInstance]; + metric_relabel_configs = dropMetrics {} [ + "prometheus_(sd|tsdb|target)_" + "prometheus_(engine_query|rule_evaluation)_duration_" + "prometheus_http_(response_size_bytes|request_duration_seconds)_" + "net_conntrack_dialer_conn_" + ]; + } + ]; + }; + + services.prometheus.alertmanager = { + enable = true; + extraFlags = ["--web.route-prefix=\"/\"" "--cluster.listen-address="]; + webExternalUrl = "https://${fqdn}/alertmanager/"; + environmentFile = config.sops.secrets."alertmanager/env".path; + + configuration = { + global = { + smtp_from = "Chaos-Jetzt Monitoring (${hostName}) "; # NOTE(review): display name only, the <address> part appears to be missing - verify against the original commit + smtp_smarthost = "\${SMTP_HOST}:587"; + smtp_auth_username = "\${SMTP_USER}"; + smtp_auth_password = "\${SMTP_PASS}"; + smtp_hello = config.networking.fqdn; + }; + + receivers = [{ + name = "mail"; + email_configs = [ + { to = "jetzt+mon@e1mo.de"; + send_resolved = true; } + { to = "info@adb.sh"; + send_resolved = true; } + ]; + }]; + + route = { + receiver = "mail"; + repeat_interval = "16h"; + group_wait = "1m"; + group_by = ["alertname" "instance"]; + routes = [ # child routes key on the "severity" label set by every alert in ./rules.yaml + { + match.severity = "critical"; + receiver = "mail"; + repeat_interval = "6h"; + } + { + match.severity = "error"; + receiver = "mail"; + repeat_interval = "16h"; + } + { + match.severity = "warn"; + receiver = "mail"; + repeat_interval = "28h"; + } + { + match.severity = "info"; + receiver = "mail"; + repeat_interval = "56h"; + } + ]; + }; + + inhibit_rules = [ # a firing higher-tier alert mutes the lower tiers for the same instance + { + target_matchers = ["alertname = ReducedAvailableMemory"]; + source_matchers = ["alertname =~ (Very)?LowAvailableMemory"]; + equal = ["instance"]; + } + { + target_matchers = ["alertname = LowAvailableMemory"]; + 
source_matchers = ["alertname = VeryLowAvailableMemory"]; + equal = ["instance"]; + } + { + target_matchers = ["alertname = ElevatedLoad"]; + source_matchers = ["alertname =~ (Very)?HighLoad"]; + equal = ["instance"]; + } + { + target_matchers = ["alertname = HighLoad"]; + source_matchers = ["alertname = VeryHighLoad"]; + equal = ["instance"]; + } + ]; + }; + }; +} diff --git a/services/monitoring/rules.yaml b/services/monitoring/rules.yaml new file mode 100644 index 0000000..7827c86 --- /dev/null +++ b/services/monitoring/rules.yaml @@ -0,0 +1,139 @@ +--- +groups: + - name: General system + rules: + - record: node_memory_MemAvailable_percentage + expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 + - record: node_cpu_count + expr: count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) + + - alert: InstanceDown + expr: up == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "{{ $labels.instance }} down" + + - alert: SystemdServiceFailed + expr: node_systemd_unit_state{state="failed"} > 0 + for: 5m + labels: + severity: error + annotations: + summary: "{{ $labels.name }} on {{ $labels.instance }} crashed" + + - alert: OomKill + expr: increase(node_vmstat_oom_kill[5m]) > 0 # Scrape interval of 1 minute + for: 0m + labels: + severity: error + annotations: + summary: "OOM kill on {{ $labels.instance }}" + + - alert: VeryLowAvailableMemory + # Less than 3% available memory or less than 100 KiB free memory + expr: ((node_memory_MemAvailable_percentage < 3) or (node_memory_MemFree_bytes < (1024 * 100))) + for: 2m + labels: + severity: critical + annotations: + summary: "Very low free memory on {{ $labels.instance }}" + + - alert: LowAvailableMemory + expr: node_memory_MemAvailable_percentage < 10 + for: 5m + labels: + severity: warn + annotations: + summary: "Low free memory on {{ $labels.instance }}" + + - alert: ReducedAvailableMemory + expr: node_memory_MemAvailable_percentage < 20 + # Detect constant high memory usage 
as a potential sign that the host could maybe need some more memory + for: 1h + labels: + severity: info + annotations: + summary: "Reduced available memory on {{ $labels.instance }}" + + - alert: HighMemoryPressure + # For at least 5 seconds over the last 2 minutes, no progress could be made + # due to memory congestion + expr: increase(node_pressure_memory_stalled_seconds_total[2m]) >= 5 + labels: + severity: error + annotations: + summary: "Memory pressure on {{ $labels.instance }}" + + - alert: VeryHighLoad + expr: ((node_load1 / node_cpu_count) > 1) or ((node_load5 / node_cpu_count) > 0.9) + for: 5m + labels: + severity: error + annotations: + summary: "Very High load on {{ $labels.instance }}" + + - alert: HighLoad + expr: ((node_load1 / node_cpu_count) > 0.9) or ((node_load5 / node_cpu_count) > 0.8) + for: 10m + labels: + severity: warn + annotations: + summary: "High load on {{ $labels.instance }}" + + - alert: ElevatedLoad + # Detecting a long-term increased load + expr: (node_load15 / node_cpu_count) > 0.7 + for: 15m + labels: + severity: info + annotations: + summary: "Elevated load15 on {{ $labels.instance }}" + + - alert: LowFreeDiskSpace + expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 15 + for: 10m + labels: + severity: info + annotations: + summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% space left" + + - alert: VeryLowFreeDiskSpace + expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5 + for: 3m + labels: + severity: error + annotations: + summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% space left" + + - name: Prometheus Alertmanager + rules: + - alert: AlertmanagerMissing + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 5m + labels: + severity: warn + annotations: + summary: "No 
connected alertmanager on {{ $labels.instance }}" + + - alert: AlertNotificationsDropped + expr: increase(prometheus_notifications_dropped_total[5m]) > 0 + labels: + severity: error + annotations: + summary: "Dropped alert notifications on {{ $labels.instance }}" + + - alert: FailingAlertmanagerNotifications + expr: increase(alertmanager_notifications_failed_total[5m]) > 0 + labels: + severity: error + annotations: + summary: "Failing notifications via {{ $labels.integration }} on {{ $labels.instance }}" + + - alert: FailingRuleEvaluation + expr: sum by (instance) (increase(prometheus_rule_evaluation_failures_total[5m])) > 0 + labels: + severity: warn + annotations: + summary: "Failing rule evaluations on {{ $labels.instance }}"