services/monitoring: Setup

The goal is to create a monitoring setup in which each server monitors
itself for failing systemd services, disks or RAM filling up, ….
In addition, each Prometheus will monitor the remote Prometheus and
Alertmanager instances for signs of failure (e.g. being unreachable,
errors in notification delivery, alerts being dropped).

A lot of metrics (especially histograms from Prometheus and Alertmanager)
are dropped before ingestion to save on disk space and memory.

Depending on how many servers we end up with, this could probably use
some kind of overhaul, since right now we have n^2 monitoring peer
relationships (not to mention possibly duplicated alerts).
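
To sketch what that dropping looks like (a minimal illustration; the full
regex lists live in the module below), every scrape job carries
metric_relabel_configs entries of this shape:

metric_relabel_configs = [{
  source_labels = ["__name__"];
  regex = "(go_|promhttp_metric_handler_requests_).*";
  action = "drop";
}];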
Moritz 'e1mo' Fromm, 2022-12-31 22:40:07 +01:00
parent 6e9c9a26cd, commit 3acc1865c0
GPG key ID: 1D5D79A439E787F1
4 changed files with 457 additions and 2 deletions

@@ -0,0 +1,40 @@
{
"data": "ENC[AES256_GCM,data:/qJNDvVv7ap7CiCHbxtPEs35RepVZ/jSWeqaKtTMu4ePvZbpX+BDMDPztg==,iv:8p1gnTObH883P/h7QLtCYVbCWvhi6nKhkWNlvazy1CM=,tag:ha+5Mniu0On1TCAGTSCf3A==,type:str]",
"sops": {
"kms": null,
"gcp_kms": null,
"azure_kv": null,
"hc_vault": null,
"age": [
{
"recipient": "age14ysl953378r2vvy7ft3gwce9xp83pr6wypf5lgx2yjwx2lxra5qs6j8eqe",
"enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBJM21QS0VGNXNZV0NJSVU0\nYUxZaEpMQ0hQNkZPNFNzUjFOM3V2c1ZkaUY4CjdqY2tGaXFOL2liTlBxSitzSkx5\nQmMrTW5EYWZ3RWY0dEZ6NldVSkhBN28KLS0tIDEzamtzdEFQam5ZU3pJY1p3Lzh5\nY2wwdmpIWktWMEwyb09mVDh4Ujd2TUEKiIxFNC5fk38+GvBPIP0RygwA4uw49Uht\nfabstjYfQACI5Auc2IUL7nWPqVTT61obbogbGPfgRkOU/lnU7lhvtw==\n-----END AGE ENCRYPTED FILE-----\n"
},
{
"recipient": "age1w3wqxt5t00hjv43dcxlr5rjec5mvuzz9ajc8k04azq0gfx0ncgysu6mdmm",
"enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBIQzUraTdVYkg3THQzSXlS\neEtqSUNBMEIvSDZQNWVhNXdBaklPenk4aFhRCkU2UDE5UWZSdEFGV0tPYmxteUlh\nd2hZNXdXYSthK3dmalNrdi9MdE9ybm8KLS0tIFRTc3g2RFR3VWtyaDJBb2UvMWU3\nVkw5QW03djdyd2tCUndNNlpWcW1iUDgKF01T2t0pqk6O7gQh9xYn+KqtLXZRr3v3\nIjhMh46UOrm5OYSDqkKKFEU6Tz6PmHii9ClcC9ejClg+5lBaavlAqg==\n-----END AGE ENCRYPTED FILE-----\n"
}
],
"lastmodified": "2022-12-30T14:49:56Z",
"mac": "ENC[AES256_GCM,data:vCOGAJDFg1HH2+JwWqKWaSUAq0IS1voaTphRQHWPjXzgJ2eeDbXQQPEkp9Rk5Q5/zfPMdGgw8oR1OjWAv+KcaE/+t8PKPOd1B6a50VkFtgHYuHbunRai3RLMfexZMPfC7QvO5TLLNM4/OzrDsdVozPecj4LXpMrrC9rpacBCnWc=,iv:UtQvqNXzMC/kb0l0sG16PCZSIfacO7jEa88x5Y7eDS4=,tag:yXa2ohlBnh+kqJzstUYIWQ==,type:str]",
"pgp": [
{
"created_at": "2022-12-30T14:49:56Z",
"enc": "-----BEGIN PGP MESSAGE-----\n\nhF4DOnsoj685gdcSAQdA7WmJAs7/hgq93M3SphsDJv9m/oOLhf3ftXyngoj4MVkw\n4xKE1ZCN4LErd+/enCdGLeCkzxIFL7lPc1Q1BaCJZwAZjQIhwUFtK2ORWeXK+y65\n0l4Bon20voZRVNjqiU5XuzWTNUQOStiq30qf5MdCYTfyXQI7fpwYsIWoL3T3en41\nNsIHoDYmXgfoIGDHWmkFEW7YRy04/9wUvWnX67eQIEB9WPvX0s1nUnq0t/x7BVem\n=4Zp6\n-----END PGP MESSAGE-----\n",
"fp": "0x6D617FD0A85BAADA"
},
{
"created_at": "2022-12-30T14:49:56Z",
"enc": "-----BEGIN PGP MESSAGE-----\n\nhF4D6iFd6webPCUSAQdAk5UEGAJw64qu9tVkeaGGDkX4+9awdb3APedVb5EZPXIw\n46V4iFaF6U6mry2BqbPOzBZxE0AzQtmVR/H3SfPlrqJYak98lhYrRMfMjTs+7DAy\n0l4B4WIodnYaPhkhXuoq2gglCCyQ8FN3yUoDfYcIRgmmBtuwp+t3rPe4pJpDkf1t\nNyHhuKWRKtLaYc9YJMq6smpCICFFYV8RohVUmK6ucWGHJVC5wTX5+Cz/UZv+qpwB\n=tQa7\n-----END PGP MESSAGE-----\n",
"fp": "0xE0262A773B824745"
},
{
"created_at": "2022-12-30T14:49:56Z",
"enc": "-----BEGIN PGP MESSAGE-----\n\nhQIMAxFMPvz46t7rAQ/9FjFVuDmr8uuPAGozef2avvej0G8XfJsKHtuVCCUK9IuC\n+dyGQ+stw5bj5BgwepACk7eeFkt+Dr0Xqmit5Q+HrnkEdKt5l5SJXiBkqsU7PwYP\neKL7LmqTQ6KdWhzu1zcL7jdQeKGJagvgaoT1t5Os8sd8Jl4SvzLWNQdVb4RkbU6p\nnBLAHTlTlH5f8MA/nkGO/ku3zs1Hfjg4laI2rWvbqSv7YouJxU2bpZ5KdGlyNMD7\nnjY0joe0EIm/CkY8s5FcOyzu299Q23fYsMKtHDs7fP7MgD7OriKh9Q3s443B++Fg\nK70Ngz2aZNs5liWG+ce+zrSL6QE2+zVtYKb116kMfFFPlWUxZTa0UcHRWJ5mqQV7\nPgum2yMFE6XYpsqlv2k1fnsSAagU8VceddYw+BZwfFJtIXhWIubvXpa3FV3xHdtD\nqJyuA+hE8rwUM6OPiwLBQmaqkywUhNjS7kEhdxTfEvmHWWQKmqqEC/27rooK2bLX\nPWNmkm3XYZeoNXB4aJY16I+lv9cuWr4zFwNF1GFx2zg3hkLU6v8zqN4NuFxa0l50\nTfTZTGYak+whMtdIbIehbjKnZ7nuKrD3agsvGsH4vd02oCpucBjz5xEhDDuoSWFb\n3QqWiPqyfTGWAuo5Xb1S4JyXkLnt8KfXtlgfYlMfW3YmbB9LcPI3e1Hg6c7YwFzS\nXgFBDmD+OY+CFYMsewrke5324Jco+CDEm6G/VKdJg4YS8RDA3J3/CT0zel8Th63J\nOGfKr8+/ilgNqFXv5uxyW6JuIrMcsBRtmosq56lCDlxp0SaLmeVpX71r3opcUWo=\n=a73T\n-----END PGP MESSAGE-----\n",
"fp": "B1480CFF9BBE8E2648A26A640B2E7C171E3AD6D7"
}
],
"unencrypted_suffix": "_unencrypted",
"version": "3.7.3"
}
}

@@ -4,6 +4,8 @@ smtp:
     user: ENC[AES256_GCM,data:cyxh+oTM7MdF31+umMplmA==,iv:maKuY4iXQCg10O8TTenBUl6tMJKt54AycfKQW6sheVw=,tag:4DAUiBBgO5H9O7DwLYZJtQ==,type:str]
     pass: ENC[AES256_GCM,data:CkpgvLnQGBzShLkDaHtOHgZ9zwlsEJ+9KFBmHLgBol9Pcadww040clNM0WJgz8NfDIoYF8M=,iv:OIxuwC/XQOBBKdkRb6/5X8pNPLJNKAEI/WCYKhFScPw=,tag:CoAFk7n5AeSue18OXZvjKA==,type:str]
     host: ENC[AES256_GCM,data:psXf+lg9433jS1GxH6HQQQ==,iv:wyq/VgWLNs4RbzxNHf4Ave8ZDhQeNPpYXJIdjLLMlr4=,tag:pQ4Wmf+PzSBsHO3eigx91Q==,type:str]
+alertmanager:
+    env: ENC[AES256_GCM,data:eYytFrXgsm4NCyS5F3oVNKsXwf5RL+zEFomMszP4fzb0HkLBk659uWZmqi7kIe8ptyUqqr6oAIV31alFo2TwpWuImvh4gbmaD0sfzesBmqhXyeA3cgi3EMGfMxsdLgUpHKGuJhgrRGLY9jDRI6VIwuLng1YmRw==,iv:YQYwC+Xef9I5IdUsBlTg3g7yRxWJe5Pi1LCur9g9/8w=,tag:U/A2lYu79urWjBFraQh16w==,type:str]
 sops:
     kms: []
     gcp_kms: []
@@ -28,8 +30,8 @@ sops:
             UXludFJ5UDEvN2pHMEQweWIvN0NYaFEKB7lVKrsB3eX77iKWwFAXp7LVl+fPcGOl
             8CkIBa/rkSWMe0xIetew60wIwx2ZVAv5TTDmKIZyQSTayOpbG3zcdg==
             -----END AGE ENCRYPTED FILE-----
-    lastmodified: "2022-12-24T21:21:02Z"
-    mac: ENC[AES256_GCM,data:GbEO+lLUeswXR6Dc9kGlR9YJhXWh7Io6Z/qciolwkrzIwZAA6l6kaWCdw6OJ6gSNBGhWNye+VDdDV2Ow1Ix1hjnZdl3MUlY/LEEmzP1uUjPvNnEjZjxC8ebzNg3rgVqT6W6q9S7D9nPmAzJMgMlzypR/b1ZX5QuzodhotsleVNM=,iv:TJX940f70Vka+hsfDSvm7Jj0klFylRwB+9XbA9pFkRo=,tag:uaJi+5UI/bppLw+hOgyiDg==,type:str]
+    lastmodified: "2022-12-30T22:26:42Z"
+    mac: ENC[AES256_GCM,data:KcJVntK3kAMH64ZHYoJmDCA7BroYJZhmMvtgbnjI44h5cwT9TyDOso2HO/x2z5XzTfPQfbW3tfhT1UmbaBHxBTkV7gJGU9fJ1xOrCRDoHuI9RTwGnCHKwQPLIWj56NMiC2CMf00pFpUlAPyDP/VHwRsc/laUPne2Da1MKZp/NOk=,iv:iHONd6xTjfVZNFELQVafAiljWYavu1DOZrS+d7pd8Zw=,tag:1Zh32h2OEtqKPoXy+8pypw==,type:str]
     pgp:
         - created_at: "2022-12-23T19:08:38Z"
           enc: |

@@ -0,0 +1,274 @@
{
config,
lib,
outputs,
...
}: let
inherit (lib) concatStringsSep mapAttrsToList hasAttrByPath getAttrFromPath filterAttrs substring singleton optionalString optional;
inherit (lib) escapeRegex;
inherit (config.networking) fqdn hostName;
# Basically a manually maintained list of (legacy) hosts that haven't been
# migrated to NixOS yet but that we'd like to include in the monitoring.
externalTargets = let
host = hostName: {
_module.args.baseDomain = "chaos.jetzt";
config = {
networking = {
inherit hostName;
domain = "net.chaos.jetzt";
};
services.prometheus = {
enable = true;
port = 9090;
alertmanager = {
enable = true;
port = 9093;
};
};
};
};
in {
hopper = host "hopper";
lovelace = host "lovelace";
};
monDomain = "mon.${config.networking.domain}";
isMe = host: host.config.networking.fqdn == fqdn;
others = filterAttrs (_: host: !(isMe host)) outputs.nixosConfigurations;
isDev = host: (substring 0 3 host._module.args.baseDomain) == "dev";
allHosts = outputs.nixosConfigurations // externalTargets;
/*
Right now we only have one non-dev host in our NixOS setup (the ansible hosts don't monitor the NixOS hosts).
That's why we currently add all hosts to our little monitoring "cluster". As soon as we have two or more production hosts,
the dev host can be taken out of the equation
*/
# allTargets = filterAttrs (_: c: (isMe c) || !(isDev c)) allHosts;
allTargets = allHosts;
# monFqdn = config: "${config.networking.hostName}.${monDomain}";
hasEnabled = servicePath: config: let
path = servicePath ++ ["enable"];
in
(hasAttrByPath path config) && (getAttrFromPath path config);
monTarget = servicePath: config: let
port = toString (getAttrFromPath (servicePath ++ ["port"]) config);
in "${config.networking.hostName}.${monDomain}:${port}";
dropMetrics = {wildcard ? true}: extraRegexen: let
dropRegexen = [ "go_" "promhttp_metric_handler_requests_" ] ++ extraRegexen;
in
singleton {
regex = "(${concatStringsSep "|" dropRegexen})${optionalString wildcard ".*"}";
source_labels = ["__name__"];
action = "drop";
};
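# E.g. dropMetrics {} ["foo_"] yields a single drop rule with regex
# "(go_|promhttp_metric_handler_requests_|foo_).*" ("foo_" being a made-up extra pattern).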
relabelInstance = {
source_labels = ["__address__"];
regex = "(\\w+)\\.${escapeRegex monDomain}\\:\\d*";
target_label = "instance";
};
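# Shortens the instance label: an __address__ like "hopper.mon.net.chaos.jetzt:9090" becomes instance="hopper".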
prometheusPath = ["services" "prometheus"];
alertmanagerPath = ["services" "prometheus" "alertmanager"];
targetAllHosts = servicePath:
mapAttrsToList
(_: config: monTarget servicePath config.config)
(filterAttrs (_: c: (hasEnabled servicePath c.config)) (outputs.nixosConfigurations // externalTargets));
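# E.g. targetAllHosts alertmanagerPath returns the "<host>.mon.<domain>:9093" target
# for every host (NixOS or external) that has alertmanager enabled.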
in {
/*
Steps to edit the monitoring.htpasswd (i.e. adding yourself / updating your password):
1. `sops -d secrets/all/monitoring.htpasswd > /tmp/monitoring.htpasswd`
2. Use `htpasswd` (from the `apacheHttpd` package) to your heart's content, e.g. `htpasswd -B /tmp/monitoring.htpasswd <username>`
3. `sops -e /tmp/monitoring.htpasswd > secrets/all/monitoring.htpasswd`
4. `rm /tmp/monitoring.htpasswd`
*/
sops.secrets = {
"monitoring.htpasswd" = {
format = "binary";
owner = config.services.nginx.user;
sopsFile = ../../secrets/all/monitoring.htpasswd;
};
"alertmanager/env" = {
format = "yaml";
sopsFile = ../../secrets/all/secrets.yaml;
};
};
services.nginx.virtualHosts."${fqdn}" = let
monitoring_htpasswd = config.sops.secrets."monitoring.htpasswd".path;
in {
enableACME = true;
forceSSL = true;
locations."/prometheus/" = {
basicAuthFile = monitoring_htpasswd;
proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.port}/";
};
locations."/alertmanager/" = {
basicAuthFile = monitoring_htpasswd;
proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.alertmanager.port}/";
};
};
services.prometheus.exporters.node = {
enable = true;
enabledCollectors = ["systemd"];
# These either don't apply to us or would provide metrics that aren't useful to us
disabledCollectors = [
"arp"
"bcache"
"bonding"
"btrfs"
"cpufreq"
"edac"
"entropy"
"infiniband"
"rapl"
"selinux"
"timex"
];
};
services.prometheus = {
enable = true;
webExternalUrl = "https://${fqdn}/prometheus/";
extraFlags = [
"--web.route-prefix=\"/\""
"--web.enable-admin-api"
];
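# --web.route-prefix="/" makes Prometheus serve from the root locally; webExternalUrl above
# keeps generated links pointing at the /prometheus/ path behind nginx.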
ruleFiles = [
./rules.yaml
];
retentionTime = "30d";
alertmanagers = [{
static_configs = [{
targets = [(monTarget alertmanagerPath config)];
}];
}];
scrapeConfigs = [
{
job_name = "node";
static_configs = [{
targets = [
# Only scrape our own node exporter
(monTarget ["services" "prometheus" "exporters" "node"] config)
];
}];
relabel_configs = [relabelInstance];
metric_relabel_configs = dropMetrics {} [];
}
{
job_name = "alertmanager";
static_configs = [{
targets = targetAllHosts alertmanagerPath;
}];
relabel_configs = [relabelInstance];
metric_relabel_configs = dropMetrics {} [
"alertmanager_http_(response_size_bytes|request_duration_seconds)_"
"alertmanager_notification_latency_seconds_"
"alertmanager_(nflog|cluster)_"
"alertmanager_silences_(query_duration_seconds|gc)_"
];
}
{
job_name = "prometheus";
static_configs = [{
targets = targetAllHosts prometheusPath;
}];
relabel_configs = [relabelInstance];
metric_relabel_configs = dropMetrics {} [
"prometheus_(sd|tsdb|target)_"
"prometheus_(engine_query|rule_evaluation)_duration_"
"prometheus_http_(response_size_bytes|request_duration_seconds)_"
"net_conntrack_dialer_conn_"
];
}
];
};
services.prometheus.alertmanager = {
enable = true;
extraFlags = ["--web.route-prefix=\"/\"" "--cluster.listen-address="];
webExternalUrl = "https://${fqdn}/alertmanager/";
environmentFile = config.sops.secrets."alertmanager/env".path;
configuration = {
global = {
smtp_from = "Chaos-Jetzt Monitoring (${hostName}) <monitoring-${hostName}@chaos.jetzt>";
smtp_smarthost = "\${SMTP_HOST}:587";
smtp_auth_username = "\${SMTP_USER}";
smtp_auth_password = "\${SMTP_PASS}";
smtp_hello = config.networking.fqdn;
};
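# The ${SMTP_*} placeholders above are substituted from the environmentFile at
# service start (hence the escaped dollar signs in the Nix source).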
receivers = [{
name = "mail";
email_configs = [
{ to = "jetzt+mon@e1mo.de";
send_resolved = true; }
{ to = "info@adb.sh";
send_resolved = true; }
];
}];
route = {
receiver = "mail";
repeat_interval = "16h";
group_wait = "1m";
group_by = ["alertname" "instance"];
routes = [
{
match.severity = "critical";
receiver = "mail";
repeat_interval = "6h";
}
{
match.severity = "error";
receiver = "mail";
repeat_interval = "16h";
}
{
match.severity = "warn";
receiver = "mail";
repeat_interval = "28h";
}
{
match.severity = "info";
receiver = "mail";
repeat_interval = "56h";
}
];
};
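# Inhibitions: while a more severe (source) alert is firing, its less severe
# (target) counterpart is muted for the same instance.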
inhibit_rules = [
{
target_matchers = ["alertname = ReducedAvailableMemory"];
source_matchers = ["alertname =~ (Very)?LowAvailableMemory"];
equal = ["instance"];
}
{
target_matchers = ["alertname = LowAvailableMemory"];
source_matchers = ["alertname = VeryLowAvailableMemory"];
equal = ["instance"];
}
{
target_matchers = ["alertname = ElevatedLoad"];
source_matchers = ["alertname =~ (Very)?HighLoad"];
equal = ["instance"];
}
{
target_matchers = ["alertname = HighLoad"];
source_matchers = ["alertname = VeryHighLoad"];
equal = ["instance"];
}
];
};
};
}

@@ -0,0 +1,139 @@
---
groups:
- name: General system
rules:
- record: node_memory_MemAvailable_percentage
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
- record: node_cpu_count
expr: count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})
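# There is one "idle" mode series per CPU, so counting them yields the CPU count per instance.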
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }} down"
- alert: SystemdServiceFailed
expr: node_systemd_unit_state{state="failed"} > 0
for: 5m
labels:
severity: error
annotations:
summary: "{{ $labels.name }} on {{ $labels.instance }} crashed"
- alert: OomKill
expr: increase(node_vmstat_oom_kill[5m]) > 0 # Scrape interval of 1 minute
for: 0m
labels:
severity: error
annotations:
summary: "OOM kill on {{ $labels.instance }}"
- alert: VeryLowAvailableMemory
# Less than 3% available or less than 100 KiB of free memory
expr: ((node_memory_MemAvailable_percentage < 3) or (node_memory_MemFree_bytes < (1024 * 100)))
for: 2m
labels:
severity: critical
annotations:
summary: "Very low free memory on {{ $labels.instance }}"
- alert: LowAvailableMemory
expr: node_memory_MemAvailable_percentage < 10
for: 5m
labels:
severity: warn
annotations:
summary: "Low free memory on {{ $labels.instance }}"
- alert: ReducedAvailableMemory
expr: node_memory_MemAvailable_percentage < 20
# Detect constantly high memory usage as a potential sign that the host might need more memory
for: 1h
labels:
severity: info
annotations:
summary: "Reduced available memory on {{ $labels.instance }}"
- alert: HighMemoryPressure
# For at least 5 seconds over the last 2 minutes, no progress could be made
# due to memory congestion
expr: increase(node_pressure_memory_stalled_seconds_total[2m]) >= 5
labels:
severity: error
annotations:
summary: "Memory pressure on {{ $labels.instance }}"
- alert: VeryHighLoad
expr: ((node_load1 / node_cpu_count) > 1) or ((node_load5 / node_cpu_count) > 0.9)
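# Load is normalised by CPU count, so a value of 1 means all cores are, on average, fully busy.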
for: 5m
labels:
severity: error
annotations:
summary: "Very high load on {{ $labels.instance }}"
- alert: HighLoad
expr: ((node_load1 / node_cpu_count) > 0.9) or ((node_load5 / node_cpu_count) > 0.8)
for: 10m
labels:
severity: warn
annotations:
summary: "High load on {{ $labels.instance }}"
- alert: ElevatedLoad
# Detecting a long-term increased load
expr: (node_load15 / node_cpu_count) > 0.7
for: 15m
labels:
severity: info
annotations:
summary: "Elevated load15 on {{ $labels.instance }}"
- alert: LowFreeDiskSpace
expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 15
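# The "and ON (...)" join restricts the alert to writable filesystems (read-only mounts are excluded).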
for: 10m
labels:
severity: info
annotations:
summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% space left"
- alert: VeryLowFreeDiskSpace
expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5
for: 3m
labels:
severity: error
annotations:
summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% space left"
- name: Prometheus Alertmanager
rules:
- alert: AlertmanagerMissing
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 5m
labels:
severity: warn
annotations:
summary: "No connected alertmanager on {{ $labels.instance }}"
- alert: DroppedAlertNotifications
expr: increase(prometheus_notifications_dropped_total[5m]) > 0
labels:
severity: error
annotations:
summary: "Dropped alert notifications on {{ $labels.instance }}"
- alert: FailingAlertmanagerNotifications
expr: increase(alertmanager_notifications_failed_total[5m]) > 0
labels:
severity: error
annotations:
summary: "Failing notifications via {{ $labels.integration }} on {{ $labels.instance }}"
- alert: FailingRuleEvaluation
expr: sum by (instance) (increase(prometheus_rule_evaluation_failures_total[5m])) > 0
labels:
severity: warn
annotations:
summary: "Failing rule evaluations on {{ $labels.instance }}"