chaos-jetzt-nixfiles/services/monitoring/default.nix

{
  config,
  lib,
  outputs,
  ...
}: let
  inherit (lib) concatStringsSep mapAttrsToList hasAttrByPath getAttrFromPath filterAttrs substring singleton optionalString optional;
  inherit (lib) escapeRegex;
  inherit (config.networking) fqdn hostName;

  # Hand-maintained stubs for (legacy) hosts that have not been migrated to NixOS yet
  # but which we would still like to include in the monitoring.
  # Each stub carries the fields the helpers in this file look up: hostName,
  # the prometheus/alertmanager enable + port options, and baseDomain.
  externalTargets = let
    mkLegacyHost = name: {
      _module.args.baseDomain = "chaos.jetzt";
      config.networking = {
        hostName = name;
        domain = "net.chaos.jetzt";
      };
      config.services.prometheus = {
        enable = true;
        port = 9090;
        alertmanager.enable = true;
        alertmanager.port = 9093;
      };
    };
  in {
    hopper = mkLegacyHost "hopper";
    lovelace = mkLegacyHost "lovelace";
  };

  # Domain under which the monitoring endpoints of all hosts are addressed,
  # derived from this host's own networking.domain.
  monDomain = "mon.${config.networking.domain}";

  # True if `host` (a nixosConfiguration-like attrset) is the machine being configured.
  isMe = host: host.config.networking.fqdn == fqdn;
  # Every NixOS configuration except our own.
  # The predicate has to apply `isMe` to the host before negating the result;
  # `!isMe` would negate the function itself, which is a type error once forced.
  others = filterAttrs (_: host: !(isMe host)) outputs.nixosConfigurations;
  # True if the first three characters of the host's baseDomain are "dev".
  isDev = host: (substring 0 3 host._module.args.baseDomain) == "dev";
  # NixOS hosts merged with the manually declared legacy hosts
  # (on a name clash the entry from externalTargets wins).
  allHosts = outputs.nixosConfigurations // externalTargets;
  /*
    Right now we only have one non-dev host in our NixOS setup (the ansible hosts don't monitor the NixOS hosts).
    That's why we currently add all hosts to our little monitoring "cluster". As soon as we have two or more
    production hosts, the dev host can be taken out of the equation by switching to the filtered variant below.
  */
  # allTargets = filterAttrs (_: c: (isMe c) || !(isDev c)) allHosts;
  allTargets = allHosts;

  # monFqdn = config: "${config.networking.hostName}.${monDomain}";
  # Whether the service at `servicePath` (a list of attribute names) is switched on
  # in `config`: its `enable` attribute has to exist and evaluate to true.
  hasEnabled = servicePath: config: let
    enablePath = servicePath ++ ["enable"];
    isDeclared = hasAttrByPath enablePath config;
  in
    isDeclared && getAttrFromPath enablePath config;

  # Scrape address "<hostName>.<monDomain>:<port>" for the service at `servicePath`,
  # where the port is the `port` attribute found below that path in `config`.
  monTarget = servicePath: config: let
    host = config.networking.hostName;
    portPath = servicePath ++ ["port"];
  in
    concatStringsSep ":" ["${host}.${monDomain}" (toString (getAttrFromPath portPath config))];

  # Builds a single-element `metric_relabel_configs` list that drops every metric
  # whose name matches one of the default fragments or one of `extraRegexen`.
  #   wildcard     - when true (the default) ".*" is appended, so the fragments act as prefixes
  #   extraRegexen - list of additional regex fragments; all fragments are joined with "|"
  dropMetrics = {wildcard ? true}: extraRegexen: let
    dropRegexen = ["go_" "promhttp_metric_handler_requests_"] ++ extraRegexen;
  in
    singleton {
      # A stray `inherit (regex);` used to sit here. It listed no names to inherit
      # and pointed at an identifier that is not bound in this scope, so it was removed.
      regex = "(${concatStringsSep "|" dropRegexen})${optionalString wildcard ".*"}";
      source_labels = ["__name__"];
      action = "drop";
    };

  # Relabel rule writing to the `instance` label, based on `__address__` values of the
  # form "<host>.<monDomain>:<port>". Only the host part sits in a capture group.
  # NOTE(review): relies on Prometheus' default replacement (presumably "$1") - confirm.
  relabelInstance = let
    hostPattern = "(\\w+)";
    portPattern = "\\:\\d*";
  in {
    target_label = "instance";
    source_labels = ["__address__"];
    regex = "${hostPattern}\\.${escapeRegex monDomain}${portPattern}";
  };

  # Attribute paths (inside a host's `config`) of the services scraped on every host.
  prometheusPath = ["services" "prometheus"];
  alertmanagerPath = ["services" "prometheus" "alertmanager"];
  # Scrape targets of every host in `allTargets` that has the service at
  # `servicePath` enabled.
  # `allTargets` is used instead of merging nixosConfigurations and externalTargets
  # again, so the host selection is decided in exactly one place. The lambda
  # argument is called `host` to avoid shadowing the module's own `config`.
  targetAllHosts = servicePath:
    mapAttrsToList
    (_: host: monTarget servicePath host.config)
    (filterAttrs (_: host: hasEnabled servicePath host.config) allTargets);
in {
  /*
  How to edit monitoring.htpasswd (i.e. adding yourself / updating your password):

  1. Use `htpasswd` (from the `apacheHttpd` package) to generate the hashed password
  2. Run `sops secrets/all/monitoring.htpasswd` and replace/add the specific lines
  */
  # htpasswd file used for basic auth by nginx; owned by the nginx user.
  sops.secrets."monitoring.htpasswd" = {
    sopsFile = ../../secrets/all/monitoring.htpasswd;
    format = "binary";
    owner = config.services.nginx.user;
  };
  # Secret that is passed to alertmanager as its environment file.
  sops.secrets."alertmanager/env" = {
    sopsFile = ../../secrets/all/secrets.yaml;
    format = "yaml";
  };

  # Expose prometheus and alertmanager below this host's FQDN, each one behind
  # HTTP basic auth backed by the sops-managed htpasswd file.
  services.nginx.virtualHosts."${fqdn}" = let
    promCfg = config.services.prometheus;
    # Location that proxies to a service listening on localhost at `port`.
    protectedProxy = port: {
      basicAuthFile = config.sops.secrets."monitoring.htpasswd".path;
      proxyPass = "http://127.0.0.1:${toString port}/";
    };
  in {
    forceSSL = true;
    enableACME = true;
    locations = {
      "/prometheus/" = protectedProxy promCfg.port;
      "/alertmanager/" = protectedProxy promCfg.alertmanager.port;
    };
  };

  # Node exporter of this host, with the systemd collector enabled explicitly.
  services.prometheus.exporters.node = let
    # These collectors either don't apply to us or would only provide metrics
    # that are not useful to us.
    unwantedCollectors = [
      "arp"
      "bcache"
      "bonding"
      "btrfs"
      "cpufreq"
      "edac"
      "entropy"
      "infiniband"
      "rapl"
      "selinux"
      "timex"
    ];
  in {
    enable = true;
    enabledCollectors = ["systemd"];
    disabledCollectors = unwantedCollectors;
  };

  # Prometheus server of this host. The value is kept as a plain attribute set
  # literal because other `services.prometheus.*` definitions in this file are
  # merged into it.
  services.prometheus = {
    enable = true;
    retentionTime = "30d";
    webExternalUrl = "https://${fqdn}/prometheus/";
    ruleFiles = [./rules.yaml];
    extraFlags = [
      "--web.route-prefix=\"/\""
      "--web.enable-admin-api"
    ];

    # Alerts are only delivered to the alertmanager of this host.
    alertmanagers = [
      {static_configs = [{targets = [(monTarget alertmanagerPath config)];}];}
    ];

    scrapeConfigs = let
      nodeExporterPath = ["services" "prometheus" "exporters" "node"];
      # One scrape job: static targets, the shared instance relabelling and the
      # common metric drop rules extended by `extraDrops`.
      mkScrapeJob = name: targets: extraDrops: {
        job_name = name;
        static_configs = [{inherit targets;}];
        relabel_configs = [relabelInstance];
        metric_relabel_configs = dropMetrics {} extraDrops;
      };
    in [
      # Only our own node-exporter is scraped.
      (mkScrapeJob "node" [(monTarget nodeExporterPath config)] [])
      (mkScrapeJob "alertmanager" (targetAllHosts alertmanagerPath) [
        "alertmanager_http_(response_size_bytes|request_duration_seconds)_"
        "alertmanager_notification_latency_seconds_"
        "alertmanager_(nflog|cluster)_"
        "alertmanager_silences_(query_duration_seconds|gc)_"
      ])
      (mkScrapeJob "prometheus" (targetAllHosts prometheusPath) [
        "prometheus_(sd|tsdb|target)_"
        "prometheus_(engine_query|rule_evaluation)_duration_"
        "prometheus_http_(response_size_bytes|request_duration_seconds)_"
        "net_conntrack_dialer_conn_"
      ])
    ];
  };

  # Alertmanager of this host: mails every alert to the admins, with repeat
  # intervals depending on severity and inhibition of the weaker memory/load alerts.
  services.prometheus.alertmanager = {
    enable = true;
    extraFlags = ["--web.route-prefix=\"/\"" "--cluster.listen-address="];
    webExternalUrl = "https://${fqdn}/alertmanager/";
    # Presumably provides the SMTP_HOST / SMTP_USER / SMTP_PASS variables
    # referenced below - verify against the secret's contents.
    environmentFile = config.sops.secrets."alertmanager/env".path;

    configuration = {
      global = {
        smtp_from = "Chaos-Jetzt Monitoring (${hostName}) <monitoring-${hostName}@chaos.jetzt>";
        # `\${...}` keeps the placeholder literal in the generated configuration.
        smtp_smarthost = "\${SMTP_HOST}:587";
        smtp_auth_username = "\${SMTP_USER}";
        smtp_auth_password = "\${SMTP_PASS}";
        smtp_hello = config.networking.fqdn;
      };

      receivers = [{
        name = "mail";
        email_configs = [
          { to = "jetzt+mon@e1mo.de";
            send_resolved = true; }
          { to = "info@adb.sh";
            send_resolved = true; }
        ];
      }];

      route = {
        receiver = "mail";
        repeat_interval = "16h";
        group_wait = "1m";
        group_by = ["alertname" "instance"];
        # All sub-routes deliver to the same receiver and only differ in how often
        # an alert is repeated. The label used to be misspelled as `severiy`, so
        # none of these routes could match and everything fell back to the 16h
        # default above.
        # NOTE(review): assumes the rules in ./rules.yaml set a `severity` label - confirm.
        routes = [
          {
            match.severity = "critical";
            receiver = "mail";
            repeat_interval = "6h";
          }
          {
            match.severity = "error";
            receiver = "mail";
            repeat_interval = "16h";
          }
          {
            match.severity = "warn";
            receiver = "mail";
            repeat_interval = "28h";
          }
          {
            match.severity = "info";
            receiver = "mail";
            repeat_interval = "56h";
          }
        ];
      };

      # A weaker alert is muted while a stronger one is active for the same instance.
      # The `(Very)?` group is optional so that both the plain and the "Very" variant
      # act as a source; without the `?` only the "Very..." alert name matched.
      inhibit_rules = [
        {
          target_matchers = ["alertname = ReducedAvailableMemory"];
          source_matchers = ["alertname =~ (Very)?LowAvailableMemory"];
          equal = ["instance"];
        }
        {
          target_matchers = ["alertname = LowAvailableMemory"];
          source_matchers = ["alertname = VeryLowAvailableMemory"];
          equal = ["instance"];
        }
        {
          target_matchers = ["alertname = ElevatedLoad"];
          source_matchers = ["alertname =~ (Very)?HighLoad"];
          equal = ["instance"];
        }
        {
          target_matchers = ["alertname = HighLoad"];
          source_matchers = ["alertname = VeryHighLoad"];
          equal = ["instance"];
        }
      ];
    };
  };
}
services/monitoring: Setup The goal is to create a monitoring setup where each server monitors itself when it comes failing systemd services, disk or RAM filling up, …. In addition each prometheus will monitor remote prometheus and alertmanager instances for signs of failure (e.g. being unreachable, errors in notification delivery, dropping alerts). A lot of metrics (especially histograms from prometheus or alertmanager) are being dropped before ingestion to disk save on space and memory. Depending on how many servers we may or may not have in the future this could probably use some kind of overhaul since we rightnow have n^2 monitoring peer relationships (not even speaking of possible duplicated alerts). 2022-12-31 22:40:07 +01:00			`{`
			`config,`
			`lib,`
			`outputs,`
			`...`
			`}: let`
			`inherit (lib) concatStringsSep mapAttrsToList hasAttrByPath getAttrFromPath filterAttrs substring singleton optionalString optional;`
			`inherit (lib) escapeRegex;`
			`inherit (config.networking) fqdn hostName;`

			`# Basically a manual list of (legacy) hosts not yet migrated to NixOS`
			`# but on which we'd like to have included in the monitoring.`
			`externalTargets = let`
			`host = hostName: {`
			`_module.args.baseDomain = "chaos.jetzt";`
			`config = {`
			`networking = {`
			`inherit hostName;`
			`domain = "net.chaos.jetzt";`
			`};`
			`services.prometheus = {`
			`enable = true;`
			`port = 9090;`
			`alertmanager = {`
			`enable = true;`
			`port = 9093;`
			`};`
			`};`
			`};`
			`};`
			`in {`
			`hopper = host "hopper";`
			`lovelace = host "lovelace";`
			`};`

			`monDomain = "mon.${config.networking.domain}";`

			`isMe = host: host.config.networking.fqdn == fqdn;`
			`others = filterAttrs (_: !isMe) outputs.nixosConfigurations;`
			`isDev = host: (substring 0 3 host._module.args.baseDomain) == "dev";`
			`allHosts = outputs.nixosConfigurations // externalTargets;`
			`/*`
			`Right now we only have one non-dev host in our NixOS setup (the ansible hosts don't monitor the NixOS hosts).`
			`That's why we currently add all hosts to our little monitoring "cluster". As soon as we have two or more production hosts,`
			`the dev host can be taken out of the equation`
			`*/`
			`# allTargets = filterAttrs (_: c: (isMe c) \|\| !(isDev c)) allHosts;`
			`allTargets = allHosts;`

			`# monFqdn = config: "${config.networking.hostName}.${monDomain}";`
			`hasEnabled = servicePath: config: let`
			`path = servicePath ++ ["enable"];`
			`in`
			`(hasAttrByPath path config) && (getAttrFromPath path config);`

			`monTarget = servicePath: config: let`
			`port = toString (getAttrFromPath (servicePath ++ ["port"]) config);`
			`in "${config.networking.hostName}.${monDomain}:${port}";`

			`dropMetrics = {wildcard ? true}: extraRegexen: let`
			`dropRegexen = [ "go_" "promhttp_metric_handler_requests_" ] ++ extraRegexen;`
			`in`
			`singleton {`
			`inherit (regex);`
			`regex = "(${concatStringsSep "\|" dropRegexen})${optionalString wildcard ".*"}";`
			`source_labels = ["__name__"];`
			`action = "drop";`
			`};`

			`relabelInstance = {`
			`source_labels = ["__address__"];`
			`regex = "(\\w+)\\.${escapeRegex monDomain}\\:\\d*";`
			`target_label = "instance";`
			`};`

			`prometheusPath = ["services" "prometheus"];`
			`alertmanagerPath = ["services" "prometheus" "alertmanager"];`
			`targetAllHosts = servicePath:`
			`mapAttrsToList`
			`(_: config: monTarget servicePath config.config)`
			`(filterAttrs (_: c: (hasEnabled servicePath c.config)) (outputs.nixosConfigurations // externalTargets));`
			`in {`
			`/*`
			`Steps to edit the monitoring.htpasswd (aka. adding yourself / updating you password):`

Add adb and admin htpasswd user Also updated instructions for editing the .htpasswd 2023-01-03 16:07:00 +01:00			1. Use `htpasswd` (from the `apacheHttpd` package) to generate the hashed password
			2. `sops secrets/all/monitoring.htpasswd` and replace/add the specfic lines
services/monitoring: Setup The goal is to create a monitoring setup where each server monitors itself when it comes failing systemd services, disk or RAM filling up, …. In addition each prometheus will monitor remote prometheus and alertmanager instances for signs of failure (e.g. being unreachable, errors in notification delivery, dropping alerts). A lot of metrics (especially histograms from prometheus or alertmanager) are being dropped before ingestion to disk save on space and memory. Depending on how many servers we may or may not have in the future this could probably use some kind of overhaul since we rightnow have n^2 monitoring peer relationships (not even speaking of possible duplicated alerts). 2022-12-31 22:40:07 +01:00			`*/`
			`sops.secrets = {`
			`"monitoring.htpasswd" = {`
			`format = "binary";`
			`owner = config.services.nginx.user;`
			`sopsFile = ../../secrets/all/monitoring.htpasswd;`
			`};`
			`"alertmanager/env" = {`
			`format = "yaml";`
			`sopsFile = ../../secrets/all/secrets.yaml;`
			`};`
			`};`

			`services.nginx.virtualHosts."${fqdn}" = let`
			`monitoring_htpasswd = config.sops.secrets."monitoring.htpasswd".path;`
			`in {`
			`enableACME = true;`
			`forceSSL = true;`
			`locations."/prometheus/" = {`
			`basicAuthFile = monitoring_htpasswd;`
			`proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.port}/";`
			`};`
			`locations."/alertmanager/" = {`
			`basicAuthFile = monitoring_htpasswd;`
			`proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.alertmanager.port}/";`
			`};`
			`};`

			`services.prometheus.exporters.node = {`
			`enable = true;`
			`enabledCollectors = ["systemd"];`
			`# They either don't apply to us or will provide us with metrics not usefull to us`
			`disabledCollectors = [`
			`"arp"`
			`"bcache"`
			`"bonding"`
			`"btrfs"`
			`"cpufreq"`
			`"edac"`
			`"entropy"`
			`"infiniband"`
			`"rapl"`
			`"selinux"`
			`"timex"`
			`];`
			`};`

			`services.prometheus = {`
			`enable = true;`
			`webExternalUrl = "https://${fqdn}/prometheus/";`
			`extraFlags = [`
			`"--web.route-prefix=\"/\""`
			`"--web.enable-admin-api"`
			`];`
			`ruleFiles = [`
			`./rules.yaml`
			`];`
			`retentionTime = "30d";`

			`alertmanagers = [{`
			`static_configs = [{`
			`targets = [(monTarget alertmanagerPath config)];`
			`}];`
			`}];`

			`scrapeConfigs = [`
			`{`
			`job_name = "node";`
			`static_configs = [{`
			`targets = [`
			`# Only scraping to own node-exporter`
			`(monTarget ["services" "prometheus" "exporters" "node"] config)`
			`];`
			`}];`
			`relabel_configs = [relabelInstance];`
			`metric_relabel_configs = dropMetrics {} [];`
			`}`
			`{`
			`job_name = "alertmanager";`
			`static_configs = [{`
			`targets = targetAllHosts alertmanagerPath;`
			`}];`
			`relabel_configs = [relabelInstance];`
			`metric_relabel_configs = dropMetrics {} [`
			`"alertmanager_http_(response_size_bytes\|request_duration_seconds)_"`
			`"alertmanager_notification_latency_seconds_"`
			`"alertmanager_(nflog\|cluster)_"`
			`"alertmanager_silences_(query_duration_seconds\|gc)_"`
			`];`
			`}`
			`{`
			`job_name = "prometheus";`
			`static_configs = [{`
			`targets = targetAllHosts prometheusPath;`
			`}];`
			`relabel_configs = [relabelInstance];`
			`metric_relabel_configs = dropMetrics {} [`
			`"prometheus_(sd\|tsdb\|target)_"`
			`"prometheus_(engine_query\|rule_evaluation)_duration_"`
			`"prometheus_http_(response_size_bytes\|request_duration_seconds)_"`
			`"net_conntrack_dialer_conn_"`
			`];`
			`}`
			`];`
			`};`

			`services.prometheus.alertmanager = {`
			`enable = true;`
			`extraFlags = ["--web.route-prefix=\"/\"" "--cluster.listen-address="];`
			`webExternalUrl = "https://${fqdn}/alertmanager/";`
			`environmentFile = config.sops.secrets."alertmanager/env".path;`

			`configuration = {`
			`global = {`
			`smtp_from = "Chaos-Jetzt Monitoring (${hostName}) <monitoring-${hostName}@chaos.jetzt>";`
			`smtp_smarthost = "\${SMTP_HOST}:587";`
			`smtp_auth_username = "\${SMTP_USER}";`
			`smtp_auth_password = "\${SMTP_PASS}";`
			`smtp_hello = config.networking.fqdn;`
			`};`

			`receivers = [{`
			`name = "mail";`
			`email_configs = [`
			`{ to = "jetzt+mon@e1mo.de";`
			`send_resolved = true; }`
			`{ to = "info@adb.sh";`
			`send_resolved = true; }`
			`];`
			`}];`

			`route = {`
			`receiver = "mail";`
			`repeat_interval = "16h";`
			`group_wait = "1m";`
			`group_by = ["alertname" "instance"];`
			`routes = [`
			`{`
			`match.severiy = "critical";`
			`receiver = "mail";`
			`repeat_interval = "6h";`
			`}`
			`{`
			`match.severiy = "error";`
			`receiver = "mail";`
			`repeat_interval = "16h";`
			`}`
			`{`
			`match.severiy = "warn";`
			`receiver = "mail";`
			`repeat_interval = "28h";`
			`}`
			`{`
			`match.severiy = "info";`
			`receiver = "mail";`
			`repeat_interval = "56h";`
			`}`
			`];`
			`};`

			`inhibit_rules = [`
			`{`
			`target_matchers = ["alertname = ReducedAvailableMemory"];`
			`source_matchers = ["alertname =~ (Very)LowAvailableMemory"];`
			`equal = ["instance"];`
			`}`
			`{`
			`target_matchers = ["alertname = LowAvailableMemory"];`
			`source_matchers = ["alertname = VeryLowAvailableMemory"];`
			`equal = ["instance"];`
			`}`
			`{`
			`target_matchers = ["alertname = ElevatedLoad"];`
			`source_matchers = ["alertname =~ (Very)HighLoad"];`
			`equal = ["instance"];`
			`}`
			`{`
			`target_matchers = ["alertname = HighLoad"];`
			`source_matchers = ["alertname = VeryHighLoad"];`
			`equal = ["instance"];`
			`}`
			`];`
			`};`
			`};`
			`}`