Otherwise, building hosts that don't have any (nginx-using) services configured will fail.
{
  config,
  lib,
  outputs,
  ...
}: let
  inherit (lib) concatStringsSep mapAttrsToList getAttrFromPath filterAttrs singleton optional;
  inherit (lib) escapeRegex;
  inherit (config.networking) fqdn hostName;

  # Basically a manual list of (legacy) hosts not yet migrated to NixOS
  # but which we'd like to include in the monitoring.
  externalTargets = let
    host = hostName: {
      _module.args = {
        isDev = false;
        baseDomain = "chaos.jetzt";
      };
      config = {
        networking = rec {
          inherit hostName;
          domain = "net.chaos.jetzt";
          fqdn = "${hostName}.${domain}";
        };
        services.prometheus = {
          enable = true;
          port = 9090;
          alertmanager = {
            enable = true;
            port = 9093;
          };
        };
      };
    };
  in {
    hopper = host "hopper";
    lovelace = host "lovelace";
  };

  monDomain = "mon.${config.networking.domain}";

  # deadnix: skip # Will be used as soon as we have two non-dev hosts
  isMe = host: host.config.networking.fqdn == fqdn;
  # deadnix: skip # Will be used as soon as we have two non-dev hosts
  isDev_ = getAttrFromPath [ "_module" "args" "isDev" ];
  allHosts = outputs.nixosConfigurations // externalTargets;
  /*
  Right now we only have one non-dev host in our NixOS setup (the ansible hosts don't monitor the NixOS hosts).
  That's why we currently add all hosts to our little monitoring "cluster". As soon as we have two or more production hosts,
  the dev host can be taken out of the equation.
  */
  # allTargets = filterAttrs (_: c: (isMe c) || !(isDev_ c)) allHosts;
  allTargets = allHosts;

  monTarget = service: config: "${config.networking.hostName}.${monDomain}:${toString service.port}";
  targetAllHosts = servicePath: let
    service = cfg: getAttrFromPath servicePath cfg.config;
  in
    mapAttrsToList
    (_: c: monTarget (service c) c.config)
    (filterAttrs (_: c: (service c).enable or false) allTargets);

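  # For illustration, assuming this host's networking.domain is also net.chaos.jetzt (like the
  # external targets above): `targetAllHosts prometheusPath` then yields entries such as
  # "hopper.mon.net.chaos.jetzt:9090" for every host with services.prometheus.enable = true.
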
  dropMetrics = extraRegexen: let
    dropRegexen = [ "go_" "promhttp_metric_handler_requests_" ] ++ extraRegexen;
  in
    singleton {
      regex = "(${concatStringsSep "|" dropRegexen}).*";
      source_labels = ["__name__"];
      action = "drop";
    };

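  # For illustration: `dropMetrics []` evaluates to a single relabel rule along the lines of
  #   { action = "drop"; source_labels = ["__name__"]; regex = "(go_|promhttp_metric_handler_requests_).*"; }
  # i.e. all Go runtime and promhttp handler metrics are discarded at scrape time.
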
  relabelInstance = {
    source_labels = ["__address__"];
    regex = "(\\w+)\\.${escapeRegex monDomain}\\:\\d*";
    target_label = "instance";
  };

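  # For illustration, assuming monDomain evaluates to "mon.net.chaos.jetzt": a scraped address
  # like "hopper.mon.net.chaos.jetzt:9100" gets its instance label rewritten to just "hopper"
  # (the default replacement "$1" keeps only the first capture group).
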
  prometheusPath = ["services" "prometheus"];
  alertmanagerPath = ["services" "prometheus" "alertmanager"];
in {
  /*
  Steps to edit the monitoring.htpasswd (i.e. adding yourself / updating your password):

  1. Use `htpasswd` (from the `apacheHttpd` package) to generate the hashed password
  2. `sops secrets/all/monitoring.htpasswd` and replace/add the specific lines
  */
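  # A sketch of step 1, assuming flakes are enabled and using "alice" as a placeholder user
  # (-B selects bcrypt, -n prints the resulting "user:hash" line to stdout instead of writing a file):
  #   nix shell nixpkgs#apacheHttpd --command htpasswd -nB alice
  # The printed line is what gets pasted into the secret opened in step 2.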
  sops.secrets = {
    "monitoring.htpasswd" = {
      format = "binary";
      owner = config.services.nginx.user;
      sopsFile = ../../secrets/all/monitoring.htpasswd;
    };
    "alertmanager/env" = {
      format = "yaml";
      sopsFile = ../../secrets/all/secrets.yaml;
    };
  };

  services.nginx.enable = lib.mkDefault true;
  services.nginx.virtualHosts."${fqdn}" = let
    monitoring_htpasswd = config.sops.secrets."monitoring.htpasswd".path;
  in {
    enableACME = true;
    forceSSL = true;
    locations."/prometheus/" = {
      basicAuthFile = monitoring_htpasswd;
      proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.port}/";
    };
    locations."/alertmanager/" = {
      basicAuthFile = monitoring_htpasswd;
      proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.alertmanager.port}/";
    };
  };

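  # A quick sanity check after deployment (sketch; <user> is whatever is in monitoring.htpasswd,
  # /-/healthy is Prometheus' built-in health endpoint, reached here through the nginx proxy):
  #   curl -u <user> https://<this host's fqdn>/prometheus/-/healthy
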
  services.prometheus.exporters.node = {
    enable = true;
    enabledCollectors = ["systemd"];
    # They either don't apply to us or would provide metrics that aren't useful to us
    disabledCollectors = [
      "arp"
      "bcache"
      "bonding"
      "btrfs"
      "cpufreq"
      "edac"
      "entropy"
      "infiniband"
      "rapl"
      "selinux"
      "timex"
    ];
  };

  networking.firewall.interfaces.ens10.allowedTCPPorts = let
    inherit (config.services) prometheus;
    ifEnabled = x: lib.optional x.enable x.port;
  in (
    (ifEnabled prometheus)
    ++ (ifEnabled prometheus.alertmanager)
    ++ (ifEnabled prometheus.exporters.node)
  );

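  # For illustration: with Prometheus, Alertmanager and the node exporter all enabled on the
  # ports configured here (the node exporter presumably at the NixOS default of 9100), this
  # opens TCP ports 9090, 9093 and 9100 on the ens10 interface only.
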
  services.prometheus = {
    enable = true;
    webExternalUrl = "https://${fqdn}/prometheus/";
    extraFlags = [
      "--web.route-prefix=\"/\""
      "--web.enable-admin-api"
    ];
    ruleFiles = [
      ./rules.yaml
    ];
    retentionTime = "30d";

    alertmanagers = [{
      static_configs = [{
        targets = [(monTarget config.services.prometheus.alertmanager config)];
      }];
    }];

    scrapeConfigs = [
      {
        job_name = "node";
        static_configs = [{
          targets = [
            # Only scraping our own node exporter
            (monTarget config.services.prometheus.exporters.node config)
          ];
        }];
        relabel_configs = [relabelInstance];
        metric_relabel_configs = dropMetrics [];
      }
      {
        job_name = "alertmanager";
        static_configs = [{
          targets = targetAllHosts alertmanagerPath;
        }];
        relabel_configs = [relabelInstance];
        metric_relabel_configs = dropMetrics [
          "alertmanager_http_(response_size_bytes|request_duration_seconds)_"
          "alertmanager_notification_latency_seconds_"
          "alertmanager_(nflog|cluster)_"
          "alertmanager_silences_(query_duration_seconds|gc)_"
        ];
      }
      {
        job_name = "prometheus";
        static_configs = [{
          targets = targetAllHosts prometheusPath;
        }];
        relabel_configs = [relabelInstance];
        metric_relabel_configs = dropMetrics [
          "prometheus_(sd|tsdb|target)_"
          "prometheus_(engine_query|rule_evaluation)_duration_"
          "prometheus_http_(response_size_bytes|request_duration_seconds)_"
          "net_conntrack_dialer_conn_"
        ];
      }
    ];
  };

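  # Rule changes in ./rules.yaml can be checked locally before deploying, e.g. via promtool
  # from the prometheus package (sketch, assuming flakes are enabled):
  #   nix shell nixpkgs#prometheus --command promtool check rules rules.yaml
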
  services.prometheus.alertmanager = {
    enable = true;
    extraFlags = ["--web.route-prefix=\"/\"" "--cluster.listen-address="];
    webExternalUrl = "https://${fqdn}/alertmanager/";
    environmentFile = config.sops.secrets."alertmanager/env".path;

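    # The "\${...}" escapes below keep Nix from interpolating, so the literal placeholders
    # reach the generated Alertmanager config and are substituted from the environmentFile
    # above when the service starts. A sketch of what that sops-managed env file is assumed
    # to contain (names match the placeholders; values are illustrative only):
    #   SMTP_HOST=mail.example.org
    #   SMTP_USER=monitoring
    #   SMTP_PASS=...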
    configuration = {
      global = {
        smtp_from = "Chaos-Jetzt Monitoring (${hostName}) <monitoring-${hostName}@chaos.jetzt>";
        smtp_smarthost = "\${SMTP_HOST}:587";
        smtp_auth_username = "\${SMTP_USER}";
        smtp_auth_password = "\${SMTP_PASS}";
        smtp_hello = config.networking.fqdn;
      };

      receivers = [{
        name = "mail";
        email_configs = [
          {
            to = "jetzt+mon@e1mo.de";
            send_resolved = true;
          }
          {
            to = "admin+mon@adb.sh";
            send_resolved = true;
          }
        ];
      }];

      route = {
        receiver = "mail";
        repeat_interval = "16h";
        group_wait = "1m";
        group_by = ["alertname" "instance"];
        routes = [
          {
            match.severity = "critical";
            receiver = "mail";
            repeat_interval = "6h";
          }
          {
            match.severity = "error";
            receiver = "mail";
            repeat_interval = "16h";
          }
          {
            match.severity = "warn";
            receiver = "mail";
            repeat_interval = "28h";
          }
          {
            match.severity = "info";
            receiver = "mail";
            repeat_interval = "56h";
          }
        ];
      };

      inhibit_rules = [
        {
          target_matchers = ["alertname = ReducedAvailableMemory"];
          source_matchers = ["alertname =~ (Very)?LowAvailableMemory"];
          equal = ["instance"];
        }
        {
          target_matchers = ["alertname = LowAvailableMemory"];
          source_matchers = ["alertname = VeryLowAvailableMemory"];
          equal = ["instance"];
        }
        {
          target_matchers = ["alertname = ElevatedLoad"];
          source_matchers = ["alertname =~ (Very)?HighLoad"];
          equal = ["instance"];
        }
        {
          target_matchers = ["alertname = HighLoad"];
          source_matchers = ["alertname = VeryHighLoad"];
          equal = ["instance"];
        }
      ];
    };
  };
}