services/monitoring: Setup

The goal is to create a monitoring setup where each server monitors
itself when it comes to failing systemd services, disk or RAM filling
up, and so on. In addition, each Prometheus will monitor the remote
Prometheus and Alertmanager instances for signs of failure (e.g. being
unreachable, errors in notification delivery, dropped alerts).

A lot of metrics (especially histograms from Prometheus and
Alertmanager) are dropped before ingestion in order to save disk space
and memory.
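
For illustration, the dropping amounts to a Prometheus metric_relabel_configs entry along these lines (a sketch; the module below generates the regex from a prefix list via its dropMetrics helper, and the prefixes shown are examples from that helper's defaults):

  # Sketch: drop every series whose name starts with one of the prefixes
  # before it is written to the TSDB.
  metric_relabel_configs = [
    {
      source_labels = ["__name__"];
      regex = "(go_|promhttp_metric_handler_requests_).*";
      action = "drop";
    }
  ];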

Depending on how many servers we end up with in the future, this could
probably use some kind of overhaul, since right now we have n^2
monitoring peer relationships (not to speak of possibly duplicated
alerts).
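
To make the n^2 concrete: every host's Prometheus scrapes the Prometheus and Alertmanager of every host, its own included, so n hosts produce roughly n * 2n scrape relationships. A minimal sketch (the third host name and the mon.net.chaos.jetzt domain are made-up examples following the monDomain pattern used below):

  # Sketch only: three hosts, each Prometheus scraping 2 * 3 = 6 targets,
  # i.e. 18 scrape relationships in total. "turing" is a made-up host.
  let
    hosts = [ "hopper" "lovelace" "turing" ];
  in
    map (scraper: {
      inherit scraper;
      prometheusTargets = map (h: "${h}.mon.net.chaos.jetzt:9090") hosts;
      alertmanagerTargets = map (h: "${h}.mon.net.chaos.jetzt:9093") hosts;
    }) hosts
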
Moritz 'e1mo' Fromm 2022-12-31 22:40:07 +01:00
parent 6e9c9a26cd
commit 3acc1865c0
GPG key ID: 1D5D79A439E787F1
4 changed files with 457 additions and 2 deletions


@@ -0,0 +1,40 @@
{
"data": "ENC[AES256_GCM,data:/qJNDvVv7ap7CiCHbxtPEs35RepVZ/jSWeqaKtTMu4ePvZbpX+BDMDPztg==,iv:8p1gnTObH883P/h7QLtCYVbCWvhi6nKhkWNlvazy1CM=,tag:ha+5Mniu0On1TCAGTSCf3A==,type:str]",
"sops": {
"kms": null,
"gcp_kms": null,
"azure_kv": null,
"hc_vault": null,
"age": [
{
"recipient": "age14ysl953378r2vvy7ft3gwce9xp83pr6wypf5lgx2yjwx2lxra5qs6j8eqe",
"enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBJM21QS0VGNXNZV0NJSVU0\nYUxZaEpMQ0hQNkZPNFNzUjFOM3V2c1ZkaUY4CjdqY2tGaXFOL2liTlBxSitzSkx5\nQmMrTW5EYWZ3RWY0dEZ6NldVSkhBN28KLS0tIDEzamtzdEFQam5ZU3pJY1p3Lzh5\nY2wwdmpIWktWMEwyb09mVDh4Ujd2TUEKiIxFNC5fk38+GvBPIP0RygwA4uw49Uht\nfabstjYfQACI5Auc2IUL7nWPqVTT61obbogbGPfgRkOU/lnU7lhvtw==\n-----END AGE ENCRYPTED FILE-----\n"
},
{
"recipient": "age1w3wqxt5t00hjv43dcxlr5rjec5mvuzz9ajc8k04azq0gfx0ncgysu6mdmm",
"enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBIQzUraTdVYkg3THQzSXlS\neEtqSUNBMEIvSDZQNWVhNXdBaklPenk4aFhRCkU2UDE5UWZSdEFGV0tPYmxteUlh\nd2hZNXdXYSthK3dmalNrdi9MdE9ybm8KLS0tIFRTc3g2RFR3VWtyaDJBb2UvMWU3\nVkw5QW03djdyd2tCUndNNlpWcW1iUDgKF01T2t0pqk6O7gQh9xYn+KqtLXZRr3v3\nIjhMh46UOrm5OYSDqkKKFEU6Tz6PmHii9ClcC9ejClg+5lBaavlAqg==\n-----END AGE ENCRYPTED FILE-----\n"
}
],
"lastmodified": "2022-12-30T14:49:56Z",
"mac": "ENC[AES256_GCM,data:vCOGAJDFg1HH2+JwWqKWaSUAq0IS1voaTphRQHWPjXzgJ2eeDbXQQPEkp9Rk5Q5/zfPMdGgw8oR1OjWAv+KcaE/+t8PKPOd1B6a50VkFtgHYuHbunRai3RLMfexZMPfC7QvO5TLLNM4/OzrDsdVozPecj4LXpMrrC9rpacBCnWc=,iv:UtQvqNXzMC/kb0l0sG16PCZSIfacO7jEa88x5Y7eDS4=,tag:yXa2ohlBnh+kqJzstUYIWQ==,type:str]",
"pgp": [
{
"created_at": "2022-12-30T14:49:56Z",
"enc": "-----BEGIN PGP MESSAGE-----\n\nhF4DOnsoj685gdcSAQdA7WmJAs7/hgq93M3SphsDJv9m/oOLhf3ftXyngoj4MVkw\n4xKE1ZCN4LErd+/enCdGLeCkzxIFL7lPc1Q1BaCJZwAZjQIhwUFtK2ORWeXK+y65\n0l4Bon20voZRVNjqiU5XuzWTNUQOStiq30qf5MdCYTfyXQI7fpwYsIWoL3T3en41\nNsIHoDYmXgfoIGDHWmkFEW7YRy04/9wUvWnX67eQIEB9WPvX0s1nUnq0t/x7BVem\n=4Zp6\n-----END PGP MESSAGE-----\n",
"fp": "0x6D617FD0A85BAADA"
},
{
"created_at": "2022-12-30T14:49:56Z",
"enc": "-----BEGIN PGP MESSAGE-----\n\nhF4D6iFd6webPCUSAQdAk5UEGAJw64qu9tVkeaGGDkX4+9awdb3APedVb5EZPXIw\n46V4iFaF6U6mry2BqbPOzBZxE0AzQtmVR/H3SfPlrqJYak98lhYrRMfMjTs+7DAy\n0l4B4WIodnYaPhkhXuoq2gglCCyQ8FN3yUoDfYcIRgmmBtuwp+t3rPe4pJpDkf1t\nNyHhuKWRKtLaYc9YJMq6smpCICFFYV8RohVUmK6ucWGHJVC5wTX5+Cz/UZv+qpwB\n=tQa7\n-----END PGP MESSAGE-----\n",
"fp": "0xE0262A773B824745"
},
{
"created_at": "2022-12-30T14:49:56Z",
"enc": "-----BEGIN PGP MESSAGE-----\n\nhQIMAxFMPvz46t7rAQ/9FjFVuDmr8uuPAGozef2avvej0G8XfJsKHtuVCCUK9IuC\n+dyGQ+stw5bj5BgwepACk7eeFkt+Dr0Xqmit5Q+HrnkEdKt5l5SJXiBkqsU7PwYP\neKL7LmqTQ6KdWhzu1zcL7jdQeKGJagvgaoT1t5Os8sd8Jl4SvzLWNQdVb4RkbU6p\nnBLAHTlTlH5f8MA/nkGO/ku3zs1Hfjg4laI2rWvbqSv7YouJxU2bpZ5KdGlyNMD7\nnjY0joe0EIm/CkY8s5FcOyzu299Q23fYsMKtHDs7fP7MgD7OriKh9Q3s443B++Fg\nK70Ngz2aZNs5liWG+ce+zrSL6QE2+zVtYKb116kMfFFPlWUxZTa0UcHRWJ5mqQV7\nPgum2yMFE6XYpsqlv2k1fnsSAagU8VceddYw+BZwfFJtIXhWIubvXpa3FV3xHdtD\nqJyuA+hE8rwUM6OPiwLBQmaqkywUhNjS7kEhdxTfEvmHWWQKmqqEC/27rooK2bLX\nPWNmkm3XYZeoNXB4aJY16I+lv9cuWr4zFwNF1GFx2zg3hkLU6v8zqN4NuFxa0l50\nTfTZTGYak+whMtdIbIehbjKnZ7nuKrD3agsvGsH4vd02oCpucBjz5xEhDDuoSWFb\n3QqWiPqyfTGWAuo5Xb1S4JyXkLnt8KfXtlgfYlMfW3YmbB9LcPI3e1Hg6c7YwFzS\nXgFBDmD+OY+CFYMsewrke5324Jco+CDEm6G/VKdJg4YS8RDA3J3/CT0zel8Th63J\nOGfKr8+/ilgNqFXv5uxyW6JuIrMcsBRtmosq56lCDlxp0SaLmeVpX71r3opcUWo=\n=a73T\n-----END PGP MESSAGE-----\n",
"fp": "B1480CFF9BBE8E2648A26A640B2E7C171E3AD6D7"
}
],
"unencrypted_suffix": "_unencrypted",
"version": "3.7.3"
}
}


@@ -4,6 +4,8 @@ smtp:
user: ENC[AES256_GCM,data:cyxh+oTM7MdF31+umMplmA==,iv:maKuY4iXQCg10O8TTenBUl6tMJKt54AycfKQW6sheVw=,tag:4DAUiBBgO5H9O7DwLYZJtQ==,type:str]
pass: ENC[AES256_GCM,data:CkpgvLnQGBzShLkDaHtOHgZ9zwlsEJ+9KFBmHLgBol9Pcadww040clNM0WJgz8NfDIoYF8M=,iv:OIxuwC/XQOBBKdkRb6/5X8pNPLJNKAEI/WCYKhFScPw=,tag:CoAFk7n5AeSue18OXZvjKA==,type:str]
host: ENC[AES256_GCM,data:psXf+lg9433jS1GxH6HQQQ==,iv:wyq/VgWLNs4RbzxNHf4Ave8ZDhQeNPpYXJIdjLLMlr4=,tag:pQ4Wmf+PzSBsHO3eigx91Q==,type:str]
alertmanager:
env: ENC[AES256_GCM,data:eYytFrXgsm4NCyS5F3oVNKsXwf5RL+zEFomMszP4fzb0HkLBk659uWZmqi7kIe8ptyUqqr6oAIV31alFo2TwpWuImvh4gbmaD0sfzesBmqhXyeA3cgi3EMGfMxsdLgUpHKGuJhgrRGLY9jDRI6VIwuLng1YmRw==,iv:YQYwC+Xef9I5IdUsBlTg3g7yRxWJe5Pi1LCur9g9/8w=,tag:U/A2lYu79urWjBFraQh16w==,type:str]
sops:
kms: []
gcp_kms: []
@@ -28,8 +30,8 @@ sops:
UXludFJ5UDEvN2pHMEQweWIvN0NYaFEKB7lVKrsB3eX77iKWwFAXp7LVl+fPcGOl
8CkIBa/rkSWMe0xIetew60wIwx2ZVAv5TTDmKIZyQSTayOpbG3zcdg==
-----END AGE ENCRYPTED FILE-----
lastmodified: "2022-12-24T21:21:02Z"
mac: ENC[AES256_GCM,data:GbEO+lLUeswXR6Dc9kGlR9YJhXWh7Io6Z/qciolwkrzIwZAA6l6kaWCdw6OJ6gSNBGhWNye+VDdDV2Ow1Ix1hjnZdl3MUlY/LEEmzP1uUjPvNnEjZjxC8ebzNg3rgVqT6W6q9S7D9nPmAzJMgMlzypR/b1ZX5QuzodhotsleVNM=,iv:TJX940f70Vka+hsfDSvm7Jj0klFylRwB+9XbA9pFkRo=,tag:uaJi+5UI/bppLw+hOgyiDg==,type:str]
lastmodified: "2022-12-30T22:26:42Z"
mac: ENC[AES256_GCM,data:KcJVntK3kAMH64ZHYoJmDCA7BroYJZhmMvtgbnjI44h5cwT9TyDOso2HO/x2z5XzTfPQfbW3tfhT1UmbaBHxBTkV7gJGU9fJ1xOrCRDoHuI9RTwGnCHKwQPLIWj56NMiC2CMf00pFpUlAPyDP/VHwRsc/laUPne2Da1MKZp/NOk=,iv:iHONd6xTjfVZNFELQVafAiljWYavu1DOZrS+d7pd8Zw=,tag:1Zh32h2OEtqKPoXy+8pypw==,type:str]
pgp:
- created_at: "2022-12-23T19:08:38Z"
enc: |


@@ -0,0 +1,274 @@
{
config,
lib,
outputs,
...
}: let
inherit (lib) concatStringsSep mapAttrsToList hasAttrByPath getAttrFromPath filterAttrs substring singleton optionalString optional;
inherit (lib) escapeRegex;
inherit (config.networking) fqdn hostName;
# Basically a manual list of (legacy) hosts not yet migrated to NixOS
# but which we would like to include in the monitoring.
externalTargets = let
host = hostName: {
_module.args.baseDomain = "chaos.jetzt";
config = {
networking = {
inherit hostName;
domain = "net.chaos.jetzt";
};
services.prometheus = {
enable = true;
port = 9090;
alertmanager = {
enable = true;
port = 9093;
};
};
};
};
in {
hopper = host "hopper";
lovelace = host "lovelace";
};
monDomain = "mon.${config.networking.domain}";
isMe = host: host.config.networking.fqdn == fqdn;
others = filterAttrs (_: host: !(isMe host)) outputs.nixosConfigurations;
isDev = host: (substring 0 3 host._module.args.baseDomain) == "dev";
allHosts = outputs.nixosConfigurations // externalTargets;
/*
Right now we only have one non-dev host in our NixOS setup (the ansible hosts don't monitor the NixOS hosts).
That's why we currently add all hosts to our little monitoring "cluster". As soon as we have two or more production hosts,
the dev host can be taken out of the equation
*/
# allTargets = filterAttrs (_: c: (isMe c) || !(isDev c)) allHosts;
allTargets = allHosts;
# monFqdn = config: "${config.networking.hostName}.${monDomain}";
hasEnabled = servicePath: config: let
path = servicePath ++ ["enable"];
in
(hasAttrByPath path config) && (getAttrFromPath path config);
monTarget = servicePath: config: let
port = toString (getAttrFromPath (servicePath ++ ["port"]) config);
in "${config.networking.hostName}.${monDomain}:${port}";
dropMetrics = {wildcard ? true}: extraRegexen: let
dropRegexen = [ "go_" "promhttp_metric_handler_requests_" ] ++ extraRegexen;
in
singleton {
regex = "(${concatStringsSep "|" dropRegexen})${optionalString wildcard ".*"}";
source_labels = ["__name__"];
action = "drop";
};
relabelInstance = {
source_labels = ["__address__"];
regex = "(\\w+)\\.${escapeRegex monDomain}\\:\\d*";
target_label = "instance";
};
prometheusPath = ["services" "prometheus"];
alertmanagerPath = ["services" "prometheus" "alertmanager"];
targetAllHosts = servicePath:
mapAttrsToList
(_: config: monTarget servicePath config.config)
(filterAttrs (_: c: (hasEnabled servicePath c.config)) allTargets);
in {
/*
Steps to edit the monitoring.htpasswd (i.e. adding yourself / updating your password):
1. `sops -d secrets/all/monitoring.htpasswd > /tmp/monitoring.htpasswd`
2. Use `htpasswd` (from the `apacheHttpd` package) to your heart's content
3. `sops -e /tmp/monitoring.htpasswd > secrets/all/monitoring.htpasswd`
4. `rm /tmp/monitoring.htpasswd`
*/
sops.secrets = {
"monitoring.htpasswd" = {
format = "binary";
owner = config.services.nginx.user;
sopsFile = ../../secrets/all/monitoring.htpasswd;
};
"alertmanager/env" = {
format = "yaml";
sopsFile = ../../secrets/all/secrets.yaml;
};
};
services.nginx.virtualHosts."${fqdn}" = let
monitoring_htpasswd = config.sops.secrets."monitoring.htpasswd".path;
in {
enableACME = true;
forceSSL = true;
locations."/prometheus/" = {
basicAuthFile = monitoring_htpasswd;
proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.port}/";
};
locations."/alertmanager/" = {
basicAuthFile = monitoring_htpasswd;
proxyPass = "http://127.0.0.1:${builtins.toString config.services.prometheus.alertmanager.port}/";
};
};
services.prometheus.exporters.node = {
enable = true;
enabledCollectors = ["systemd"];
# They either don't apply to us or would provide metrics that aren't useful to us
disabledCollectors = [
"arp"
"bcache"
"bonding"
"btrfs"
"cpufreq"
"edac"
"entropy"
"infiniband"
"rapl"
"selinux"
"timex"
];
};
services.prometheus = {
enable = true;
webExternalUrl = "https://${fqdn}/prometheus/";
extraFlags = [
"--web.route-prefix=\"/\""
"--web.enable-admin-api"
];
ruleFiles = [
./rules.yaml
];
retentionTime = "30d";
alertmanagers = [{
static_configs = [{
targets = [(monTarget alertmanagerPath config)];
}];
}];
scrapeConfigs = [
{
job_name = "node";
static_configs = [{
targets = [
# Only scraping our own node-exporter
(monTarget ["services" "prometheus" "exporters" "node"] config)
];
}];
relabel_configs = [relabelInstance];
metric_relabel_configs = dropMetrics {} [];
}
{
job_name = "alertmanager";
static_configs = [{
targets = targetAllHosts alertmanagerPath;
}];
relabel_configs = [relabelInstance];
metric_relabel_configs = dropMetrics {} [
"alertmanager_http_(response_size_bytes|request_duration_seconds)_"
"alertmanager_notification_latency_seconds_"
"alertmanager_(nflog|cluster)_"
"alertmanager_silences_(query_duration_seconds|gc)_"
];
}
{
job_name = "prometheus";
static_configs = [{
targets = targetAllHosts prometheusPath;
}];
relabel_configs = [relabelInstance];
metric_relabel_configs = dropMetrics {} [
"prometheus_(sd|tsdb|target)_"
"prometheus_(engine_query|rule_evaluation)_duration_"
"prometheus_http_(response_size_bytes|request_duration_seconds)_"
"net_conntrack_dialer_conn_"
];
}
];
};
services.prometheus.alertmanager = {
enable = true;
extraFlags = ["--web.route-prefix=\"/\"" "--cluster.listen-address="];
webExternalUrl = "https://${fqdn}/alertmanager/";
environmentFile = config.sops.secrets."alertmanager/env".path;
configuration = {
global = {
smtp_from = "Chaos-Jetzt Monitoring (${hostName}) <monitoring-${hostName}@chaos.jetzt>";
smtp_smarthost = "\${SMTP_HOST}:587";
smtp_auth_username = "\${SMTP_USER}";
smtp_auth_password = "\${SMTP_PASS}";
smtp_hello = config.networking.fqdn;
};
receivers = [{
name = "mail";
email_configs = [
{ to = "jetzt+mon@e1mo.de";
send_resolved = true; }
{ to = "info@adb.sh";
send_resolved = true; }
];
}];
route = {
receiver = "mail";
repeat_interval = "16h";
group_wait = "1m";
group_by = ["alertname" "instance"];
routes = [
{
match.severiy = "critical";
receiver = "mail";
repeat_interval = "6h";
}
{
match.severiy = "error";
receiver = "mail";
repeat_interval = "16h";
}
{
match.severiy = "warn";
receiver = "mail";
repeat_interval = "28h";
}
{
match.severiy = "info";
receiver = "mail";
repeat_interval = "56h";
}
];
};
inhibit_rules = [
{
target_matchers = ["alertname = ReducedAvailableMemory"];
source_matchers = ["alertname =~ (Very)LowAvailableMemory"];
equal = ["instance"];
}
{
target_matchers = ["alertname = LowAvailableMemory"];
source_matchers = ["alertname = VeryLowAvailableMemory"];
equal = ["instance"];
}
{
target_matchers = ["alertname = ElevatedLoad"];
source_matchers = ["alertname =~ (Very)HighLoad"];
equal = ["instance"];
}
{
target_matchers = ["alertname = HighLoad"];
source_matchers = ["alertname = VeryHighLoad"];
equal = ["instance"];
}
];
};
};
}
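
For readers unfamiliar with the helpers at the top of this module, here is roughly what they evaluate to (a sketch, assuming the evaluating host has networking.domain = "net.chaos.jetzt"; the second result is abbreviated):

  # In `nix repl` with the module's let-bindings in scope (illustrative):
  nix-repl> monTarget alertmanagerPath externalTargets.hopper.config
  "hopper.mon.net.chaos.jetzt:9093"

  nix-repl> targetAllHosts prometheusPath
  [ "hopper.mon.net.chaos.jetzt:9090" "lovelace.mon.net.chaos.jetzt:9090" ... ]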


@@ -0,0 +1,139 @@
---
groups:
- name: General system
rules:
- record: node_memory_MemAvailable_percentage
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
- record: node_cpu_count
expr: count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }} down"
- alert: SystemdServiceFailed
expr: node_systemd_unit_state{state="failed"} > 0
for: 5m
labels:
severity: error
annotations:
summary: "{{ $labels.name }} on {{ $labels.instance }} crashed"
- alert: OomKill
expr: increase(node_vmstat_oom_kill[5m]) > 0 # Scrape interval of 1 minute
for: 0m
labels:
severity: error
annotations:
summary: "OOM kill on {{ $labels.instance }}"
- alert: VeryLowAvailableMemory
# Less than 3% available or less than 100 KiB free memory
expr: ((node_memory_MemAvailable_percentage < 3) or (node_memory_MemFree_bytes < (1024 * 100)))
for: 2m
labels:
severity: critical
annotations:
summary: "Very low free memory on {{ $labels.instance }}"
- alert: LowAvailableMemory
expr: node_memory_MemAvailable_percentage < 10
for: 5m
labels:
severity: warn
annotations:
summary: "Low free memory on {{ $labels.instance }}"
- alert: ReducedAvailableMemory
expr: node_memory_MemAvailable_percentage < 20
# Detect constantly high memory usage as a potential sign that the host may need more memory
for: 1h
labels:
severity: info
annotations:
summary: "Reduced available memory on {{ $labels.instance }}"
- alert: HighMemoryPressure
# For at least 5 seconds over the last 2 minutes, no progress could be made
# due to memory congestion
expr: increase(node_pressure_memory_stalled_seconds_total[2m]) >= 5
labels:
severity: error
annotations:
summary: "Memory preassure on {{ $labels.instance }}"
- alert: VeryHighLoad
expr: ((node_load1 / node_cpu_count) > 1) or ((node_load5 / node_cpu_count) > 0.9)
for: 5m
labels:
severity: error
annotations:
summary: "Very High load on {{ $labels.instance }}"
- alert: HighLoad
expr: ((node_load1 / node_cpu_count) > 0.9) or ((node_load5 / node_cpu_count) > 0.8)
for: 10m
labels:
severity: warn
annotations:
summary: "High load on {{ $labels.instance }}"
- alert: ElevatedLoad
# Detecting a long-term increased load
expr: (node_load15 / node_cpu_count) > 0.7
for: 15m
labels:
severity: info
annotations:
summary: "Elevated load15 on {{ $labels.instance }}"
- alert: LowFreeDiskSpace
expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 15
for: 10m
labels:
severity: info
annotations:
summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% space left"
- alert: VeryLowFreeDiskSpace
expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5
for: 3m
labels:
severity: error
annotations:
summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% space left"
- name: Prometheus Alertmanager
rules:
- alert: AlertmanagerMissing
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 5m
labels:
severity: warn
annotations:
summary: "No connected alertmanager on {{ $labels.instance }}"
- alert: DroppedAlertNotifications
expr: increase(prometheus_notifications_dropped_total[5m]) > 0
labels:
severity: error
annotations:
summary: "Dropped alert notifications on {{ $labels.instance }}"
- alert: FailingAlertmanagerNotifications
expr: increase(alertmanager_notifications_failed_total[5m]) > 0
labels:
severity: error
annotations:
summary: "Failing notifications via {{ $labels.integration }} on {{ $labels.instance }}"
- alert: FailingRuleEvaluation
expr: sum by (instance) (increase(prometheus_rule_evaluation_failures_total[5m])) > 0
labels:
severity: warn
annotations:
summary: "Failing rule evaluations on {{ $labels.instance }}"