The goal is a monitoring setup where each server monitors itself for failing systemd services, disks or RAM filling up, and so on. In addition, each Prometheus monitors the remote Prometheus and Alertmanager instances for signs of failure (e.g. being unreachable, errors in notification delivery, dropped alerts). A lot of metrics (especially histograms from Prometheus and Alertmanager) are dropped before ingestion to save disk space and memory. Depending on how many servers we end up with, this could probably use an overhaul at some point, since right now we have n² monitoring peer relationships (not to mention possible duplicated alerts; see the note after the rules file).
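
As a rough illustration of the peer scraping and the pre-ingestion dropping, each prometheus.yml could contain something along these lines (a minimal sketch; the host names and the drop regex are placeholders, not the actual config):

scrape_configs:
  # Scrape the peers' Prometheus and Alertmanager instances.
  - job_name: 'peer-prometheus'
    static_configs:
      - targets: ['peer1.example.org:9090', 'peer2.example.org:9090']
    metric_relabel_configs:
      # Drop histogram buckets before they reach the local TSDB to save
      # disk space and memory.
      - source_labels: [__name__]
        regex: '.+_bucket'
        action: drop
  - job_name: 'peer-alertmanager'
    static_configs:
      - targets: ['peer1.example.org:9093', 'peer2.example.org:9093']

The alerting and recording rules themselves: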
---
groups:
  - name: General system
    rules:
      - record: node_memory_MemAvailable_percentage
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
      - record: node_cpu_count
        expr: count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})

      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} down"

      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} > 0
        for: 5m
        labels:
          severity: error
        annotations:
          summary: "{{ $labels.name }} on {{ $labels.instance }} crashed"

      - alert: OomKill
        expr: increase(node_vmstat_oom_kill[5m]) > 0 # Scrape interval of 1 minute
        for: 0m
        labels:
          severity: error
        annotations:
          summary: "OOM kill on {{ $labels.instance }}"

      - alert: VeryLowAvailableMemory
        # Less than 3% available memory or less than 100 KiB completely free memory
        expr: ((node_memory_MemAvailable_percentage < 3) or (node_memory_MemFree_bytes < (1024 * 100)))
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Very low free memory on {{ $labels.instance }}"

      - alert: LowAvailableMemory
        expr: node_memory_MemAvailable_percentage < 10
        for: 5m
        labels:
          severity: warn
        annotations:
          summary: "Low free memory on {{ $labels.instance }}"

      - alert: ReducedAvailableMemory
        expr: node_memory_MemAvailable_percentage < 20
        # Detect sustained high memory usage as a sign that the host might need more memory
        for: 1h
        labels:
          severity: info
        annotations:
          summary: "Reduced available memory on {{ $labels.instance }}"

      - alert: HighMemoryPressure
        # For at least 5 seconds over the last 2 minutes, no progress could be
        # made due to memory congestion
        expr: increase(node_pressure_memory_stalled_seconds_total[2m]) >= 5
        labels:
          severity: error
        annotations:
          summary: "Memory pressure on {{ $labels.instance }}"

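      # The load alerts below normalize the load averages by node_cpu_count
      # (recorded above), so the thresholds are fractions of total CPU capacity.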
      - alert: VeryHighLoad
        expr: ((node_load1 / node_cpu_count) > 1) or ((node_load5 / node_cpu_count) > 0.9)
        for: 5m
        labels:
          severity: error
        annotations:
          summary: "Very high load on {{ $labels.instance }}"

      - alert: HighLoad
        expr: ((node_load1 / node_cpu_count) > 0.9) or ((node_load5 / node_cpu_count) > 0.8)
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: "High load on {{ $labels.instance }}"

      - alert: ElevatedLoad
        # Detect a long-term increase in load
        expr: (node_load15 / node_cpu_count) > 0.7
        for: 15m
        labels:
          severity: info
        annotations:
          summary: "Elevated load15 on {{ $labels.instance }}"

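      # The disk alerts only consider writable filesystems: the readonly filter
      # on the left is joined per filesystem via on (instance, device, mountpoint).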
      - alert: LowFreeDiskSpace
        expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 15
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% space left"

      - alert: VeryLowFreeDiskSpace
        expr: node_filesystem_readonly == 0 and ON (instance, device, mountpoint) (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5
        for: 3m
        labels:
          severity: error
        annotations:
          summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% space left"

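  # Meta-monitoring: these rules watch the alert delivery path itself, since a
  # Prometheus without a reachable Alertmanager (or an Alertmanager that fails
  # to deliver notifications) fails silently.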
  - name: Prometheus Alertmanager
    rules:
      - alert: AlertmanagerMissing
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 5m
        labels:
          severity: warn
        annotations:
          summary: "No connected alertmanager on {{ $labels.instance }}"

      - alert: DroppedAlertNotifications
        expr: increase(prometheus_notifications_dropped_total[5m]) > 0
        labels:
          severity: error
        annotations:
          summary: "Dropped alert notifications on {{ $labels.instance }}"

      - alert: FailingAlertmanagerNotifications
        expr: increase(alertmanager_notifications_failed_total[5m]) > 0
        labels:
          severity: error
        annotations:
          summary: "Failing notifications via {{ $labels.integration }} on {{ $labels.instance }}"

      - alert: FailingRuleEvaluation
        expr: sum by (instance) (increase(prometheus_rule_evaluation_failures_total[5m])) > 0
        labels:
          severity: warn
        annotations:
          summary: "Failing rule evaluations on {{ $labels.instance }}"
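
Since every server evaluates these rules against itself and its peers, the same condition can fire on several Prometheus instances at once. Alertmanager deduplicates alerts with identical label sets, so pointing every Prometheus at all Alertmanagers (and clustering the Alertmanagers so they gossip about delivered notifications) keeps the duplication mostly in check. A minimal sketch of that part of prometheus.yml, again with placeholder host names:

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'server1.example.org:9093'
            - 'server2.example.org:9093'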