diff --git a/guides/grafana/grafana-setup.md b/guides/grafana/grafana-setup.md new file mode 100644 index 0000000..cfdbaf8 --- /dev/null +++ b/guides/grafana/grafana-setup.md @@ -0,0 +1,1498 @@ +--- +title: Docker Compose Setup für Grafana mit Prometheus, cAdvisor, blackbox und node-exporter +description: +published: true +date: 2024-02-10T21:37:39.096Z +tags: +editor: markdown +dateCreated: 2024-02-10T21:37:39.096Z +--- + +# Docker Compose Setup für Grafana mit Prometheus, cAdvisor, blackbox und node-exporter + +Diese Docker Compose-Konfiguration ermöglicht die einfache Bereitstellung von Grafana in Verbindung mit Prometheus, cAdvisor, blackbox und node-exporter mit einem Alertmanager und Ntfy unter Verwendung von Docker. + +## Voraussetzungen + +Vor der Verwendung dieser Konfiguration stellen Sie sicher, dass Sie Docker, Docker Compose, Watchtower für automatische Aktualisierungen, Keycloak für die Authentifizierung und Flame als Dashboard mit Nginx als Reverse-Proxy auf Ihrem System installiert und konfiguriert haben. + +Die Programme, die dazu benötigt werden, sind in folgenden Anleitungen dokumentiert: + +## Dienste + +### grafana + +- Grafana-Dashboard mit Konfiguration für Prometheus und Loki. +- Authentifizierung über Keycloak. +- Konfiguriert für die Verwendung mit Nginx als Reverse-Proxy. + +### Prometheus + +- Es sammelt Metriken von verschiedenen Quellen, speichert diese in einer Zeitreihendatenbank und ermöglicht die Abfrage und Analyse dieser Metriken. + +### Alertmanager +- Der Alertmanager ist ein Bestandteil des Prometheus-Ökosystems und dient der Verwaltung von Alarmmeldungen, die von Prometheus generiert werden. +- Er ermöglicht die Definition, Routing und Benachrichtigung von Alarmen an verschiedene Empfänger wie E-Mail, PagerDuty oder Slack. + +### Ntfy +- ntfy ist ein Befehlszeilenwerkzeug, das Benachrichtigungen über den Abschluss von Befehlen oder anderen Ereignissen auf dem System sendet. 
+- Nach Ausführung eines Befehls kann ntfy.sh Benachrichtigungen über verschiedene Kanäle wie Slack, Telegram oder den Systembenachrichtigungsmechanismus senden, um den Benutzer über den Abschluss des Befehls oder eines Ereignisses vom Alertmanager zu informieren. + +### node-exporter + +- Exponiert Host-Metriken für Prometheus. + +### cadvisor + +- Überwacht Container und gibt Metriken für Prometheus aus. + +### blackbox + +- Blackbox ist ein Open-Source-Tool, das für die externe Überwachung von Netzwerkdiensten verwendet wird. +- Es ermöglicht das Überprüfen der Erreichbarkeit und Integrität von Diensten durch das Senden von Anfragen und Auswerten der Antworten, um sicherzustellen, dass Dienste ordnungsgemäß funktionieren. + +### loki + +- Log-Aggregationsservice für Grafana. + +### promtail + +- Agent, der Log-Daten an Loki sendet. + +## Anwendung starten + +Erstellen Sie einen neuen Ordner für die Grafana-Installation und wechseln Sie dorthin: + +~~~ +mkdir -p /opt/containers/grafana +cd /opt/containers/grafana +~~~ + +Erstellen Sie die folgenden Dateien und passen Sie die Variablen an: + +~~~ +nvim ${FILENAME} +~~~ + +*docker-compose.yml* +~~~ +version: "3.8" + +services: + + alertmanager: + image: prom/alertmanager:latest + volumes: + - ./alertmanager/config.yml:/etc/alertmanager/config.yml + - ./alertmanager/web.yml:/etc/alertmanager/web.yml + command: + - '--config.file=/etc/alertmanager/config.yml' + - '--web.config.file=/etc/alertmanager/web.yml' + - '--web.external-url=https://alertmanager.${DOMAIN}/' # same host as VIRTUAL_HOST below + restart: unless-stopped + #ports: + # - 9093:9093 + networks: + default: + proxy: + edge-tier: + backend: + environment: + - VIRTUAL_HOST=alertmanager.${DOMAIN} + - VIRTUAL_PORT=9093 + - LETSENCRYPT_HOST=alertmanager.${DOMAIN} + - LETSENCRYPT_EMAIL=admin@${DOMAIN} + labels: + - "com.centurylinklabs.watchtower.enable=true" + - flame.type=application + - flame.name=Alertmanager + - flame.url=https://alertmanager.${DOMAIN} + - flame.icon=monitor-dashboard + + 
ntfy-alertmanager: + image: xenrox/ntfy-alertmanager:latest + #build: builds/ntfy-alertmanager/docker/. + container_name: ntfy-alertmanager + volumes: + - ./ntfy-alertmanager/config.scfg:/etc/ntfy-alertmanager/config + #ports: + # - 127.0.0.1:8080:8080 + restart: unless-stopped + networks: + - default + + node-exporter: + image: prom/node-exporter:latest + container_name: node-exporter + restart: unless-stopped + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + - ./node_exporter/textfile_collector/:/node_exporter/textfile_collector:ro + command: + - '--path.procfs=/host/proc' + - '--path.sysfs=/host/sys' + - '--path.rootfs=/rootfs' + - '--collector.filesystem.mount-points-exclude=^(/rootfs|/host|)/(sys|proc|dev|host|etc)($$|/)' # no inner quotes: list items are passed verbatim; $$ is Compose's escape for a literal $ + - '--collector.filesystem.fs-types-exclude=^(sys|proc|auto|cgroup|devpts|ns|au|fuse\.lxc|mqueue)(fs|)$$' # same: regex must not be wrapped in double quotes + - '--collector.textfile.directory=/node_exporter/textfile_collector/' + networks: + - default + labels: + - "com.centurylinklabs.watchtower.enable=true" + + cadvisor: + image: gcr.io/cadvisor/cadvisor + #image: gcr.io/cadvisor/cadvisor-arm64:v0.47.2 + container_name: cadvisor + restart: unless-stopped + #ports: + # - '8889:8080' + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + networks: + - default + - proxy + - edge-tier + environment: + - VIRTUAL_HOST=cadvisor.${DOMAIN} + - VIRTUAL_PORT=8080 + - LETSENCRYPT_HOST=cadvisor.${DOMAIN} + - LETSENCRYPT_EMAIL=admin@${DOMAIN} + labels: + - "com.centurylinklabs.watchtower.enable=true" + - flame.type=application + - flame.name=Cadvisor-Dashboard + - flame.url=https://cadvisor.${DOMAIN} + - flame.icon=monitor-dashboard + + blackbox: + image: 'prom/blackbox-exporter:latest' + #ports: + # - 9115/tcp + command: + - '--config.file=/config/blackbox.yml' + container_name: blackbox_exporter + restart: always + volumes: + - './blackbox:/config' + networks: + default: + dns: + ipv4_address: 172.28.0.5 + labels: 
+ - "com.centurylinklabs.watchtower.enable=true" + + loki: + image: grafana/loki:latest + volumes: + #- './loki-data:/loki' + - ./loki/loki-config.yaml:/etc/loki/local-config.yaml + ports: + - "127.0.0.1:3100:3100" + command: -config.file=/etc/loki/local-config.yaml + restart: always + networks: + - default + labels: + - "com.centurylinklabs.watchtower.enable=true" + + promtail: + image: grafana/promtail:latest + volumes: + - /var/log:/var/log + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /opt/containers/nginx-proxy/logs/:/config/log/nginx/:ro + - ./promtail/promtail-config.yaml:/etc/promtail/promtail-config.yaml + command: -config.file=/etc/promtail/promtail-config.yaml + restart: always + depends_on: # promtail ships logs to loki, so loki has to start first + - loki + networks: + - default + labels: + - "com.centurylinklabs.watchtower.enable=true" + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + restart: unless-stopped + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - ./prometheus/rules.yml:/etc/prometheus/rules.yml + - ./prometheus/test.yml:/etc/prometheus/test.yml + #- ./prom-data:/prometheus + - ./from-docker-labels.json:/tmp/from-docker-labels.json + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.size=20GB' + depends_on: + - node-exporter + - cadvisor + - blackbox + networks: + default: + proxy: + edge-tier: + backend: + dns: + ipv4_address: 172.28.0.8 + environment: + - VIRTUAL_HOST=prometheus.${DOMAIN} + - VIRTUAL_PORT=9090 + - LETSENCRYPT_HOST=prometheus.${DOMAIN} + - LETSENCRYPT_EMAIL=admin@${DOMAIN} + labels: + - "com.centurylinklabs.watchtower.enable=true" + - flame.type=application + - flame.name=Prometheus-Dashboard + - flame.url=https://prometheus.${DOMAIN} + - flame.icon=monitor-dashboard + + grafana: + image: grafana/grafana-oss + container_name: grafana + restart: unless-stopped + environment: + GF_SERVER_ROOT_URL: https://dashboard.${DOMAIN}/ + GF_INSTALL_PLUGINS: 
grafana-clock-panel + GF_AUTH_ANONYMOUS_ENABLED: "true" # env values are strings; quoted like the OAuth flags below + GF_AUTH_ANONYMOUS_ORG_NAME: Public + GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer + GF_AUTH_GENERIC_OAUTH_ENABLED: "true" + GF_AUTH_GENERIC_OAUTH_NAME: "SingleSignOn" + GF_AUTH_GENERIC_OAUTH_ALLOW_SIGN_UP: "true" + GF_AUTH_GENERIC_OAUTH_CLIENT_ID: "Grafana" + GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: "${SECRET}" + GF_AUTH_GENERIC_OAUTH_SCOPES: "openid profile email offline_access roles Grafana-Mapper" + GF_AUTH_GENERIC_OAUTH_AUTH_URL: "https://auth.${DOMAIN}/realms/master/protocol/openid-connect/auth" + GF_AUTH_GENERIC_OAUTH_TOKEN_URL: "https://auth.${DOMAIN}/realms/master/protocol/openid-connect/token" + GF_AUTH_GENERIC_OAUTH_API_URL: "https://auth.${DOMAIN}/realms/master/protocol/openid-connect/userinfo" + #GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(roles[*], 'Admin') && 'Admin' || contains(roles[*], 'Editor') && 'Editor' || 'Viewer'" + GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(realm_access.roles[*], 'grafana-admin') && 'Admin' || contains(realm_access.roles[*], 'Editor') && 'Editor' || 'Viewer'" + GF_AUTH_GENERIC_OAUTH_ALLOW_ASSIGN_GRAFANA_ADMIN: "true" # option belongs to [auth.generic_oauth]; without the section prefix Grafana ignores it + GF_SMTP_ENABLED: "true" + GF_SMTP_HOST: mail.${DOMAIN}:587 + GF_SMTP_USER: grafana@${DOMAIN} + GF_SMTP_PASSWORD: ${SMTP_PASSWORD} + GF_SMTP_FROM_ADDRESS: grafana@${DOMAIN} + GF_FEATURE_TOGGLES_ENABLE: publicDashboards + VIRTUAL_HOST: dashboard.${DOMAIN} + VIRTUAL_PORT: "3000" + LETSENCRYPT_HOST: dashboard.${DOMAIN} + LETSENCRYPT_EMAIL: admin@${DOMAIN} + + #ports: + # - '3000:3000' + volumes: + - './grafana_storage:/var/lib/grafana' + + depends_on: + - prometheus + - loki + + labels: + - "com.centurylinklabs.watchtower.enable=true" + - flame.type=application + - flame.name=Grafana-Dashboard + - flame.url=https://dashboard.${DOMAIN} + - flame.icon=monitor-dashboard + + networks: + default: + proxy: + edge-tier: + dns: + ipv4_address: 172.28.0.27 + +networks: + proxy: + name: nginx-proxy + external: true + edge-tier: + name: edge + external: true + dns: + name: dns + external: 
true + backend: + driver: bridge +~~~ + +*prometheus/prometheus.yml* + +~~~ +global: + scrape_interval: 15s + evaluation_interval: 15s + +rule_files: + - rules.yml +alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: + - 'alertmanager:9093' + +scrape_configs: + - job_name: 'alertmanager' + static_configs: + - targets: ['alertmanager:9093'] + + - job_name: 'prometheus' + static_configs: + - targets: ['prometheus:9090'] + + - job_name: 'node-exporter' + static_configs: + - targets: ['node-exporter:9100'] + + - job_name: 'cadvisor' + static_configs: + - targets: ['cadvisor:8080'] + + - job_name: 'blackbox' + metrics_path: /probe + params: + module: [http_2xx] # Look for a HTTP 200 response. + static_configs: + - targets: + - https://${DOMAIN} # Target to probe with https. + - https://dashboard.${DOMAIN} + - https://ntfy.${DOMAIN} + - https://auth.${DOMAIN} + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox:9115 # The blackbox exporter's real hostname:port. +~~~ + +*prometheus/rules.yml* + +~~~ +groups: +- name: alert.rules + rules: + - alert: InstanceDown + expr: up == 0 + for: 1m + labels: + severity: "critical" # lower-case like the other rules; label matching is case-sensitive + annotations: + summary: "Endpoint {{ $labels.instance }} down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute." 
+ + - alert: EndpointDown + expr: probe_success == 0 + for: 10s + labels: + severity: "critical" + annotations: + summary: "Endpoint {{ $labels.instance }} down" + +### Prometheus + + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissing + expr: up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAllTargetsMissing + expr: sum by (job) (up) == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus all targets missing (instance {{ $labels.instance }}) + description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissingWithWarmupTime + expr: sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600)) + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing with warmup time (instance {{ $labels.instance }}) + description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTooManyRestarts + expr: 
changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus too many restarts (instance {{ $labels.instance }}) + description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerJobMissing + expr: absent(up{job="alertmanager"}) + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigNotSynced + expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) + description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + #- alert: PrometheusAlertmanagerE2eDeadManSwitch + # expr: vector(1) + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) + # description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTemplateTextExpansionFailures + expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + + - alert: PrometheusTargetEmpty + expr: prometheus_sd_discovered_targets == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target empty (instance {{ $labels.instance }}) + description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. 
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusLargeScrape + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapeDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointCreationFailures + expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCompactionsFailed + expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB 
compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbHeadTruncationsFailed + expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbReloadFailures + expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalTruncationsFailed + expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTimeseriesCardinality + expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) + description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ 
$labels }}" + +### Node-Exporter + + - alert: HostOutOfMemory + expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly + - alert: HostMemoryIsUnderutilized + expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} # avg_over_time: MemAvailable is a gauge, rate() only fits counters + for: 1w + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 20% for 1 week. Consider reducing memory space. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputIn + expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputOut + expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # Please add ignored mountpoints in node_exporter parameters like + # 
"--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. + - alert: HostOutOfDiskSpace + expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. + - alert: HostDiskWillFillIn24Hours + expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfInodes + expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n 
LABELS = {{ $labels }}" + + #- alert: HostFilesystemDeviceError + # expr: node_filesystem_device_error == 1 + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: Host filesystem device error (instance {{ $labels.instance }}) + # description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostInodesWillFillIn24Hours + expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadLatency + expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteLatency + expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value 
}}\n LABELS = {{ $labels }}" + + - alert: HostHighCpuLoad + expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 10m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly + #- alert: HostCpuIsUnderutilized + # expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + # for: 1w + # labels: + # severity: info + # annotations: + # summary: Host CPU is underutilized (instance {{ $labels.instance }}) + # description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuStealNoisyNeighbor + expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuHighIowait + expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. 
A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # 10000 context switches is an arbitrary number. + # The alert threshold depends on the nature of the application. + # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 + - alert: HostContextSwitching + expr: ((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching (instance {{ $labels.instance }}) + description: "Context switching is growing on the node (> 10000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSwapIsFillingUp + expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host swap is filling up (instance {{ $labels.instance }}) + description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: 
HostPhysicalComponentTooHot + expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: (node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidArrayGotInactive + expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidDiskFailure + expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOomKillDetected + expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors 
+ expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkInterfaceSaturated + expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 1m + labels: + severity: warning + annotations: + summary: Host Network Interface Saturated (instance {{ $labels.instance }}) + description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded 
(instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. 
Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRequiresReboot + expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +### cAdvisor + + # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. + - alert: ContainerKilled + expr: time() - container_last_seen > 600 + for: 0m + labels: + severity: warning + annotations: + summary: Container killed (instance {{ $labels.instance }}) + description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. + - alert: ContainerAbsent + expr: absent(container_last_seen) + for: 5m + labels: + severity: warning + annotations: + summary: Container absent (instance {{ $labels.instance }}) + description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ContainerHighCpuUtilization + expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80 + for: 2m + labels: + severity: warning + annotations: + summary: Container High CPU utilization (instance {{ $labels.instance }}) + description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d + - alert: ContainerHighMemoryUsage + expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80 + for: 2m + labels: + severity: warning + annotations: + summary: Container High Memory usage (instance 
{{ $labels.instance }}) + description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + #- alert: ContainerVolumeUsage + # expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80 + # for: 2m + # labels: + # severity: warning + # annotations: + # summary: Container Volume usage (instance {{ $labels.instance }}) + # description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ContainerHighThrottleRate + expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1 + for: 2m + labels: + severity: warning + annotations: + summary: Container high throttle rate (instance {{ $labels.instance }}) + description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + #- alert: ContainerLowCpuUtilization + # expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20 + # for: 7d + # labels: + # severity: info + # annotations: + # summary: Container Low CPU utilization (instance {{ $labels.instance }}) + # description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + #- alert: ContainerLowMemoryUsage + # expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20 + # for: 7d + # labels: + # severity: info + # annotations: + # summary: Container Low Memory usage (instance {{ $labels.instance }}) + # description: "Container Memory usage is under 20% for 1 week. 
Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +### Custom-Shit + + - alert: DataDiskSpace15%Free +# expr: 100 - ((node_filesystem_avail_bytes{device="/dev/sda1",mountpoint="/opt/data"} * 100) / node_filesystem_size_bytes{device="/dev/sda1",mountpoint="/opt/data"}) > 85 + expr: 100 - (100 * ((node_filesystem_avail_bytes{mountpoint="/opt/data",fstype!="rootfs"} ) / (node_filesystem_size_bytes{mountpoint="/opt/data",fstype!="rootfs"}) )) > 85 + labels: + severity: moderate + annotations: + summary: "Instance {{ $labels.instance }} is low on disk space" + description: "diskspace on {{ $labels.instance }} is used over {{ $value }}% ." + + - alert: DataDisk2Space15%Free +# expr: 100 - ((node_filesystem_avail_bytes{device="/dev/sdb1",mountpoint="/opt/data2"} * 100) / node_filesystem_size_bytes{device="/dev/sdb1",mountpoint="/opt/data2"}) > 85 + expr: 100 - (100 * ((node_filesystem_avail_bytes{mountpoint="/opt/data2",fstype!="rootfs"} ) / (node_filesystem_size_bytes{mountpoint="/opt/data2",fstype!="rootfs"}) )) > 85 + labels: + severity: moderate + annotations: + summary: "Instance {{ $labels.instance }} is low on disk space" + description: "diskspace on {{ $labels.instance }} is used over {{ $value }}% ." + + - alert: RootDiskSpace10%Free +# expr: 100 - ((node_filesystem_avail_bytes{device="/dev/nvme0n1p3",mountpoint="/etc/hostname"} * 100) / node_filesystem_size_bytes{device="/dev/nvme0n1p3",mountpoint="/etc/hostname"}) > 90 + expr: 100 - (100 * ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} ) / (node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) )) > 90 + labels: + severity: moderate + annotations: + summary: "Instance {{ $labels.instance }} is low on disk space" + description: "diskspace on {{ $labels.instance }} is used over {{ $value }}% ." +~~~ + +*alertmanager/config.yml* + +~~~ +global: + resolve_timeout: 1m + # The smarthost and SMTP sender used for mail notifications. 
+ #smtp_smarthost: 'mail.${DOMAIN}:587' + #smtp_from: 'grafana@${DOMAIN}' + #smtp_auth_username: 'grafana@${DOMAIN}' + #smtp_auth_password: '${SECRET}' + +route: + receiver: 'ntfy' + #receiver: 'Mail Alerts' + +receivers: + - name: "ntfy" + webhook_configs: + - url: "http://ntfy-alertmanager:8080" + http_config: + basic_auth: + username: "${USER}" + password: "${SECRET}" + - name: 'Mail Alerts' + email_configs: + - smarthost: '${DOMAIN}:587' + auth_username: 'grafana@${DOMAIN}' + auth_password: "${SECRET}" + from: 'grafana@${DOMAIN}' + to: '${USER}@${DOMAIN}' + send_resolved: true + headers: + subject: 'Prometheus Mail Alerts' +~~~ + +*ntfy-alertmanager/config.scfg* + +~~~ +# Public facing base URL of the service (e.g. https://ntfy-alertmanager.xenrox.net) +# This setting is required for the "Silence" feature. +base-url https://ntfy-alertmanager.${DOMAIN} +# http listen address +http-address :8080 +# Log level (either debug, info, warning, error) +log-level debug +# Log format (either text or json) +log-format text +# When multiple alerts are grouped together by Alertmanager, they can either be sent +# each on their own (single mode) or be kept together (multi mode) (either single or multi; default is multi) +alert-mode single +# Optionally protect with HTTP basic authentication +#user webhookUser +#password webhookPass + +labels { + order "severity,instance" + + severity "critical" { + priority 5 + tags "rotating_light" + icon "https://foo.com/critical.png" + # Forward messages with severity "critical" to the specified email address. + #email-address ${USER}@${DOMAIN} + # Call the specified number. Use `yes` to pick the first of your verified numbers. 
+ #call yes + } + + severity "warning" { + priority 3 + } + + severity "info" { + priority 1 + } + + instance "brothertec.eu" { + tags "computer,example" + } +} + +# Settings for resolved alerts +resolved { + tags "resolved,partying_face" + icon "https://foo.com/resolved.png" + priority 1 +} + +ntfy { + # URL of the ntfy topic - required + topic https://ntfy.brothertec.eu/alertmanager-alerts + # ntfy authentication via Basic Auth (https://docs.ntfy.sh/publish/#username-password) + user simono41 + password ${SECRET} + # ntfy authentication via access tokens (https://docs.ntfy.sh/publish/#access-tokens) + # Either access-token or a user/password combination can be used - not both. + #access-token foobar + # When using (self signed) certificates that cannot be verified, you can instead specify + # the SHA512 fingerprint. + # openssl can be used to obtain it: + # openssl s_client -connect HOST:PORT | openssl x509 -fingerprint -sha512 -noout + # For convenience ntfy-alertmanager will convert the certificate to lower case and remove all colons. + certificate-fingerprint 13:6D:2B:88:9C:57:36:D0:81:B4:B2:9C:79:09:27:62:92:CF:B8:6A:6B:D3:AD:46:35:CB:70:17:EB:99:6E:28:08:2A:B8:C6:79:4B:F6:2E:81:79:41:98:1D:53:C8:07:B3:5C:24:5F:B1:8E:B6:FB:66:B5:DD:B4:D0:5C:29:91 + # Forward all messages to the specified email address. + #email-address ${USER}@${DOMAIN} + # Call the specified number for all alerts. Use `yes` to pick the first of your verified numbers. + #call +49123456789 +} + +alertmanager { + # If set, the ntfy message will contain a "Silence" button, which can be used + # to create a silence via the Alertmanager API. Because of limitations in ntfy, + # the request will be proxied through ntfy-alertmanager. Therefore ntfy-alertmanager + # needs to be exposed to external network requests and base-url has to be set. + # + # When alert-mode is set to "single" all alert labels will be used to create the silence. 
+ # When it is "multi" common labels between all the alerts will be used. WARNING: This + # could silence unwanted alerts. + silence-duration 24h + # Basic authentication (https://prometheus.io/docs/alerting/latest/https/) + user simono41 + password ${SECRET} + # By default the Alertmanager URL gets parsed from the webhook. In case that + # Alertmanager is not reachable under that URL, it can be overwritten here. + url https://alertmanager.${DOMAIN} +} + +# When the alert-mode is set to single, ntfy-alertmanager will cache each single alert +# to avoid sending recurrences. +cache { + # The type of cache that will be used (either disabled, memory or redis; default is disabled). + type redis + # How long messages stay in the cache for + duration 24h + + # Memory cache settings + # Interval in which the cache is cleaned up + #cleanup-interval 1h + + # Redis cache settings + # URL to connect to redis (default: redis://localhost:6379) + #redis-url redis://user:password@localhost:6789/3 + redis-url redis://redis:6379 +} +~~~ + +*blackbox/blackbox.yml* + +~~~ +modules: + http_2xx: + prober: http + http: + preferred_ip_protocol: "ip4" + http_post_2xx: + prober: http + http: + method: POST + http_basic_auth: + prober: http + timeout: 15s + http: + preferred_ip_protocol: "ip4" + method: POST + basic_auth: + username: "${USER}" + password: "${SECRET}" + tcp_connect: + prober: tcp + pop3s_banner: + prober: tcp + tcp: + query_response: + - expect: "^\\+OK" + tls: true + tls_config: + insecure_skip_verify: false + grpc: + prober: grpc + grpc: + tls: true + preferred_ip_protocol: "ip4" + grpc_plain: + prober: grpc + grpc: + tls: false + service: "service1" + ssh_banner: + prober: tcp + tcp: + query_response: + - expect: "^SSH-2.0-" + - send: "SSH-2.0-blackbox-ssh-check" + irc_banner: + prober: tcp + tcp: + query_response: + - send: "NICK prober" + - send: "USER prober prober prober :prober" + - expect: "PING :([^ ]+)" + send: "PONG ${1}" + - expect: "^:[^ ]+ 001" + icmp: + prober: 
icmp + icmp_ttl5: + prober: icmp + timeout: 5s + icmp: + ttl: 5 + + smtp_starttls: + prober: tcp + timeout: 20s + tcp: + preferred_ip_protocol: ip4 + tls_config: + insecure_skip_verify: true + query_response: + - expect: "^220 ([^ ]+) ESMTP (.+)$" + - send: "EHLO prober" + - expect: "^250-(.*)" + - send: "STARTTLS" + - expect: "^220" + - starttls: true + - send: "EHLO prober" + - expect: "^250-" + - send: "QUIT" +~~~ + +*loki/loki-config.yaml* + +~~~ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +limits_config: + reject_old_samples: true + reject_old_samples_max_age: 168h + retention_period: 360h + max_query_series: 100000 + max_query_parallelism: 2 + +schema_config: + configs: + - from: 2020-10-24 + store: boltdb-shipper + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 24h + +ruler: + alertmanager_url: http://alertmanager:9093 + +# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration +# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/ +# +# Statistics help us better understand how Loki is used, and they show us performance +# levels for most users. This helps us prioritize features and documentation. +# For more information on what's sent, look at +# https://github.com/grafana/loki/blob/main/pkg/usagestats/stats.go +# Refer to the buildReport method to see what goes into a report. 
+# +# If you would like to disable reporting, uncomment the following lines: +#analytics: +# reporting_enabled: false +~~~ + +*promtail/promtail-config.yaml* + +~~~ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: +- job_name: system + static_configs: + - targets: + - localhost + labels: + job: varlogs + __path__: /var/log/*log + +- job_name: nginx + static_configs: + - targets: + - localhost + labels: + job: nginxlogs + __path__: /config/log/nginx/*log + +- job_name: containers + static_configs: + - targets: + - localhost + labels: + job: containerlogs + __path__: /var/lib/docker/containers/*/*log + + pipeline_stages: + - json: + expressions: + output: log + stream: stream + attrs: + - json: + expressions: + tag: + source: attrs + - regex: + expression: (?P<image_name>(?:[^|]*[^|])).(?P<container_name>(?:[^|]*[^|])).(?P<image_id>(?:[^|]*[^|])).(?P<container_id>(?:[^|]*[^|])) + source: tag + - timestamp: + format: RFC3339Nano + source: time + - labels: + tag: + stream: + image_name: + container_name: + image_id: + container_id: + - output: + source: output +~~~ + +Führen Sie den folgenden Befehl im Verzeichnis aus, in dem sich die `docker-compose.yml`-Datei befindet: + +```bash +docker-compose up -d +``` + +Dies startet die Dienste im Hintergrund. + +## Anpassungen + +Passen Sie die Umgebungsvariablen, Volumes, Netzwerke und Labels in der `docker-compose.yml`-Datei an Ihre spezifischen Anforderungen an. 
+ +## Weitere Informationen + +Für weitere Informationen zu den einzelnen Diensten und deren Konfigurationen, siehe die entsprechenden Dokumentationen: + +- [Prometheus](https://prometheus.io/docs/introduction/overview/) +- [cAdvisor](https://github.com/google/cadvisor) +- [node-exporter](https://github.com/prometheus/node_exporter) +- [Loki](https://grafana.com/oss/loki/) +- [Promtail](https://grafana.com/docs/loki/latest/clients/promtail/) +- [Grafana](https://grafana.com/docs/grafana/latest/) +- [Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/) +- [Ntfy](https://ntfy.sh/) +- [Watchtower](https://containrrr.dev/watchtower/) +- [Flame](https://github.com/pawelmalak/flame) \ No newline at end of file