From e615b033be3aea2830de6879cdbb854522350e1f Mon Sep 17 00:00:00 2001 From: Thomas Boerger Date: Mon, 21 Feb 2022 12:35:42 +0100 Subject: [PATCH] feat: add alerts for node exporter --- server/prometheusrule.yml | 18 ++---------------- services/apiserver.yml | 4 ++-- services/kubelet.yml | 2 +- services/node-exporter.yml | 29 ++++++++++++++++++++++++++++- services/scheduler.yml | 2 +- 5 files changed, 34 insertions(+), 21 deletions(-) diff --git a/server/prometheusrule.yml b/server/prometheusrule.yml index d9fbe81..de43215 100644 --- a/server/prometheusrule.yml +++ b/server/prometheusrule.yml @@ -2,20 +2,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule -metadata: - name: prometheus-server - labels: - prometheus: server - role: rules - -spec: - groups: [] - -... ---- -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule - metadata: name: kubernetes-general labels: @@ -24,7 +10,7 @@ metadata: spec: groups: - - name: kubernetes-general.rules + - name: kubernetes-general rules: - expr: | sum by (cluster, namespace, pod, container) ( @@ -152,7 +138,7 @@ metadata: spec: groups: - - name: kubernetes-node.rules + - name: kubernetes-node rules: - expr: | topk by(namespace, pod) (1, diff --git a/services/apiserver.yml b/services/apiserver.yml index e9abee2..56a06ac 100644 --- a/services/apiserver.yml +++ b/services/apiserver.yml @@ -95,7 +95,7 @@ metadata: spec: groups: - - name: kube-apiserver-general.rules + - name: kube-apiserver-general rules: - expr: | ( @@ -436,7 +436,7 @@ spec: quantile: "0.5" record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - name: kube-apiserver-availability.rules + - name: kube-apiserver-availability interval: 3m rules: - expr: | diff --git a/services/kubelet.yml b/services/kubelet.yml index ed2a283..9c4c2a8 100644 --- a/services/kubelet.yml +++ b/services/kubelet.yml @@ -113,7 +113,7 @@ metadata: spec: groups: - - name: kubelet.rules + - name: kube-kubelet rules: - expr: | 
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kube-kubelet", metrics_path="/metrics"}) diff --git a/services/node-exporter.yml b/services/node-exporter.yml index c280521..29a5a9d 100644 --- a/services/node-exporter.yml +++ b/services/node-exporter.yml @@ -42,7 +42,34 @@ metadata: spec: groups: - - name: node-exporter.rules + - name: node-alerts + rules: + - alert: HostDiskWillFillIn24Hours + expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 + for: 2m + labels: + severity: warning + annotations: + summary: "Host disk will fill in 24 hours on {{ $labels.instance }}" + description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate" + - alert: HostInodesWillFillIn24Hours + expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 + for: 2m + labels: + severity: warning + annotations: + summary: "Host inodes will fill in 24 hours on {{ $labels.instance }}" + description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate" + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: "Host OOM kill detected on {{ $labels.instance }}" + description: "OOM kill detected" + + - name: node-exporter rules: - expr: | count without (cpu) ( diff --git a/services/scheduler.yml b/services/scheduler.yml index 2131cf1..64d99b7 100644 --- a/services/scheduler.yml +++ 
b/services/scheduler.yml @@ -32,7 +32,7 @@ metadata: spec: groups: - - name: kube-scheduler.rules + - name: kube-scheduler rules: - expr: | histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))