Skip to content

Commit

Permalink
feat: add alerts for node exporter
Browse files Browse the repository at this point in the history
  • Loading branch information
tboerger committed Feb 21, 2022
1 parent f3805d6 commit e615b03
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 21 deletions.
18 changes: 2 additions & 16 deletions server/prometheusrule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,6 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule

metadata:
name: prometheus-server
labels:
prometheus: server
role: rules

spec:
groups: []

...
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule

metadata:
name: kubernetes-general
labels:
Expand All @@ -24,7 +10,7 @@ metadata:

spec:
groups:
- name: kubernetes-general.rules
- name: kubernetes-general
rules:
- expr: |
sum by (cluster, namespace, pod, container) (
Expand Down Expand Up @@ -152,7 +138,7 @@ metadata:

spec:
groups:
- name: kubernetes-node.rules
- name: kubernetes-node
rules:
- expr: |
topk by(namespace, pod) (1,
Expand Down
4 changes: 2 additions & 2 deletions services/apiserver.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ metadata:

spec:
groups:
- name: kube-apiserver-general.rules
- name: kube-apiserver-general
rules:
- expr: |
(
Expand Down Expand Up @@ -436,7 +436,7 @@ spec:
quantile: "0.5"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- name: kube-apiserver-availability.rules
- name: kube-apiserver-availability
interval: 3m
rules:
- expr: |
Expand Down
2 changes: 1 addition & 1 deletion services/kubelet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ metadata:

spec:
groups:
- name: kubelet.rules
- name: kube-kubelet
rules:
- expr: |
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kube-kubelet", metrics_path="/metrics"})
Expand Down
29 changes: 28 additions & 1 deletion services/node-exporter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,34 @@ metadata:

spec:
groups:
- name: node-exporter.rules
# Alerting rule group for host-level conditions, built on the node_filesystem_*
# and node_vmstat_oom_kill metrics: disk space, inode exhaustion and OOM kills.
- name: node-alerts
rules:
# Disk space: all three conditions must hold for the same
# (instance, device, mountpoint) series:
#   1. available bytes are below 10% of the filesystem size,
#   2. a linear extrapolation of the last hour of available bytes (tmpfs
#      excluded) is below zero 24 * 3600 seconds from now,
#   3. the filesystem is not reported as read-only.
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
# The expression has to stay true for 2 minutes before the alert fires.
for: 2m
labels:
severity: warning
annotations:
summary: "Host disk will fill in 24 hours on {{ $labels.instance }}"
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate"
# Inodes: same shape as the disk alert, but using free/total inode counts and
# restricted to series whose mountpoint label is "/rootfs".
# NOTE(review): only "/rootfs" is covered here, whereas the disk alert above
# matches every mountpoint -- confirm this restriction is intended.
# NOTE(review): the first `and` has no ON (...) clause, unlike the other joins
# in this group, so it matches on the full label set of both sides.
- alert: HostInodesWillFillIn24Hours
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
# The expression has to stay true for 2 minutes before the alert fires.
for: 2m
labels:
severity: warning
annotations:
summary: "Host inodes will fill in 24 hours on {{ $labels.instance }}"
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate"
# OOM kills: true whenever the node_vmstat_oom_kill counter increased within
# the last minute.
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
# No hold period: the alert fires on the first evaluation where expr is true.
for: 0m
labels:
severity: warning
annotations:
summary: "Host OOM kill detected on {{ $labels.instance }}"
description: "OOM kill detected"

- name: node-exporter
rules:
- expr: |
count without (cpu) (
Expand Down
2 changes: 1 addition & 1 deletion services/scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ metadata:

spec:
groups:
- name: kube-scheduler.rules
- name: kube-scheduler
rules:
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
Expand Down

0 comments on commit e615b03

Please sign in to comment.