* kube-prometheus: fix alert rule K8SManyNodesNotReady * fix alert "K8SManyNodesNotReady" in helm templates & make generate * Use sync_kube_prometheus.py to make rules in helm in sync
49 lines
1.8 KiB
YAML
49 lines
1.8 KiB
YAML
# Prometheus alerting rules for node readiness and kubelet scrape health.
# Two rules intentionally share the name K8SKubeletDown; they differ in
# threshold and severity (warning at > 3 %, critical at > 10 %).
groups:
  - name: kubelet.rules
    rules:
      # A single node whose Ready condition series (status="true") has been 0
      # for a full hour.
      - alert: K8SNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: >-
            The Kubelet on {{ $labels.node }} has not checked in with the API,
            or has set itself to NotReady, for more than an hour
          summary: Node status is NotReady

      # More than one node AND more than 20 % of all nodes are not ready.
      # PromQL `and` keeps the sample value of its LEFT operand, so the
      # percentage term must come first: that is what makes {{ $value }} in
      # the description a percentage rather than the raw not-ready node count.
      - alert: K8SManyNodesNotReady
        expr: >-
          (count(kube_node_status_condition{condition="Ready",status="true"} == 0)
          / count(kube_node_status_condition{condition="Ready",status="true"})) * 100 > 20
          and count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of Kubernetes nodes are not ready'

      # More than 3 % of kubelet scrape targets report up == 0 for an hour.
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets.
          summary: Prometheus failed to scrape

      # More than 10 % of kubelet targets are down, or no kubelet target is up
      # at all: absent() then yields 1, which the `* 100` turns into 100.
      - alert: K8SKubeletDown
        expr: >-
          (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
          * 100 > 10
        for: 1h
        labels:
          severity: critical
        annotations:
          description: >-
            Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
            have disappeared from service discovery.
          summary: Many Kubelets cannot be scraped

      # A kubelet has reported more than 100 running pods for 10 minutes; the
      # description text quotes 110 as the limit being approached.
      - alert: K8SKubeletTooManyPods
        expr: kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            Kubelet {{$labels.instance}} is running {{$value}} pods, close
            to the limit of 110
          summary: Kubelet is close to pod limit