# Kubernetes node and kubelet alerting rules (Prometheus 1.x rule syntax).

# Fires for each node whose Ready condition series (status="true") has been 0
# for a full hour.
ALERT K8SNodeNotReady
  IF kube_node_status_condition{condition="Ready", status="true"} == 0
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node status is NotReady",
    description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
  }

# Fires when more than one node is not Ready AND those nodes make up more than
# 20% of all nodes. The left-hand side of AND supplies the sample, so $value is
# the number of not-ready nodes (not the ratio).
ALERT K8SManyNodesNotReady
  IF
    count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1
    AND
    (
      count(kube_node_status_condition{condition="Ready", status="true"} == 0)
      /
      count(kube_node_status_condition{condition="Ready", status="true"})
    ) > 0.2
  FOR 1m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many Kubernetes nodes are Not Ready",
    description = "{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.",
  }

# Warning tier: more than 3% of kubelet scrape targets are down.
# The ratio is scaled by 100 so that $value is a real percentage, matching the
# "{{ $value }}%" wording in the description.
ALERT K8SKubeletDown
  IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
  }

# Critical tier (same alert name, higher severity): more than 10% of kubelet
# scrape targets are down, or no kubelet target is up at all.
# `or` binds more loosely than the comparison, so "> 10" applies only to the
# percentage branch. When the absent() branch fires, $value is 1 rather than a
# percentage.
ALERT K8SKubeletDown
  IF
    absent(up{job="kubelet"} == 1)
    or
    count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 10
  FOR 1h
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
  }

# Fires as soon as a kubelet reports more than 100 running pods (there is no
# FOR clause, so no hold period applies).
ALERT K8SKubeletTooManyPods
  IF kubelet_running_pod_count > 100
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubelet is close to pod limit",
    description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
  }