kube-prometheus: add alerting rules

2017-05-27 10:44:33 +02:00
parent f0851d5e4d
commit c4b382be6f
12 changed files with 828 additions and 598 deletions
--- a/assets/prometheus/rules/kube-apiserver.rules
+++ b/assets/prometheus/rules/kube-apiserver.rules
@@ -0,0 +1,38 @@
+# Warning-level alert: at least one API server target has reported up == 0
+# (its scrape failed) continuously for 15 minutes.
+ALERT K8SApiserverDown
+  IF up{job="apiserver"} == 0
+  FOR 15m
+  LABELS { severity = "warning" }
+  ANNOTATIONS {
+    summary = "API server unreachable",
+    description = "An API server could not be scraped.",
+  }
+
+# Disable for non-HA Kubernetes setups.
+# absent() wraps `up == 1` so the alert also fires when every API server is
+# still discovered but failing its scrape, not only when the targets vanish.
+ALERT K8SApiserverDown
+  IF absent(up{job="apiserver"} == 1) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
+  FOR 5m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "API server unreachable",
+    description = "Prometheus failed to scrape one or more API servers, or all API servers have disappeared from service discovery.",
+  }
+
+# Some verbs excluded because they are expected to be long-lasting:
+# WATCH and WATCHLIST are long-polls, CONNECT is `kubectl exec`.
+# Buckets are cumulative counters, so rate() is taken before the quantile to
+# alert on recent latency instead of the distribution since process start.
+ALERT K8SApiServerLatency
+  IF histogram_quantile(
+      0.99,
+      sum without (instance,resource) (rate(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}[5m]))
+    ) / 1e6 > 1.0
+  FOR 10m
+  LABELS { severity = "warning" }
+  ANNOTATIONS {
+    summary = "Kubernetes apiserver latency is high",
+    description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
+  }