kube-prometheus: add alerting rules
@@ -169,220 +169,3 @@ cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6

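These recording rules precompute scheduler pod-binding latency quantiles, dividing by 1e6 to convert the metric's microsecond buckets into seconds. The same pattern extends to any other quantile; a minimal sketch (the 0.99 rule below is an illustration, not part of this change):

cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
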
ALERT K8SNodeDown
  IF up{job="kubelet"} == 0
  FOR 1h
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubelet cannot be scraped",
    description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
  }

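Prometheus synthesizes the up metric for every scrape target (1 on success, 0 on failure), so this alert fires per kubelet instance. A sketch of an ad-hoc console query to count unreachable kubelets per cluster:

count by (cluster) (up{job="kubelet"} == 0)
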
ALERT K8SNodeNotReady
  IF kube_node_status_ready{condition="true"} == 0
  FOR 1h
  LABELS {
    service = "k8s",
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node status is NotReady",
    description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour.",
  }

ALERT K8SManyNodesNotReady
  IF
    count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
    AND
    (
      count by (cluster) (kube_node_status_ready{condition="true"} == 0)
      /
      count by (cluster) (kube_node_status_ready{condition="true"})
    ) > 0.2
  FOR 1m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many K8s nodes are Not Ready",
    description = "{{ $value }} K8s nodes (more than 20% of cluster {{ $labels.cluster }}) are in the NotReady state.",
  }

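Worked example for the compound condition above: in a 10-node cluster with 3 nodes NotReady, both clauses hold (3 > 1, and 3 / 10 = 0.3 > 0.2), so the alert fires after one minute; a single NotReady node satisfies neither clause and only triggers the per-node K8SNodeNotReady warning. The ratio can be inspected on its own:

count by (cluster) (kube_node_status_ready{condition="true"} == 0)
  / count by (cluster) (kube_node_status_ready{condition="true"})
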
ALERT K8SKubeletNodeExporterDown
  IF up{job="node-exporter"} == 0
  FOR 15m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubelet node_exporter cannot be scraped",
    description = "Prometheus could not scrape a {{ $labels.job }} for more than 15 minutes.",
  }

ALERT K8SKubeletDown
  IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
  FOR 1h
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
  }

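The absent() guard matters because an expression over series that no longer exist returns nothing rather than firing: absent(up{job="kubelet"}) yields a single-element vector with value 1 only when no kubelet targets remain in service discovery, covering the case the ratio term cannot. The guard can be tested in isolation:

absent(up{job="kubelet"})
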
ALERT K8SApiserverDown
  IF up{job="kubernetes"} == 0
  FOR 15m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "An API server could not be scraped.",
  }

# Disable for non-HA Kubernetes setups.
ALERT K8SApiserverDown
  IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
  }

ALERT K8SSchedulerDown
  IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Scheduler is down",
    description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
  }

ALERT K8SControllerManagerDown
  IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Controller manager is down",
    description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
  }

ALERT K8SConntrackTableFull
  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full.",
  }

ALERT K8SConntrackTableFull
  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full.",
  }

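The conntrack expression is reused at two thresholds: a 50% warning that must persist for 10 minutes, and a 90% critical alert with no FOR clause so it fires on the first evaluation. The fullest nodes can be listed ad hoc; a sketch:

topk(5, 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit)
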
# To catch the conntrack sysctl de-tuning when it happens
ALERT K8SConntrackTuningMissing
  IF node_nf_conntrack_udp_timeout > 10
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node does not have the correct conntrack tunings",
    description = "Nodes keep un-setting the correct tunings; investigate when it happens.",
  }

ALERT K8STooManyOpenFiles
  IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "{{ $labels.job }} has too many open file descriptors",
    description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
  }

ALERT K8STooManyOpenFiles
  IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "{{ $labels.job }} has too many open file descriptors",
    description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
  }

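process_open_fds and process_max_fds come from the standard Prometheus client instrumentation, so the same ratio works for any scraped process. A sketch for spotting the heaviest consumers before either threshold trips:

topk(5, 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds)
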
# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
ALERT K8SApiServerLatency
  IF histogram_quantile(
      0.99,
      sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
    ) / 1e6 > 1.0
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubernetes apiserver latency is high",
    description = "99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
  }

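Aggregating with sum without (instance,node,resource) keeps both the le buckets that histogram_quantile needs and the verb label, so the alert fires per verb and {{ $labels.verb }} resolves in the description. A similar per-verb breakdown written with sum by, as a sketch:

histogram_quantile(0.99, sum by (le,verb) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})) / 1e6
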
ALERT K8SApiServerEtcdAccessLatency
  IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
  FOR 15m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Access to etcd is slow",
    description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
  }

ALERT K8SKubeletTooManyPods
  IF kubelet_running_pod_count > 100
  LABELS {
    service = "k8s",
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubelet is close to pod limit",
    description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110.",
  }

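The limit of 110 referenced above is the default kubelet maximum pods per node, so alerting at 100 running pods leaves headroom to rebalance. The most loaded kubelets can be listed directly; a sketch:

topk(5, kubelet_running_pod_count)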