@@ -5,8 +5,10 @@ route:
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'webhook'
  receiver: 'null'
  routes:
  - match:
      alertname: DeadMansSwitch
    receiver: 'null'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://alertmanagerwh:30500/'
- name: 'null'
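For readability, the routing configuration that results from this change is sketched below (indentation and the group_by setting are taken from the updated alertmanager-main Secret later in this diff; the hunk above is authoritative). The always-firing DeadMansSwitch alert is routed to the new 'null' receiver, which has no notification configs, so it never notifies anyone; everything else falls through to the default receiver.

route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
  - match:
      alertname: DeadMansSwitch
    receiver: 'null'
receivers:
- name: 'null'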
assets/prometheus/rules/alertmanager.rules (new file, 36 lines)
@@ -0,0 +1,36 @@
ALERT AlertmanagerConfigInconsistent
  IF count_values by (service) ("config_hash", alertmanager_config_hash)
       / on(service) group_left
       label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Alertmanager configurations are inconsistent",
    description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
  }

ALERT AlertmanagerDownOrMissing
  IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
       / on(job) group_right
       sum by(job) (up) != 1
  FOR 5m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Alertmanager down or not discovered",
    description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
  }

ALERT FailedReload
  IF alertmanager_config_last_reload_successful == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Alertmanager configuration reload has failed",
    description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
  }
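To make the AlertmanagerConfigInconsistent expression easier to follow, here is an illustrative evaluation (the hash values and replica count below are invented for the example). count_values by (service) ("config_hash", alertmanager_config_hash) emits one series per distinct configuration hash, valued at the number of replicas reporting that hash. With 3 replicas of alertmanager-main, one of which still runs a stale configuration:

  {service="alertmanager-main", config_hash="111"}  2
  {service="alertmanager-main", config_hash="222"}  1

Dividing by the relabelled prometheus_operator_alertmanager_spec_replicas value (3) yields 0.67 and 0.33; neither equals 1, so the alert fires after 5m. Only when every replica reports the same hash does the single resulting ratio equal exactly 1.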
@@ -1,121 +0,0 @@
|
||||
### General cluster availability ###
|
||||
|
||||
# alert if another failed peer will result in an unavailable cluster
|
||||
ALERT InsufficientPeers
|
||||
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Etcd cluster small",
|
||||
description = "If one more etcd peer goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
### HTTP requests alerts ###
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if 50% of requests get a 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
|
||||
ALERT HTTPRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow HTTP requests",
|
||||
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
|
||||
}
|
||||
|
||||
### File descriptor alerts ###
|
||||
|
||||
instance:fd_utilization = process_open_fds / process_max_fds
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next 4 hours
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next hour
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
### etcd proposal alerts ###
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of failed proposals within the etcd cluster are happening",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
}
|
||||
|
||||
### etcd disk io latency alerts ###
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "ectd instance {{ $labels.instance }} fync durations are high",
|
||||
}
|
assets/prometheus/rules/etcd3.rules (new file, 177 lines)
@@ -0,0 +1,177 @@
|
||||
# general cluster availability
|
||||
|
||||
# alert if another failed member will result in an unavailable cluster
|
||||
ALERT InsufficientMembers
|
||||
IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd cluster insufficient members",
|
||||
description = "If one more etcd member goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
# etcd leader alerts
|
||||
# ==================
|
||||
|
||||
# alert if any etcd instance has no leader
|
||||
ALERT NoLeader
|
||||
IF etcd_server_has_leader{job="etcd"} == 0
|
||||
FOR 1m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member has no leader",
|
||||
description = "etcd member {{ $labels.instance }} has no leader",
|
||||
}
|
||||
|
||||
# alert if there are lots of leader changes
|
||||
ALERT HighNumberOfLeaderChanges
|
||||
IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of leader changes within the etcd cluster are happening",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
|
||||
}
|
||||
|
||||
# gRPC request alerts
|
||||
# ===================
|
||||
|
||||
# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of gRPC method calls take more than 150ms
|
||||
ALERT GRPCRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow gRPC requests",
|
||||
description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow",
|
||||
}
|
||||
|
||||
# HTTP requests alerts
|
||||
# ====================
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
|
||||
ALERT HTTPRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow HTTP requests",
|
||||
description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
|
||||
}
|
||||
|
||||
# etcd member communication alerts
|
||||
# ================================
|
||||
|
||||
# alert if 99th percentile of round trips take 150ms
|
||||
ALERT EtcdMemberCommunicationSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member communication is slow",
|
||||
description = "etcd instance {{ $labels.instance }} member communication with {{ $label.To }} is slow",
|
||||
}
|
||||
|
||||
# etcd proposal alerts
|
||||
# ====================
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of proposals within the etcd cluster are failing",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
}
|
||||
|
||||
# etcd disk io latency alerts
|
||||
# ===========================
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "etcd instance {{ $labels.instance }} fync durations are high",
|
||||
}
|
||||
|
||||
# alert if 99th percentile of commit durations is higher than 250ms
|
||||
ALERT HighCommitDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high commit durations",
|
||||
description = "etcd instance {{ $labels.instance }} commit durations are high",
|
||||
}
|
assets/prometheus/rules/general.rules (new file, 63 lines)
@@ -0,0 +1,63 @@
### Up Alerting ###

ALERT TargetDown
  IF 100 * (count(up == 0) / count(up)) > 3
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Targets are down",
    description = "More than {{ $value }}% of targets are down."
  }

### Dead man's switch ###

ALERT DeadMansSwitch
  IF vector(1)
  LABELS {
    severity = "none",
  }
  ANNOTATIONS {
    summary = "Alerting DeadMansSwitch",
    description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
  }

### File descriptor alerts ###

ALERT TooManyOpenFileDescriptors
  IF 100 * (process_open_fds / process_max_fds) > 95
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "too many open file descriptors",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
  }

instance:fd_utilization = process_open_fds / process_max_fds

# alert if file descriptors are likely to exhaust within the next 4 hours
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its available file/socket descriptors soon",
  }

# alert if file descriptors are likely to exhaust within the next hour
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its available file/socket descriptors soon",
  }
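A worked illustration of the FdExhaustionClose expressions (the figures are invented): if instance:fd_utilization for a pod has risen linearly from 0.80 to 0.85 over the last hour, predict_linear(instance:fd_utilization[1h], 3600 * 4) extrapolates 0.85 + 4 * 0.05 = 1.05. Because 1.05 > 1 (more than 100% of the available descriptors), the four-hour warning fires once the condition has held for 10m; the one-hour critical variant applies the same arithmetic to the last 10 minutes of samples.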
assets/prometheus/rules/kube-apiserver.rules (new file, 28 lines)
@@ -0,0 +1,28 @@
ALERT K8SApiserverDown
  IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
  }

# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
#
# apiserver_request_latencies' unit is microseconds
ALERT K8SApiServerLatency
  IF histogram_quantile(
      0.99,
      sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
    ) / 1e6 > 1.0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubernetes apiserver latency is high",
    description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
  }
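Because apiserver_request_latencies_bucket is recorded in microseconds, the quantile is divided by 1e6 before being compared in seconds. For example (illustrative figures): a 99th-percentile estimate of 1,500,000 microseconds becomes 1,500,000 / 1e6 = 1.5 s, which exceeds the 1.0 s threshold, so K8SApiServerLatency fires once this persists for 10m.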
assets/prometheus/rules/kube-controller-manager.rules (new file, 10 lines)
@@ -0,0 +1,10 @@
ALERT K8SControllerManagerDown
  IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
  FOR 5m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Controller manager is down",
    description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
  }
assets/prometheus/rules/kube-scheduler.rules (new file, 10 lines)
@@ -0,0 +1,10 @@
ALERT K8SSchedulerDown
  IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
  FOR 5m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Scheduler is down",
    description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
  }
assets/prometheus/rules/kubelet.rules (new file, 60 lines)
@@ -0,0 +1,60 @@
ALERT K8SNodeNotReady
  IF kube_node_status_ready{condition="true"} == 0
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node status is NotReady",
    description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
  }

ALERT K8SManyNodesNotReady
  IF
    count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
    AND
      (
        count by (cluster) (kube_node_status_ready{condition="true"} == 0)
      /
        count by (cluster) (kube_node_status_ready{condition="true"})
      ) > 0.2
  FOR 1m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many K8s nodes are Not Ready",
    description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
  }

ALERT K8SKubeletDown
  IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
  }

ALERT K8SKubeletDown
  IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
  FOR 1h
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
  }

ALERT K8SKubeletTooManyPods
  IF kubelet_running_pod_count > 100
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubelet is close to pod limit",
    description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
  }
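An illustration of the two K8SKubeletDown thresholds (cluster size invented for the example): with 100 kubelet targets, 4 unscrapeable kubelets give 4/100 = 0.04 > 0.03, which raises the warning after an hour; 12 down give 0.12 > 0.1 and escalate to critical, as does absent(up{job="kubelet"}) when every kubelet has vanished from service discovery.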
@@ -169,220 +169,3 @@ cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
|
||||
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
|
||||
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
|
||||
ALERT K8SNodeDown
|
||||
IF up{job="kubelet"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
|
||||
}
|
||||
|
||||
ALERT K8SNodeNotReady
|
||||
IF kube_node_status_ready{condition="true"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node status is NotReady",
|
||||
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
|
||||
}
|
||||
|
||||
ALERT K8SManyNodesNotReady
|
||||
IF
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
|
||||
AND
|
||||
(
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0)
|
||||
/
|
||||
count by (cluster) (kube_node_status_ready{condition="true"})
|
||||
) > 0.2
|
||||
FOR 1m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many K8s nodes are Not Ready",
|
||||
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletNodeExporterDown
|
||||
IF up{job="node-exporter"} == 0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet node_exporter cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletDown
|
||||
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many Kubelets cannot be scraped",
|
||||
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
ALERT K8SApiserverDown
|
||||
IF up{job="kubernetes"} == 0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "API server unreachable",
|
||||
description = "An API server could not be scraped.",
|
||||
}
|
||||
|
||||
# Disable for non HA kubernetes setups.
|
||||
ALERT K8SApiserverDown
|
||||
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "API server unreachable",
|
||||
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
ALERT K8SSchedulerDown
|
||||
IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Scheduler is down",
|
||||
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
|
||||
}
|
||||
|
||||
ALERT K8SControllerManagerDown
|
||||
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Controller manager is down",
|
||||
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
|
||||
}
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
}
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
}
|
||||
|
||||
# To catch the conntrack sysctl de-tuning when it happens
|
||||
ALERT K8SConntrackTuningMissing
|
||||
IF node_nf_conntrack_udp_timeout > 10
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node does not have the correct conntrack tunings",
|
||||
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
# Some verbs excluded because they are expected to be long-lasting:
|
||||
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
|
||||
ALERT K8SApiServerLatency
|
||||
IF histogram_quantile(
|
||||
0.99,
|
||||
sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
|
||||
) / 1e6 > 1.0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubernetes apiserver latency is high",
|
||||
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
|
||||
}
|
||||
|
||||
ALERT K8SApiServerEtcdAccessLatency
|
||||
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Access to etcd is slow",
|
||||
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletTooManyPods
|
||||
IF kubelet_running_pod_count > 100
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet is close to pod limit",
|
||||
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
|
||||
}
|
||||
|
||||
|
assets/prometheus/rules/node.rules (new file, 10 lines)
@@ -0,0 +1,10 @@
ALERT NodeExporterDown
  IF up{job="node-exporter"} == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "node-exporter cannot be scraped",
    description = "Prometheus could not scrape a node-exporter for more than 10m.",
  }
assets/prometheus/rules/prometheus.rules (new file, 10 lines)
@@ -0,0 +1,10 @@
ALERT FailedReload
  IF prometheus_config_last_reload_successful == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Prometheus configuration reload has failed",
    description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
  }
@@ -3,4 +3,4 @@ kind: Secret
metadata:
  name: alertmanager-main
data:
  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==
  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
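For reference, the new base64 value decodes to global: resolve_timeout: 5m followed by exactly the route and receivers blocks sketched after the first hunk of this diff (a default 'null' receiver plus a DeadMansSwitch child route), while the old value encoded the previous 'webhook' receiver configuration; the Secret remains the source of truth.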
@@ -6,7 +6,7 @@ metadata:
|
||||
labels:
|
||||
prometheus: frontend
|
||||
spec:
|
||||
version: v1.6.3
|
||||
version: v1.7.0
|
||||
serviceMonitorSelector:
|
||||
matchLabels:
|
||||
tier: frontend
|
||||
|
@@ -0,0 +1,15 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: prometheus-operator
|
||||
labels:
|
||||
k8s-app: prometheus-operator
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: http
|
||||
port: 8080
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
selector:
|
||||
k8s-app: prometheus-operator
|
@@ -3,25 +3,28 @@ kind: Deployment
|
||||
metadata:
|
||||
name: prometheus-operator
|
||||
labels:
|
||||
operator: prometheus
|
||||
k8s-app: prometheus-operator
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
operator: prometheus
|
||||
k8s-app: prometheus-operator
|
||||
spec:
|
||||
serviceAccountName: prometheus-operator
|
||||
containers:
|
||||
- name: prometheus-operator
|
||||
image: quay.io/coreos/prometheus-operator:v0.9.1
|
||||
args:
|
||||
- "--kubelet-service=kube-system/kubelet"
|
||||
- "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 50Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 300Mi
|
||||
- name: prometheus-operator
|
||||
image: quay.io/coreos/prometheus-operator:v0.9.1
|
||||
args:
|
||||
- "--kubelet-service=kube-system/kubelet"
|
||||
- "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 50Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 300Mi
|
||||
|
@@ -6,76 +6,260 @@ metadata:
|
||||
role: prometheus-rulefiles
|
||||
prometheus: k8s
|
||||
data:
|
||||
etcd2.rules: |+
|
||||
### General cluster availability ###
|
||||
|
||||
# alert if another failed peer will result in an unavailable cluster
|
||||
ALERT InsufficientPeers
|
||||
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Etcd cluster small",
|
||||
description = "If one more etcd peer goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
### HTTP requests alerts ###
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
|
||||
alertmanager.rules: |+
|
||||
ALERT AlertmanagerConfigInconsistent
|
||||
IF count_values by (service) ("config_hash", alertmanager_config_hash)
|
||||
/ on(service) group_left
|
||||
label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
summary = "Alertmanager configurations are inconsistent",
|
||||
description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
|
||||
}
|
||||
|
||||
# alert if 50% of requests get a 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
|
||||
FOR 10m
|
||||
ALERT AlertmanagerDownOrMissing
|
||||
IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
|
||||
/ on(job) group_right
|
||||
sum by(job) (up) != 1
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
|
||||
summary = "Alertmanager down or not discovered",
|
||||
description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
|
||||
ALERT HTTPRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
|
||||
ALERT FailedReload
|
||||
IF alertmanager_config_last_reload_successful == 0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow HTTP requests",
|
||||
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
|
||||
summary = "Alertmanager configuration reload has failed",
|
||||
description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
|
||||
}
|
||||
etcd3.rules: |+
|
||||
# general cluster availability
|
||||
|
||||
# alert if another failed member will result in an unavailable cluster
|
||||
ALERT InsufficientMembers
|
||||
IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd cluster insufficient members",
|
||||
description = "If one more etcd member goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
# etcd leader alerts
|
||||
# ==================
|
||||
|
||||
# alert if any etcd instance has no leader
|
||||
ALERT NoLeader
|
||||
IF etcd_server_has_leader{job="etcd"} == 0
|
||||
FOR 1m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member has no leader",
|
||||
description = "etcd member {{ $labels.instance }} has no leader",
|
||||
}
|
||||
|
||||
# alert if there are lots of leader changes
|
||||
ALERT HighNumberOfLeaderChanges
|
||||
IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of leader changes within the etcd cluster are happening",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
|
||||
}
|
||||
|
||||
# gRPC request alerts
|
||||
# ===================
|
||||
|
||||
# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of gRPC method calls take more than 150ms
|
||||
ALERT GRPCRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow gRPC requests",
|
||||
description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow",
|
||||
}
|
||||
|
||||
# HTTP requests alerts
|
||||
# ====================
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
|
||||
ALERT HTTPRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow HTTP requests",
|
||||
description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
|
||||
}
|
||||
|
||||
# etcd member communication alerts
|
||||
# ================================
|
||||
|
||||
# alert if 99th percentile of round trips take 150ms
|
||||
ALERT EtcdMemberCommunicationSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member communication is slow",
|
||||
description = "etcd instance {{ $labels.instance }} member communication with {{ $label.To }} is slow",
|
||||
}
|
||||
|
||||
# etcd proposal alerts
|
||||
# ====================
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of proposals within the etcd cluster are failing",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
}
|
||||
|
||||
# etcd disk io latency alerts
|
||||
# ===========================
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "etcd instance {{ $labels.instance }} fync durations are high",
|
||||
}
|
||||
|
||||
# alert if 99th percentile of commit durations is higher than 250ms
|
||||
ALERT HighCommitDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high commit durations",
|
||||
description = "etcd instance {{ $labels.instance }} commit durations are high",
|
||||
}
|
||||
general.rules: |+
|
||||
### Up Alerting ###
|
||||
|
||||
ALERT TargetDown
|
||||
IF 100 * (count(up == 0) / count(up)) > 3
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Targets are down",
|
||||
description = "More than {{ $value }}% of targets are down."
|
||||
}
|
||||
|
||||
### Dead man's switch ###
|
||||
|
||||
ALERT DeadMansSwitch
|
||||
IF vector(1)
|
||||
LABELS {
|
||||
severity = "none",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Alerting DeadMansSwitch",
|
||||
description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
|
||||
}
|
||||
|
||||
### File descriptor alerts ###
|
||||
|
||||
ALERT TooManyOpenFileDescriptors
|
||||
IF 100 * (process_open_fds / process_max_fds) > 95
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "too many open file descriptors",
|
||||
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
instance:fd_utilization = process_open_fds / process_max_fds
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next 4 hours
|
||||
@@ -87,7 +271,7 @@ data:
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
|
||||
}
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next hour
|
||||
@@ -99,34 +283,108 @@ data:
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
|
||||
}
|
||||
|
||||
### etcd proposal alerts ###
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
|
||||
kube-apiserver.rules: |+
|
||||
ALERT K8SApiserverDown
|
||||
IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of failed proposals within the etcd cluster are happening",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
summary = "API server unreachable",
|
||||
description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
### etcd disk io latency alerts ###
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
|
||||
# Some verbs excluded because they are expected to be long-lasting:
|
||||
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
|
||||
#
|
||||
# apiserver_request_latencies' unit is microseconds
|
||||
ALERT K8SApiServerLatency
|
||||
IF histogram_quantile(
|
||||
0.99,
|
||||
sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
|
||||
) / 1e6 > 1.0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "ectd instance {{ $labels.instance }} fync durations are high",
|
||||
summary = "Kubernetes apiserver latency is high",
|
||||
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
|
||||
}
|
||||
kube-controller-manager.rules: |+
|
||||
ALERT K8SControllerManagerDown
|
||||
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Controller manager is down",
|
||||
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
|
||||
}
|
||||
kubelet.rules: |+
|
||||
ALERT K8SNodeNotReady
|
||||
IF kube_node_status_ready{condition="true"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node status is NotReady",
|
||||
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
|
||||
}
|
||||
|
||||
ALERT K8SManyNodesNotReady
|
||||
IF
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
|
||||
AND
|
||||
(
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0)
|
||||
/
|
||||
count by (cluster) (kube_node_status_ready{condition="true"})
|
||||
) > 0.2
|
||||
FOR 1m
|
||||
LABELS {
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many K8s nodes are Not Ready",
|
||||
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletDown
|
||||
IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03
|
||||
FOR 1h
|
||||
LABELS {
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many Kubelets cannot be scraped",
|
||||
description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletDown
|
||||
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
|
||||
FOR 1h
|
||||
LABELS {
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many Kubelets cannot be scraped",
|
||||
description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletTooManyPods
|
||||
IF kubelet_running_pod_count > 100
|
||||
LABELS {
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet is close to pod limit",
|
||||
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
|
||||
}
|
||||
kubernetes.rules: |+
|
||||
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
|
||||
@@ -300,220 +558,36 @@ data:
|
||||
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
|
||||
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
|
||||
ALERT K8SNodeDown
|
||||
IF up{job="kubelet"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
|
||||
}
|
||||
|
||||
ALERT K8SNodeNotReady
|
||||
IF kube_node_status_ready{condition="true"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node status is NotReady",
|
||||
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
|
||||
}
|
||||
|
||||
ALERT K8SManyNodesNotReady
|
||||
IF
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
|
||||
AND
|
||||
(
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0)
|
||||
/
|
||||
count by (cluster) (kube_node_status_ready{condition="true"})
|
||||
) > 0.2
|
||||
FOR 1m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many K8s nodes are Not Ready",
|
||||
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletNodeExporterDown
|
||||
IF up{job="node-exporter"} == 0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet node_exporter cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletDown
|
||||
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many Kubelets cannot be scraped",
|
||||
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
ALERT K8SApiserverDown
|
||||
IF up{job="kubernetes"} == 0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "API server unreachable",
|
||||
description = "An API server could not be scraped.",
|
||||
}
|
||||
|
||||
# Disable for non HA kubernetes setups.
|
||||
ALERT K8SApiserverDown
|
||||
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "API server unreachable",
|
||||
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
kube-scheduler.rules: |+
|
||||
ALERT K8SSchedulerDown
|
||||
IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Scheduler is down",
|
||||
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
|
||||
}
|
||||
|
||||
ALERT K8SControllerManagerDown
|
||||
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Controller manager is down",
|
||||
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
|
||||
}
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
|
||||
node.rules: |+
|
||||
ALERT NodeExporterDown
|
||||
IF up{job="node-exporter"} == 0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
summary = "node-exporter cannot be scraped",
|
||||
description = "Prometheus could not scrape a node-exporter for more than 10m.",
|
||||
}
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
}
|
||||
|
||||
# To catch the conntrack sysctl de-tuning when it happens
|
||||
ALERT K8SConntrackTuningMissing
|
||||
IF node_nf_conntrack_udp_timeout > 10
|
||||
prometheus.rules: |+
|
||||
ALERT FailedReload
|
||||
IF prometheus_config_last_reload_successful == 0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node does not have the correct conntrack tunings",
|
||||
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
summary = "Prometheus configuration reload has failed",
|
||||
description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
# Some verbs excluded because they are expected to be long-lasting:
|
||||
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
|
||||
ALERT K8SApiServerLatency
|
||||
IF histogram_quantile(
|
||||
0.99,
|
||||
sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
|
||||
) / 1e6 > 1.0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubernetes apiserver latency is high",
|
||||
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
|
||||
}
|
||||
|
||||
ALERT K8SApiServerEtcdAccessLatency
|
||||
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Access to etcd is slow",
|
||||
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletTooManyPods
|
||||
IF kubelet_running_pod_count > 100
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet is close to pod limit",
|
||||
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
|
||||
}
|
||||
|
||||
|
@@ -1,12 +1,16 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
labels:
|
||||
alertmanager: main
|
||||
name: alertmanager
|
||||
labels:
|
||||
app: alertmanager
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
alertmanager: main
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- monitoring
|
||||
endpoints:
|
||||
- port: web
|
||||
selector:
|
||||
matchExpressions:
|
||||
- {key: alertmanager, operator: In, values: [main]}
|
||||
interval: 30s
|
||||
|
@@ -3,9 +3,9 @@ kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kube-apiserver
|
||||
labels:
|
||||
k8s-apps: https
|
||||
k8s-app: apiserver
|
||||
spec:
|
||||
jobLabel: provider
|
||||
jobLabel: component
|
||||
selector:
|
||||
matchLabels:
|
||||
component: apiserver
|
||||
@@ -15,7 +15,7 @@ spec:
|
||||
- default
|
||||
endpoints:
|
||||
- port: https
|
||||
interval: 15s
|
||||
interval: 30s
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
|
@@ -1,23 +0,0 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: k8s-apps-http
|
||||
namespace: monitoring
|
||||
labels:
|
||||
k8s-apps: http
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
selector:
|
||||
matchExpressions:
|
||||
- {key: k8s-app, operator: Exists}
|
||||
- {key: k8s-app, operator: NotIn, values: [kubelet]}
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 15s
|
||||
- port: http-metrics-dnsmasq
|
||||
interval: 15s
|
||||
- port: http-metrics-skydns
|
||||
interval: 15s
|
@@ -0,0 +1,17 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kube-controller-manager
|
||||
labels:
|
||||
k8s-app: kube-controller-manager
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 30s
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: kube-controller-manager
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
@@ -0,0 +1,17 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kube-scheduler
|
||||
labels:
|
||||
k8s-app: kube-scheduler
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 30s
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: kube-scheduler
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
@@ -2,9 +2,8 @@ apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
labels:
|
||||
k8s-apps: http
|
||||
k8s-app: kube-state-metrics
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
selector:
|
||||
@@ -15,5 +14,5 @@ spec:
|
||||
- monitoring
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 15s
|
||||
interval: 30s
|
||||
honorLabels: true
|
||||
|
@@ -3,16 +3,16 @@ kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kubelet
|
||||
labels:
|
||||
k8s-apps: http
|
||||
k8s-app: kubelet
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 30s
|
||||
honorLabels: true
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: kubelet
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 15s
|
||||
honorLabels: true
|
||||
|
@@ -2,9 +2,8 @@ apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: node-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
k8s-apps: http
|
||||
k8s-app: node-exporter
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
selector:
|
||||
@@ -15,4 +14,4 @@ spec:
|
||||
- monitoring
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 15s
|
||||
interval: 30s
|
||||
|
@@ -0,0 +1,12 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: prometheus-operator
|
||||
labels:
|
||||
k8s-app: prometheus-operator
|
||||
spec:
|
||||
endpoints:
|
||||
- port: http
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: prometheus-operator
|
@@ -3,10 +3,14 @@ kind: ServiceMonitor
|
||||
metadata:
|
||||
name: prometheus
|
||||
labels:
|
||||
prometheus: k8s
|
||||
app: prometheus
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
prometheus: k8s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- monitoring
|
||||
endpoints:
|
||||
- port: web
|
||||
selector:
|
||||
matchExpressions:
|
||||
- {key: prometheus, operator: In, values: [k8s]}
|
||||
interval: 30s
|
||||
|
@@ -6,7 +6,7 @@ metadata:
|
||||
prometheus: k8s
|
||||
spec:
|
||||
replicas: 2
|
||||
version: v1.6.3
|
||||
version: v1.7.0
|
||||
serviceAccountName: prometheus-k8s
|
||||
serviceMonitorSelector:
|
||||
matchExpression: