diff --git a/assets/alertmanager/alertmanager.yaml b/assets/alertmanager/alertmanager.yaml index f08a2106..6b5789b5 100644 --- a/assets/alertmanager/alertmanager.yaml +++ b/assets/alertmanager/alertmanager.yaml @@ -5,8 +5,10 @@ route: group_wait: 30s group_interval: 5m repeat_interval: 12h - receiver: 'webhook' + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' receivers: -- name: 'webhook' - webhook_configs: - - url: 'http://alertmanagerwh:30500/' +- name: 'null' diff --git a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules new file mode 100644 index 00000000..71bdc687 --- /dev/null +++ b/assets/prometheus/rules/alertmanager.rules @@ -0,0 +1,36 @@ +ALERT AlertmanagerConfigInconsistent + IF count_values by (service) ("config_hash", alertmanager_config_hash) + / on(service) group_left + label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Alertmanager configurations are inconsistent", + description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." + } + +ALERT AlertmanagerDownOrMissing + IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") + / on(job) group_right + sum by(job) (up) != 1 + FOR 5m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Alertmanager down or not discovered", + description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." + } + +ALERT FailedReload + IF alertmanager_config_last_reload_successful == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Alertmanager configuration reload has failed", + description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
+ } diff --git a/assets/prometheus/rules/etcd2.rules b/assets/prometheus/rules/etcd2.rules deleted file mode 100644 index 10fa5e8d..00000000 --- a/assets/prometheus/rules/etcd2.rules +++ /dev/null @@ -1,121 +0,0 @@ -### General cluster availability ### - -# alert if another failed peer will result in an unavailable cluster -ALERT InsufficientPeers - IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Etcd cluster small", - description = "If one more etcd peer goes down the cluster will be unavailable", - } - -### HTTP requests alerts ### - -# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - -# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - -# alert if 50% of requests get a 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", - } - -# alert if the 99th percentile of HTTP requests take more than 150ms -ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "slow HTTP requests", - description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", - } - -### File descriptor alerts ### - -instance:fd_utilization = process_open_fds / process_max_fds - -# alert if file descriptors are likely to exhaust within the next 4 hours -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", - } - -# alert if file descriptors are likely to exhaust within the next hour -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", - } - -### etcd proposal alerts ### - -# alert 
if there are several failed proposals within an hour -ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of failed proposals within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", - } - -### etcd disk io latency alerts ### - -# alert if 99th percentile of fsync durations is higher than 500ms -ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high fsync durations", - description = "ectd instance {{ $labels.instance }} fync durations are high", - } diff --git a/assets/prometheus/rules/etcd3.rules b/assets/prometheus/rules/etcd3.rules new file mode 100644 index 00000000..a3b2cddd --- /dev/null +++ b/assets/prometheus/rules/etcd3.rules @@ -0,0 +1,177 @@ +# general cluster availability + +# alert if another failed member will result in an unavailable cluster +ALERT InsufficientMembers +IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) +FOR 3m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "etcd cluster insufficient members", + description = "If one more etcd member goes down the cluster will be unavailable", +} + +# etcd leader alerts +# ================== + +# alert if any etcd instance has no leader +ALERT NoLeader +IF etcd_server_has_leader{job="etcd"} == 0 +FOR 1m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "etcd member has no leader", + description = "etcd member {{ $labels.instance }} has no leader", +} + +# alert if there are lots of leader changes +ALERT HighNumberOfLeaderChanges +IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of leader changes within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", +} + +# gRPC request alerts +# =================== + +# alert if more than 1% of gRPC method calls have failed within the last 5 minutes +ALERT HighNumberOfFailedGRPCRequests +IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if more than 5% of gRPC method calls have failed within the last 5 minutes +ALERT HighNumberOfFailedGRPCRequests +IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 +FOR 5m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if the 99th percentile of gRPC method calls take more than 150ms +ALERT GRPCRequestsSlow +IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 +FOR 10m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "slow gRPC requests", + description = "on 
etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
+}
+
+# HTTP requests alerts
+# ====================
+
+# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
+ALERT HighNumberOfFailedHTTPRequests
+IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+  / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of HTTP requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
+ALERT HighNumberOfFailedHTTPRequests
+IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+  / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
+FOR 5m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "a high number of HTTP requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if the 99th percentile of HTTP requests take more than 150ms
+ALERT HTTPRequestsSlow
+IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "slow HTTP requests",
+  description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
+}
+
+# etcd member communication alerts
+# ================================
+
+# alert if 99th percentile of round trips take more than 150ms
+ALERT EtcdMemberCommunicationSlow
+IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "etcd member communication is slow",
+  description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
+}
+
+# etcd proposal alerts
+# ====================
+
+# alert if there are several failed proposals within an hour
+ALERT HighNumberOfFailedProposals
+IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of proposals within the etcd cluster are failing",
+  description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
+}
+
+# etcd disk io latency alerts
+# ===========================
+
+# alert if 99th percentile of fsync durations is higher than 500ms
+ALERT HighFsyncDurations
+IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "high fsync durations",
+  description = "etcd instance {{ $labels.instance }} fsync durations are high",
+}
+
+# alert if 99th percentile of commit durations is higher than 250ms
+ALERT HighCommitDurations
+IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "high commit durations",
+  description = "etcd instance {{ $labels.instance }} commit durations are high",
+}
diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules
new file mode 100644
index 00000000..9e26ab9a
--- /dev/null
+++ b/assets/prometheus/rules/general.rules
@@ -0,0 +1,63 @@
+### Up Alerting ###
+
+ALERT TargetDown
+  IF 100 * (count(up == 0) / count(up)) > 3
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Targets are down",
+    description = "More than {{ $value }}% of targets are down."
+  }
+
+### Dead man's switch ###
+
+ALERT DeadMansSwitch
+  IF vector(1)
+  LABELS {
+    severity = "none",
+  }
+  ANNOTATIONS {
+    summary = "Alerting DeadMansSwitch",
+    description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
+  }
+
+### File descriptor alerts ###
+
+ALERT TooManyOpenFileDescriptors
+  IF 100 * (process_open_fds / process_max_fds) > 95
+  FOR 10m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "too many open file descriptors",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
+  }
+
+instance:fd_utilization = process_open_fds / process_max_fds
+
+# alert if file descriptors are likely to exhaust within the next 4 hours
+ALERT FdExhaustionClose
+  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "file descriptors soon exhausted",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
+  }
+
+# alert if file descriptors are likely to exhaust within the next hour
+ALERT FdExhaustionClose
+  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
+  FOR 10m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "file descriptors soon exhausted",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
+  }
diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules
new file mode 100644
index 00000000..c041881a
--- /dev/null
+++ b/assets/prometheus/rules/kube-apiserver.rules
@@ -0,0 +1,28 @@
+ALERT K8SApiserverDown
+  IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
+  FOR 5m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "API server unreachable",
+    description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
+  }
+
+# Some verbs excluded because they are expected to be long-lasting:
+# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
+# +# apiserver_request_latencies' unit is microseconds +ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules new file mode 100644 index 00000000..f75e2768 --- /dev/null +++ b/assets/prometheus/rules/kube-controller-manager.rules @@ -0,0 +1,10 @@ +ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + } diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules new file mode 100644 index 00000000..6eff4bcd --- /dev/null +++ b/assets/prometheus/rules/kube-scheduler.rules @@ -0,0 +1,10 @@ +ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + } diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules new file mode 100644 index 00000000..cbcd576c --- /dev/null +++ b/assets/prometheus/rules/kubelet.rules @@ -0,0 +1,60 @@ +ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + +ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + +ALERT K8SKubeletDown + IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets.", + } + +ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", + } + +ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = 
"Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + } diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules index 157eb3fa..084d11e5 100644 --- a/assets/prometheus/rules/kubernetes.rules +++ b/assets/prometheus/rules/kubernetes.rules @@ -169,220 +169,3 @@ cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - -ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - -ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - -ALERT K8SManyNodesNotReady - IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 - AND - ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) - / - count by (cluster) (kube_node_status_ready{condition="true"}) - ) > 0.2 - FOR 1m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", - } - -ALERT K8SKubeletNodeExporterDown - IF up{job="node-exporter"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet node_exporter cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", - } - -ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", - } - -ALERT K8SApiserverDown - IF up{job="kubernetes"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - -# Disable for non HA kubernetes setups. -ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) - FOR 5m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", - } - -ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. 
New pods are not being assigned to nodes.", - } - -ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", - } - -ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - -ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - -# To catch the conntrack sysctl de-tuning when it happens -ALERT K8SConntrackTuningMissing - IF node_nf_conntrack_udp_timeout > 10 - FOR 10m - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - -ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - -ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 - FOR 10m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - -# Some verbs excluded because they are expected to be long-lasting: -# WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
-ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - -ALERT K8SApiServerEtcdAccessLatency - IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Access to etcd is slow", - description = "99th percentile latency for apiserver to access etcd is higher than 1s.", - } - -ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } - diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules new file mode 100644 index 00000000..8fd5b7d0 --- /dev/null +++ b/assets/prometheus/rules/node.rules @@ -0,0 +1,10 @@ +ALERT NodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "node-exporter cannot be scraped", + description = "Prometheus could not scrape a node-exporter for more than 10m.", + } diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules new file mode 100644 index 00000000..05c278f1 --- /dev/null +++ b/assets/prometheus/rules/prometheus.rules @@ -0,0 +1,10 @@ +ALERT FailedReload + IF prometheus_config_last_reload_successful == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Prometheus configuration reload has failed", + description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
+ } diff --git a/manifests/alertmanager/alertmanager-config.yaml b/manifests/alertmanager/alertmanager-config.yaml index eee36b33..62d39016 100644 --- a/manifests/alertmanager/alertmanager-config.yaml +++ b/manifests/alertmanager/alertmanager-config.yaml @@ -3,4 +3,4 @@ kind: Secret metadata: name: alertmanager-main data: - alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg== + alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index 84a3238a..c092d8e2 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -6,7 +6,7 @@ metadata: labels: prometheus: frontend spec: - version: v1.6.3 + version: v1.7.0 serviceMonitorSelector: matchLabels: tier: frontend diff --git a/manifests/prometheus-operator/prometheus-operator-service.yaml b/manifests/prometheus-operator/prometheus-operator-service.yaml new file mode 100644 index 00000000..8882d4a7 --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-operator + labels: + k8s-app: prometheus-operator +spec: + type: ClusterIP + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP + selector: + k8s-app: prometheus-operator diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 573aaf29..97b1cafb 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -3,25 +3,28 @@ kind: Deployment metadata: name: prometheus-operator labels: - operator: prometheus + k8s-app: prometheus-operator spec: replicas: 1 template: metadata: labels: - operator: prometheus + k8s-app: prometheus-operator spec: serviceAccountName: prometheus-operator containers: - - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.9.1 - args: - - "--kubelet-service=kube-system/kubelet" - - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" - resources: - requests: - cpu: 100m - memory: 50Mi - limits: - cpu: 200m - memory: 300Mi + - name: prometheus-operator + image: quay.io/coreos/prometheus-operator:v0.9.1 + args: + - "--kubelet-service=kube-system/kubelet" + - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" + ports: + - name: http + containerPort: 8080 + resources: + requests: + cpu: 100m + memory: 50Mi + limits: + cpu: 200m + memory: 300Mi diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 71f72da1..cb062db1 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -6,76 +6,260 @@ metadata: role: prometheus-rulefiles prometheus: k8s data: - etcd2.rules: |+ - ### General cluster availability ### - - # 
alert if another failed peer will result in an unavailable cluster - ALERT InsufficientPeers - IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Etcd cluster small", - description = "If one more etcd peer goes down the cluster will be unavailable", - } - - ### HTTP requests alerts ### - - # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 + alertmanager.rules: |+ + ALERT AlertmanagerConfigInconsistent + IF count_values by (service) ("config_hash", alertmanager_config_hash) + / on(service) group_left + label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 FOR 5m LABELS { severity = "critical" } ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + summary = "Alertmanager configurations are inconsistent", + description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." } - # alert if 50% of requests get a 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 - FOR 10m + ALERT AlertmanagerDownOrMissing + IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") + / on(job) group_right + sum by(job) (up) != 1 + FOR 5m LABELS { - severity = "critical" + severity = "warning" } ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", + summary = "Alertmanager down or not discovered", + description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." } - # alert if the 99th percentile of HTTP requests take more than 150ms - ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 + ALERT FailedReload + IF alertmanager_config_last_reload_successful == 0 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { - summary = "slow HTTP requests", - description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", + summary = "Alertmanager configuration reload has failed", + description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
+      }
+  etcd3.rules: |+
+    # general cluster availability
+
+    # alert if another failed member will result in an unavailable cluster
+    ALERT InsufficientMembers
+    IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
+    FOR 3m
+    LABELS {
+      severity = "critical"
+    }
+    ANNOTATIONS {
+      summary = "etcd cluster insufficient members",
+      description = "If one more etcd member goes down the cluster will be unavailable",
+    }
+
+    # etcd leader alerts
+    # ==================
+
+    # alert if any etcd instance has no leader
+    ALERT NoLeader
+    IF etcd_server_has_leader{job="etcd"} == 0
+    FOR 1m
+    LABELS {
+      severity = "critical"
+    }
+    ANNOTATIONS {
+      summary = "etcd member has no leader",
+      description = "etcd member {{ $labels.instance }} has no leader",
+    }
+
+    # alert if there are lots of leader changes
+    ALERT HighNumberOfLeaderChanges
+    IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
+    LABELS {
+      severity = "warning"
+    }
+    ANNOTATIONS {
+      summary = "a high number of leader changes within the etcd cluster are happening",
+      description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
+    }
+
+    # gRPC request alerts
+    # ===================
+
+    # alert if more than 1% of gRPC method calls have failed within the last 5 minutes
+    ALERT HighNumberOfFailedGRPCRequests
+    IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+      / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
+    FOR 10m
+    LABELS {
+      severity = "warning"
+    }
+    ANNOTATIONS {
+      summary = "a high number of gRPC requests are failing",
+      description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
+    }
+
+    # alert if more than 5% of gRPC method calls have failed within the last 5 minutes
+    ALERT HighNumberOfFailedGRPCRequests
+    IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+      / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
+    FOR 5m
+    LABELS {
+      severity = "critical"
+    }
+    ANNOTATIONS {
+      summary = "a high number of gRPC requests are failing",
+      description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
+    }
+
+    # alert if the 99th percentile of gRPC method calls take more than 150ms
+    ALERT GRPCRequestsSlow
+    IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
+    FOR 10m
+    LABELS {
+      severity = "critical"
+    }
+    ANNOTATIONS {
+      summary = "slow gRPC requests",
+      description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
+    }
+
+    # HTTP requests alerts
+    # ====================
+
+    # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
+    ALERT HighNumberOfFailedHTTPRequests
+    IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+      / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
+    FOR 10m
+    LABELS {
+      severity = "warning"
+    }
+    ANNOTATIONS {
+      summary = "a high number of HTTP requests are failing",
+      description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+    }
+
+    # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
+    ALERT HighNumberOfFailedHTTPRequests
+    IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+      / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
+    FOR 5m
+    LABELS {
+      severity = "critical"
+    }
+    ANNOTATIONS {
+      summary = "a high number of HTTP requests are failing",
+      description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+    }
+
+    # alert if the 99th percentile of HTTP requests take more than 150ms
+    ALERT HTTPRequestsSlow
+    IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
+    FOR 10m
+    LABELS {
+      severity = "warning"
+    }
+    ANNOTATIONS {
+      summary = "slow HTTP requests",
+      description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
+    }
+
+    # etcd member communication alerts
+    # ================================
+
+    # alert if 99th percentile of round trips take more than 150ms
+    ALERT EtcdMemberCommunicationSlow
+    IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
+    FOR 10m
+    LABELS {
+      severity = "warning"
+    }
+    ANNOTATIONS {
+      summary = "etcd member communication is slow",
+      description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
+    }
+
+    # etcd proposal alerts
+    # ====================
+
+    # alert if there are several failed proposals within an hour
+    ALERT HighNumberOfFailedProposals
+    IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+    LABELS {
+      severity = "warning"
+    }
+    ANNOTATIONS {
+      summary = "a high number of proposals within the etcd cluster are failing",
+      description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
+    }
+
+    # etcd disk io latency alerts
+    # ===========================
+
+    # alert if 99th percentile of fsync durations is higher than 500ms
+    ALERT HighFsyncDurations
+    IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
+    FOR 10m
+    LABELS {
+      severity = "warning"
+    }
+    ANNOTATIONS {
+      summary = "high fsync durations",
+      description = "etcd instance {{ $labels.instance }} fsync durations are high",
+    }
+
+    # alert if 99th percentile of commit durations is higher than 250ms
+    ALERT HighCommitDurations
+    IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
+    FOR 10m
+    LABELS {
+      severity = "warning"
+    }
+    ANNOTATIONS {
+      summary = "high commit durations",
+      description = "etcd instance {{ $labels.instance }} commit durations are high",
+    }
+  general.rules: |+
+    ### Up Alerting ###
+
+    ALERT TargetDown
+      IF 100 * (count(up == 0) / count(up)) > 3
+      FOR 10m
+      LABELS {
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "Targets are down",
+        description = "More than {{ $value }}% of targets are down."
+ } + + ### Dead man's switch ### + + ALERT DeadMansSwitch + IF vector(1) + LABELS { + severity = "none", + } + ANNOTATIONS { + summary = "Alerting DeadMansSwitch", + description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", } ### File descriptor alerts ### + ALERT TooManyOpenFileDescriptors + IF 100 * (process_open_fds / process_max_fds) > 95 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "too many open file descriptors", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", + } + instance:fd_utilization = process_open_fds / process_max_fds # alert if file descriptors are likely to exhaust within the next 4 hours @@ -87,7 +271,7 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", } # alert if file descriptors are likely to exhaust within the next hour @@ -99,34 +283,108 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", } - - ### etcd proposal alerts ### - - # alert if there are several failed proposals within an hour - ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 + kube-apiserver.rules: |+ + ALERT K8SApiserverDown + IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) + FOR 5m LABELS { - severity = "warning" + severity = "critical" } ANNOTATIONS { - summary = "a high number of failed proposals within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + summary = "API server unreachable", + description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", } - ### etcd disk io latency alerts ### - - # alert if 99th percentile of fsync durations is higher than 500ms - ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 + # Some verbs excluded because they are expected to be long-lasting: + # WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
+ # + # apiserver_request_latencies' unit is microseconds + ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) + ) / 1e6 > 1.0 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { - summary = "high fsync durations", - description = "ectd instance {{ $labels.instance }} fync durations are high", + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + kube-controller-manager.rules: |+ + ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + } + kubelet.rules: |+ + ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + + ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + + ALERT K8SKubeletDown + IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets.", + } + + ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", + } + + ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", } kubernetes.rules: |+ # NOTE: These rules were kindly contributed by the SoundCloud engineering team. 
@@ -300,220 +558,36 @@ data: histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - - ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - - ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - - ALERT K8SManyNodesNotReady - IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 - AND - ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) - / - count by (cluster) (kube_node_status_ready{condition="true"}) - ) > 0.2 - FOR 1m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", - } - - ALERT K8SKubeletNodeExporterDown - IF up{job="node-exporter"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet node_exporter cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", - } - - ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", - } - - ALERT K8SApiserverDown - IF up{job="kubernetes"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - - # Disable for non HA kubernetes setups. - ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) - FOR 5m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", - } - + kube-scheduler.rules: |+ ALERT K8SSchedulerDown IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) FOR 5m LABELS { - service = "k8s", severity = "critical", } ANNOTATIONS { summary = "Scheduler is down", description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", } - - ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - } - - ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + node.rules: |+ + ALERT NodeExporterDown + IF up{job="node-exporter"} == 0 FOR 10m LABELS { - service = "k8s", severity = "warning" } ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", + summary = "node-exporter cannot be scraped", + description = "Prometheus could not scrape a node-exporter for more than 10m.", } - - ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - - # To catch the conntrack sysctl de-tuning when it happens - ALERT K8SConntrackTuningMissing - IF node_nf_conntrack_udp_timeout > 10 + prometheus.rules: |+ + ALERT FailedReload + IF prometheus_config_last_reload_successful == 0 FOR 10m LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - - ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 - FOR 10m - LABELS { - service = "k8s", severity = "warning" } ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + summary = "Prometheus configuration reload has failed", + description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." } - - ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 - FOR 10m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - - # Some verbs excluded because they are expected to be long-lasting: - # WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
- ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - - ALERT K8SApiServerEtcdAccessLatency - IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Access to etcd is slow", - description = "99th percentile latency for apiserver to access etcd is higher than 1s.", - } - - ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } - diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml index d193b676..29d68c82 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml @@ -1,12 +1,16 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: ServiceMonitor metadata: - labels: - alertmanager: main name: alertmanager + labels: + app: alertmanager spec: + selector: + matchLabels: + alertmanager: main + namespaceSelector: + matchNames: + - monitoring endpoints: - port: web - selector: - matchExpressions: - - {key: alertmanager, operator: In, values: [main]} + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml index 1fd793e5..09a87c2e 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml @@ -3,9 +3,9 @@ kind: ServiceMonitor metadata: name: kube-apiserver labels: - k8s-apps: https + k8s-app: apiserver spec: - jobLabel: provider + jobLabel: component selector: matchLabels: component: apiserver @@ -15,7 +15,7 @@ spec: - default endpoints: - port: https - interval: 15s + interval: 30s scheme: https tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml deleted file mode 100644 index fbfcda97..00000000 --- a/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ServiceMonitor -metadata: - name: k8s-apps-http - namespace: monitoring - labels: - k8s-apps: http -spec: - jobLabel: k8s-app - selector: - matchExpressions: - - {key: k8s-app, operator: Exists} - - {key: k8s-app, operator: NotIn, values: [kubelet]} - namespaceSelector: - matchNames: - - kube-system - endpoints: - - port: http-metrics - interval: 15s - - port: http-metrics-dnsmasq - interval: 15s - - port: http-metrics-skydns - interval: 15s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml new file mode 100644 index 00000000..eef95a84 --- 
/dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-controller-manager + labels: + k8s-app: kube-controller-manager +spec: + jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + selector: + matchLabels: + k8s-app: kube-controller-manager + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml new file mode 100644 index 00000000..663f8cfb --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-scheduler + labels: + k8s-app: kube-scheduler +spec: + jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + selector: + matchLabels: + k8s-app: kube-scheduler + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml index c4ed1afc..a276702a 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml @@ -2,9 +2,8 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: ServiceMonitor metadata: name: kube-state-metrics - namespace: monitoring labels: - k8s-apps: http + k8s-app: kube-state-metrics spec: jobLabel: k8s-app selector: @@ -15,5 +14,5 @@ spec: - monitoring endpoints: - port: http-metrics - interval: 15s + interval: 30s honorLabels: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml index 5729d8f0..cdc3ffb6 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -3,16 +3,16 @@ kind: ServiceMonitor metadata: name: kubelet labels: - k8s-apps: http + k8s-app: kubelet spec: jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + honorLabels: true selector: matchLabels: k8s-app: kubelet namespaceSelector: matchNames: - kube-system - endpoints: - - port: http-metrics - interval: 15s - honorLabels: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml index a7b20301..b68ed89f 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml @@ -2,9 +2,8 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: ServiceMonitor metadata: name: node-exporter - namespace: monitoring labels: - k8s-apps: http + k8s-app: node-exporter spec: jobLabel: k8s-app selector: @@ -15,4 +14,4 @@ spec: - monitoring endpoints: - port: http-metrics - interval: 15s + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml new file mode 100644 index 00000000..23c04073 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml @@ -0,0 +1,12 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + 
name: prometheus-operator + labels: + k8s-app: prometheus-operator +spec: + endpoints: + - port: http + selector: + matchLabels: + k8s-app: prometheus-operator diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml index 5e5d17be..be74cd6d 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml @@ -3,10 +3,14 @@ kind: ServiceMonitor metadata: name: prometheus labels: - prometheus: k8s + app: prometheus spec: + selector: + matchLabels: + prometheus: k8s + namespaceSelector: + matchNames: + - monitoring endpoints: - port: web - selector: - matchExpressions: - - {key: prometheus, operator: In, values: [k8s]} + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index b7060ba6..63e9c3f7 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.6.3 + version: v1.7.0 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpression: