apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
  alertmanager.rules: |+
    ALERT AlertmanagerConfigInconsistent
      IF count_values by (service) ("config_hash", alertmanager_config_hash)
           / on(service) group_left
         label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "Alertmanager configurations are inconsistent",
        description = "The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
      }

    ALERT AlertmanagerDownOrMissing
      IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
           / on(job) group_right
         sum by(job) (up) != 1
      FOR 5m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Alertmanager down or not discovered",
        description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
      }

    ALERT FailedReload
      IF alertmanager_config_last_reload_successful == 0
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Alertmanager configuration reload has failed",
        description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}."
      }
  etcd3.rules: |+
    # general cluster availability

    # alert if another failed member will result in an unavailable cluster
    ALERT InsufficientMembers
      IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
      FOR 3m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "etcd cluster insufficient members",
        description = "If one more etcd member goes down the cluster will be unavailable",
      }

    # etcd leader alerts
    # ==================

    # alert if any etcd instance has no leader
    ALERT NoLeader
      IF etcd_server_has_leader{job="etcd"} == 0
      FOR 1m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "etcd member has no leader",
        description = "etcd member {{ $labels.instance }} has no leader",
      }

    # alert if there are lots of leader changes
    ALERT HighNumberOfLeaderChanges
      IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of leader changes within the etcd cluster are happening",
        description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
      }

    # gRPC request alerts
    # ===================

    # alert if more than 1% of gRPC method calls have failed within the last 5 minutes
    ALERT HighNumberOfFailedGRPCRequests
      IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
           / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of gRPC requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if more than 5% of gRPC method calls have failed within the last 5 minutes
    ALERT HighNumberOfFailedGRPCRequests
      IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
           / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "a high number of gRPC requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if the 99th percentile of gRPC method calls take more than 150ms
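    # histogram_quantile(0.99, rate(<histogram>_bucket[5m])) estimates the 99th
    # percentile from the bucket counters over the last 5 minutes; the metric is
    # measured in seconds, so the 0.15 threshold below corresponds to 150ms.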
    ALERT GRPCRequestsSlow
      IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "slow gRPC requests",
        description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
      }

    # HTTP request alerts
    # ====================

    # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
           / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
           / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if the 99th percentile of HTTP requests take more than 150ms
    ALERT HTTPRequestsSlow
      IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "slow HTTP requests",
        description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
      }

    # etcd member communication alerts
    # ================================

    # alert if the 99th percentile of round trips takes more than 150ms
    ALERT EtcdMemberCommunicationSlow
      IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "etcd member communication is slow",
        description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
      }

    # etcd proposal alerts
    # ====================

    # alert if there are several failed proposals within an hour
    ALERT HighNumberOfFailedProposals
      IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of proposals within the etcd cluster are failing",
        description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
      }

    # etcd disk io latency alerts
    # ===========================

    # alert if the 99th percentile of fsync durations is higher than 500ms
    ALERT HighFsyncDurations
      IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "high fsync durations",
        description = "etcd instance {{ $labels.instance }} fsync durations are high",
      }

    # alert if the 99th percentile of commit durations is higher than 250ms
    ALERT HighCommitDurations
      IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "high commit durations",
        description = "etcd instance {{ $labels.instance }} commit durations are high",
      }
  general.rules: |+
    ### Up Alerting ###

    ALERT TargetDown
      IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Targets are down",
        description = "{{ $value }}% or more of {{ $labels.job }} targets are down."
      }

    ### Dead man's switch ###

    ALERT DeadMansSwitch
      IF vector(1)
      LABELS {
        severity = "none",
      }
      ANNOTATIONS {
        summary = "Alerting DeadMansSwitch",
        description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
      }

    ### File descriptor alerts ###

    ALERT TooManyOpenFileDescriptors
      IF 100 * (process_open_fds / process_max_fds) > 95
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "too many open file descriptors",
        description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
      }

    instance:fd_utilization = process_open_fds / process_max_fds

    # alert if file descriptors are likely to exhaust within the next 4 hours
    ALERT FdExhaustionClose
      IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "file descriptors soon exhausted",
        description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its available file/socket descriptors soon.",
      }

    # alert if file descriptors are likely to exhaust within the next hour
    ALERT FdExhaustionClose
      IF predict_linear(instance:fd_utilization[10m], 3600) > 1
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "file descriptors soon exhausted",
        description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its available file/socket descriptors soon.",
      }
  kube-apiserver.rules: |+
    ALERT K8SApiserverDown
      IF absent(up{job="apiserver"} == 1)
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "API server unreachable",
        description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
      }

    # Some verbs excluded because they are expected to be long-lasting:
    # WATCHLIST is long-poll, CONNECT is `kubectl exec`.
    #
    # apiserver_request_latencies' unit is microseconds
    ALERT K8SApiServerLatency
      IF histogram_quantile(
           0.99,
           sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
         ) / 1e6 > 1.0
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Kubernetes apiserver latency is high",
        description = "99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
      }
  kube-controller-manager.rules: |+
    ALERT K8SControllerManagerDown
      IF absent(up{job="kube-controller-manager"} == 1)
      FOR 5m
      LABELS {
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Controller manager is down",
        description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
        runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager",
      }
  kubelet.rules: |+
    ALERT K8SNodeNotReady
      IF kube_node_status_condition{condition="Ready", status="true"} == 0
      FOR 1h
      LABELS {
        severity = "warning",
      }
      ANNOTATIONS {
        summary = "Node status is NotReady",
        description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
      }

    ALERT K8SManyNodesNotReady
      IF count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1
         AND
           (
             count(kube_node_status_condition{condition="Ready", status="true"} == 0)
               / count(kube_node_status_condition{condition="Ready", status="true"})
           ) > 0.2
      FOR 1m
      LABELS {
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Many Kubernetes nodes are Not Ready",
        description = "{{ $value }} Kubernetes nodes (more than 20% of the cluster) are in the NotReady state.",
      }

    ALERT K8SKubeletDown
      IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
      FOR 1h
      LABELS {
        severity = "warning",
      }
      ANNOTATIONS {
        summary = "Many Kubelets cannot be scraped",
        description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
      }

    ALERT K8SKubeletDown
      IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
      FOR 1h
      LABELS {
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Many Kubelets cannot be scraped",
        description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
      }

    ALERT K8SKubeletTooManyPods
      IF kubelet_running_pod_count > 100
      LABELS {
        severity = "warning",
      }
      ANNOTATIONS {
        summary = "Kubelet is close to pod limit",
        description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
      }
  kubernetes.rules: |+
    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
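    # The recording rules below follow the level:metric:operations naming scheme.
    # label_replace() copies the controller name into a "controller" label by
    # stripping the trailing hash from pod_name with the regex ^(.*)-[a-z0-9]+,
    # so resource usage can be aggregated per controller instead of per pod.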
    ### Container resources ###

    cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:spec_cpu_shares =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:cpu_usage:rate =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_usage:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_working_set:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_rss:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_cache:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:disk_usage:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_pagefaults:rate =
      sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
        label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_oom:rate =
      sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
        label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    ### Cluster resources ###

    cluster:memory_allocation:percent =
      100 * sum by (cluster) (container_spec_memory_limit_bytes{pod_name!=""})
        / sum by (cluster) (machine_memory_bytes)

    cluster:memory_used:percent =
      100 * sum by (cluster) (container_memory_usage_bytes{pod_name!=""})
        / sum by (cluster) (machine_memory_bytes)

    cluster:cpu_allocation:percent =
      100 * sum by (cluster) (container_spec_cpu_shares{pod_name!=""})
        / sum by (cluster) (container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores)

    cluster:node_cpu_use:percent =
      100 * sum by (cluster) (rate(node_cpu{mode!="idle"}[5m]))
        / sum by (cluster) (machine_cpu_cores)

    ### API latency ###

    # Raw metrics are in microseconds. Convert to seconds.
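    # In this Prometheus 1.x rule syntax, labels on the left-hand side of a
    # recording rule (e.g. {quantile="0.99"}) are attached to the recorded
    # series, so each metric below yields one series per quantile.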
    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99, sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)) / 1e6
    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9, sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)) / 1e6
    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5, sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)) / 1e6

    ### Scheduling latency ###

    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99, sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9, sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5, sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99, sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9, sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5, sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99, sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9, sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5, sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
  kube-scheduler.rules: |+
    ALERT K8SSchedulerDown
      IF absent(up{job="kube-scheduler"} == 1)
      FOR 5m
      LABELS {
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Scheduler is down",
        description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
        runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler",
      }
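    # The absent(up{job="..."} == 1) pattern above fires both when every target of
    # the job reports down and when the job has disappeared from service discovery
    # entirely; the same idiom backs K8SApiserverDown, K8SControllerManagerDown,
    # K8SKubeletDown and NodeExporterDown.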
  node.rules: |+
    ALERT NodeExporterDown
      IF absent(up{job="node-exporter"} == 1)
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "node-exporter cannot be scraped",
        description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
      }

    ALERT K8SNodeOutOfDisk
      IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1
      LABELS {
        service = "k8s",
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "Node ran out of disk space.",
        description = "{{ $labels.node }} has run out of disk space.",
      }

    ALERT K8SNodeMemoryPressure
      IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Node is under memory pressure.",
        description = "{{ $labels.node }} is under memory pressure.",
      }

    ALERT K8SNodeDiskPressure
      IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Node is under disk pressure.",
        description = "{{ $labels.node }} is under disk pressure.",
      }
  prometheus.rules: |+
    ALERT FailedReload
      IF prometheus_config_last_reload_successful == 0
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Prometheus configuration reload has failed",
        description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}."
      }
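# The kube-prometheus / Prometheus Operator setup this ConfigMap appears to target
# typically picks up rule ConfigMaps through the Prometheus resource's ruleSelector,
# which matches the role/prometheus labels above; verify the selector used in your
# deployment. Example (the file name and "monitoring" namespace are assumptions):
#
#   kubectl apply -n monitoring -f prometheus-k8s-rules.yaml
#
# Note that these files use the Prometheus 1.x rule syntax; Prometheus 2.x expects
# YAML rule files, so they must be converted before use with a 2.x server.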