kube-prometheus: add alerting rules

This commit is contained in:
Frederic Branczyk
2017-05-27 10:44:33 +02:00
parent f0851d5e4d
commit c4b382be6f
12 changed files with 828 additions and 598 deletions

View File

@@ -6,76 +6,259 @@ metadata:
role: prometheus-rulefiles
prometheus: k8s
data:
etcd2.rules: |+
### General cluster availability ###
# alert if another failed peer will result in an unavailable cluster
ALERT InsufficientPeers
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
FOR 3m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "Etcd cluster small",
description = "If one more etcd peer goes down the cluster will be unavailable",
}
### HTTP requests alerts ###
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
alertmanager.rules: |+
ALERT AlertmanagerConfigInconsistent
IF count_values by (service) ("config_hash", alertmanager_config_hash)
/ on(service) group_left
label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
FOR 5m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
summary = "Alertmanager configurations are inconsistent",
description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
}
# alert if 50% of requests get a 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
FOR 10m
ALERT AlertmanagerDownOrMissing
IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
/ on(job) group_right
sum by(job) (up) != 1
FOR 5m
LABELS {
severity = "critical"
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
summary = "Alertmanager down or not discovered",
description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
}
# alert if the 99th percentile of HTTP requests take more than 150ms
ALERT HTTPRequestsSlow
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
ALERT FailedReload
IF alertmanager_config_last_reload_successful == 0
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "slow HTTP requests",
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
summary = "Alertmanager configuration reload has failed",
description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
}
etcd3.rules: |+
# general cluster availability
# alert if another failed member will result in an unavailable cluster
ALERT InsufficientMembers
IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
FOR 3m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "etcd cluster insufficient members",
description = "If one more etcd member goes down the cluster will be unavailable",
}
# etcd leader alerts
# ==================
# alert if any etcd instance has no leader
ALERT NoLeader
IF etcd_server_has_leader{job="etcd"} == 0
FOR 1m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "etcd member has no leader",
description = "etcd member {{ $labels.instance }} has no leader",
}
# alert if there are lots of leader changes
ALERT HighNumberOfLeaderChanges
IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of leader changes within the etcd cluster are happening",
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
}
# gRPC request alerts
# ===================
# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
ALERT HighNumberOfFailedGRPCRequests
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of gRPC requests are failing",
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
ALERT HighNumberOfFailedGRPCRequests
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
FOR 5m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "a high number of gRPC requests are failing",
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if the 99th percentile of gRPC method calls take more than 150ms
ALERT GRPCRequestsSlow
IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "slow gRPC requests",
description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow",
}
# HTTP requests alerts
# ====================
# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
FOR 5m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if the 99th percentile of HTTP requests take more than 150ms
ALERT HTTPRequestsSlow
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "slow HTTP requests",
description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
}
# etcd member communication alerts
# ================================
# alert if 99th percentile of round trips take 150ms
ALERT EtcdMemberCommunicationSlow
IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "etcd member communication is slow",
description = "etcd instance {{ $labels.instance }} member communication with {{ $label.To }} is slow",
}
# etcd proposal alerts
# ====================
# alert if there are several failed proposals within an hour
ALERT HighNumberOfFailedProposals
IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of proposals within the etcd cluster are failing",
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
}
# etcd disk io latency alerts
# ===========================
# alert if 99th percentile of fsync durations is higher than 500ms
ALERT HighFsyncDurations
IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "high fsync durations",
description = "etcd instance {{ $labels.instance }} fync durations are high",
}
# alert if 99th percentile of commit durations is higher than 250ms
ALERT HighCommitDurations
IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "high commit durations",
description = "etcd instance {{ $labels.instance }} commit durations are high",
}
general.rules: |+
### Up Alerting ###
Alert TargetDown
IF up == 0
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "target is down",
description = "A target of type {{ $labels.job }} is down."
}
### File descriptor alerts ###
ALERT TooManyOpenFiles
IF 100*process_open_fds / process_max_fds > 50
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "too many open file descriptors",
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
}
ALERT K8STooManyOpenFiles
IF 100*process_open_fds / process_max_fds > 80
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "too many open file descriptors",
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
}
instance:fd_utilization = process_open_fds / process_max_fds
# alert if file descriptors are likely to exhaust within the next 4 hours
@@ -87,7 +270,7 @@ data:
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon",
}
# alert if file descriptors are likely to exhaust within the next hour
@@ -99,34 +282,154 @@ data:
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon",
}
### etcd proposal alerts ###
### Contrack alerts ###
# alert if there are several failed proposals within an hour
ALERT HighNumberOfFailedProposals
IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
# To catch the conntrack sysctl de-tuning when it happens
ALERT ConntrackTuningMissing
IF node_nf_conntrack_udp_timeout > 10
FOR 10m
LABELS {
severity = "warning"
severity = "warning",
}
ANNOTATIONS {
summary = "a high number of failed proposals within the etcd cluster are happening",
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
summary = "Node does not have the correct conntrack tunings",
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
}
### etcd disk io latency alerts ###
# alert if 99th percentile of fsync durations is higher than 500ms
ALERT HighFsyncDurations
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
ALERT ConntrackTableFull
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "high fsync durations",
description = "ectd instance {{ $labels.instance }} fync durations are high",
summary = "Number of tracked connections is near the limit",
description = "The nf_conntrack table is {{ $value }}% full.",
}
ALERT ConntrackTableFull
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "Number of tracked connections is near the limit",
description = "The nf_conntrack table is {{ $value }}% full.",
}
kube-apiserver.rules: |+
ALERT K8SApiserverDown
IF up{job="apiserver"} == 0
FOR 15m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "API server unreachable",
description = "An API server could not be scraped.",
}
# Disable for non HA kubernetes setups.
ALERT K8SApiserverDown
IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
FOR 5m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "API server unreachable",
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
}
# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
ALERT K8SApiServerLatency
IF histogram_quantile(
0.99,
sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
) / 1e6 > 1.0
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "Kubernetes apiserver latency is high",
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
}
kube-controller-manager.rules: |+
ALERT K8SControllerManagerDown
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
FOR 5m
LABELS {
severity = "critical",
}
ANNOTATIONS {
summary = "Controller manager is down",
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
}
kubelet.rules: |+
ALERT K8SNodeDown
IF up{job="kubelet"} == 0
FOR 1h
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "Kubelet cannot be scraped",
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
}
ALERT K8SNodeNotReady
IF kube_node_status_ready{condition="true"} == 0
FOR 1h
LABELS {
severity = "warning",
}
ANNOTATIONS {
summary = "Node status is NotReady",
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
}
ALERT K8SManyNodesNotReady
IF
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
AND
(
count by (cluster) (kube_node_status_ready{condition="true"} == 0)
/
count by (cluster) (kube_node_status_ready{condition="true"})
) > 0.2
FOR 1m
LABELS {
severity = "critical",
}
ANNOTATIONS {
summary = "Many K8s nodes are Not Ready",
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
}
ALERT K8SKubeletDown
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
FOR 1h
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "Many Kubelets cannot be scraped",
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
}
ALERT K8SKubeletTooManyPods
IF kubelet_running_pod_count > 100
LABELS {
severity = "warning",
}
ANNOTATIONS {
summary = "Kubelet is close to pod limit",
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
}
kubernetes.rules: |+
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
@@ -300,220 +603,36 @@ data:
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
ALERT K8SNodeDown
IF up{job="kubelet"} == 0
FOR 1h
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Kubelet cannot be scraped",
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
}
ALERT K8SNodeNotReady
IF kube_node_status_ready{condition="true"} == 0
FOR 1h
LABELS {
service = "k8s",
severity = "warning",
}
ANNOTATIONS {
summary = "Node status is NotReady",
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
}
ALERT K8SManyNodesNotReady
IF
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
AND
(
count by (cluster) (kube_node_status_ready{condition="true"} == 0)
/
count by (cluster) (kube_node_status_ready{condition="true"})
) > 0.2
FOR 1m
LABELS {
service = "k8s",
severity = "critical",
}
ANNOTATIONS {
summary = "Many K8s nodes are Not Ready",
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
}
ALERT K8SKubeletNodeExporterDown
IF up{job="node-exporter"} == 0
FOR 15m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Kubelet node_exporter cannot be scraped",
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
}
ALERT K8SKubeletDown
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
FOR 1h
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "Many Kubelets cannot be scraped",
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
}
ALERT K8SApiserverDown
IF up{job="kubernetes"} == 0
FOR 15m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "API server unreachable",
description = "An API server could not be scraped.",
}
# Disable for non HA kubernetes setups.
ALERT K8SApiserverDown
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
FOR 5m
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "API server unreachable",
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
}
kube-scheduler.rules: |+
ALERT K8SSchedulerDown
IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
FOR 5m
LABELS {
service = "k8s",
severity = "critical",
}
ANNOTATIONS {
summary = "Scheduler is down",
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
}
ALERT K8SControllerManagerDown
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
FOR 5m
LABELS {
service = "k8s",
severity = "critical",
}
ANNOTATIONS {
summary = "Controller manager is down",
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
}
ALERT K8SConntrackTableFull
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
node.rules: |+
ALERT NodeExporterDown
IF up{job="node-exporter"} == 0
FOR 10m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Number of tracked connections is near the limit",
description = "The nf_conntrack table is {{ $value }}% full.",
summary = "node-exporter cannot be scraped",
description = "Prometheus could not scrape a node-exporter for more than 10m.",
}
ALERT K8SConntrackTableFull
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "Number of tracked connections is near the limit",
description = "The nf_conntrack table is {{ $value }}% full.",
}
# To catch the conntrack sysctl de-tuning when it happens
ALERT K8SConntrackTuningMissing
IF node_nf_conntrack_udp_timeout > 10
prometheus.rules: |+
ALERT FailedReload
IF prometheus_config_last_reload_successful == 0
FOR 10m
LABELS {
service = "k8s",
severity = "warning",
}
ANNOTATIONS {
summary = "Node does not have the correct conntrack tunings",
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
}
ALERT K8STooManyOpenFiles
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50
FOR 10m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "{{ $labels.job }} has too many open file descriptors",
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
summary = "Prometheus configuration reload has failed",
description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
}
ALERT K8STooManyOpenFiles
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80
FOR 10m
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "{{ $labels.job }} has too many open file descriptors",
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
}
# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
ALERT K8SApiServerLatency
IF histogram_quantile(
0.99,
sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
) / 1e6 > 1.0
FOR 10m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Kubernetes apiserver latency is high",
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
}
ALERT K8SApiServerEtcdAccessLatency
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
FOR 15m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Access to etcd is slow",
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
}
ALERT K8SKubeletTooManyPods
IF kubelet_running_pod_count > 100
LABELS {
service = "k8s",
severity = "warning",
}
ANNOTATIONS {
summary = "Kubelet is close to pod limit",
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
}