@@ -5,8 +5,10 @@ route:
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'webhook'
  receiver: 'null'
  routes:
  - match:
      alertname: DeadMansSwitch
    receiver: 'null'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://alertmanagerwh:30500/'
- name: 'null'
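For readability, the routing configuration that results from this change is sketched below (indentation and the group_by setting are taken from the updated alertmanager-main Secret later in this diff; the hunk above is authoritative). The always-firing DeadMansSwitch alert is routed to the new 'null' receiver, which has no notification configs, so it never notifies anyone; everything else falls through to the default receiver.

route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
  - match:
      alertname: DeadMansSwitch
    receiver: 'null'
receivers:
- name: 'null'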
assets/prometheus/rules/alertmanager.rules (new file, 36 lines)
@@ -0,0 +1,36 @@
ALERT AlertmanagerConfigInconsistent
  IF count_values by (service) ("config_hash", alertmanager_config_hash)
       / on(service) group_left
       label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Alertmanager configurations are inconsistent",
    description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
  }

ALERT AlertmanagerDownOrMissing
  IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
       / on(job) group_right
       sum by(job) (up) != 1
  FOR 5m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Alertmanager down or not discovered",
    description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
  }

ALERT FailedReload
  IF alertmanager_config_last_reload_successful == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Alertmanager configuration reload has failed",
    description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
  }
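To make the AlertmanagerConfigInconsistent expression easier to follow, here is an illustrative evaluation (the hash values and replica count below are invented for the example). count_values by (service) ("config_hash", alertmanager_config_hash) emits one series per distinct configuration hash, valued at the number of replicas reporting that hash. With 3 replicas of alertmanager-main, one of which still runs a stale configuration:

  {service="alertmanager-main", config_hash="111"}  2
  {service="alertmanager-main", config_hash="222"}  1

Dividing by the relabelled prometheus_operator_alertmanager_spec_replicas value (3) yields 0.67 and 0.33; neither equals 1, so the alert fires after 5m. Only when every replica reports the same hash does the single resulting ratio equal exactly 1.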
@@ -1,121 +0,0 @@
|
||||
### General cluster availability ###
|
||||
|
||||
# alert if another failed peer will result in an unavailable cluster
|
||||
ALERT InsufficientPeers
|
||||
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Etcd cluster small",
|
||||
description = "If one more etcd peer goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
### HTTP requests alerts ###
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if 50% of requests get a 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
|
||||
ALERT HTTPRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow HTTP requests",
|
||||
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
|
||||
}
|
||||
|
||||
### File descriptor alerts ###
|
||||
|
||||
instance:fd_utilization = process_open_fds / process_max_fds
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next 4 hours
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next hour
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
### etcd proposal alerts ###
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of failed proposals within the etcd cluster are happening",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
}
|
||||
|
||||
### etcd disk io latency alerts ###
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "ectd instance {{ $labels.instance }} fync durations are high",
|
||||
}
|
assets/prometheus/rules/etcd3.rules (new file, 177 lines)
@@ -0,0 +1,177 @@
|
||||
# general cluster availability
|
||||
|
||||
# alert if another failed member will result in an unavailable cluster
|
||||
ALERT InsufficientMembers
|
||||
IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd cluster insufficient members",
|
||||
description = "If one more etcd member goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
# etcd leader alerts
|
||||
# ==================
|
||||
|
||||
# alert if any etcd instance has no leader
|
||||
ALERT NoLeader
|
||||
IF etcd_server_has_leader{job="etcd"} == 0
|
||||
FOR 1m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member has no leader",
|
||||
description = "etcd member {{ $labels.instance }} has no leader",
|
||||
}
|
||||
|
||||
# alert if there are lots of leader changes
|
||||
ALERT HighNumberOfLeaderChanges
|
||||
IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of leader changes within the etcd cluster are happening",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
|
||||
}
|
||||
|
||||
# gRPC request alerts
|
||||
# ===================
|
||||
|
||||
# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of gRPC method calls take more than 150ms
|
||||
ALERT GRPCRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow gRPC requests",
|
||||
description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow",
|
||||
}
|
||||
|
||||
# HTTP requests alerts
|
||||
# ====================
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
|
||||
ALERT HTTPRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow HTTP requests",
|
||||
description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
|
||||
}
|
||||
|
||||
# etcd member communication alerts
|
||||
# ================================
|
||||
|
||||
# alert if 99th percentile of round trips take 150ms
|
||||
ALERT EtcdMemberCommunicationSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member communication is slow",
|
||||
description = "etcd instance {{ $labels.instance }} member communication with {{ $label.To }} is slow",
|
||||
}
|
||||
|
||||
# etcd proposal alerts
|
||||
# ====================
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of proposals within the etcd cluster are failing",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
}
|
||||
|
||||
# etcd disk io latency alerts
|
||||
# ===========================
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "etcd instance {{ $labels.instance }} fync durations are high",
|
||||
}
|
||||
|
||||
# alert if 99th percentile of commit durations is higher than 250ms
|
||||
ALERT HighCommitDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high commit durations",
|
||||
description = "etcd instance {{ $labels.instance }} commit durations are high",
|
||||
}
|
assets/prometheus/rules/general.rules (new file, 63 lines)
@@ -0,0 +1,63 @@
### Up Alerting ###

ALERT TargetDown
  IF 100 * (count(up == 0) / count(up)) > 3
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Targets are down",
    description = "More than {{ $value }}% of targets are down."
  }

### Dead man's switch ###

ALERT DeadMansSwitch
  IF vector(1)
  LABELS {
    severity = "none",
  }
  ANNOTATIONS {
    summary = "Alerting DeadMansSwitch",
    description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
  }

### File descriptor alerts ###

ALERT TooManyOpenFileDescriptors
  IF 100 * (process_open_fds / process_max_fds) > 95
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "too many open file descriptors",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
  }

instance:fd_utilization = process_open_fds / process_max_fds

# alert if file descriptors are likely to exhaust within the next 4 hours
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its available file/socket descriptors soon",
  }

# alert if file descriptors are likely to exhaust within the next hour
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its available file/socket descriptors soon",
  }
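A worked illustration of the FdExhaustionClose expressions (the figures are invented): if instance:fd_utilization for a pod has risen linearly from 0.80 to 0.85 over the last hour, predict_linear(instance:fd_utilization[1h], 3600 * 4) extrapolates 0.85 + 4 * 0.05 = 1.05. Because 1.05 > 1 (more than 100% of the available descriptors), the four-hour warning fires once the condition has held for 10m; the one-hour critical variant applies the same arithmetic to the last 10 minutes of samples.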
assets/prometheus/rules/kube-apiserver.rules (new file, 28 lines)
@@ -0,0 +1,28 @@
ALERT K8SApiserverDown
  IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
  }

# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
#
# apiserver_request_latencies' unit is microseconds
ALERT K8SApiServerLatency
  IF histogram_quantile(
      0.99,
      sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
    ) / 1e6 > 1.0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubernetes apiserver latency is high",
    description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
  }
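Because apiserver_request_latencies_bucket is recorded in microseconds, the quantile is divided by 1e6 before being compared in seconds. For example (illustrative figures): a 99th-percentile estimate of 1,500,000 microseconds becomes 1,500,000 / 1e6 = 1.5 s, which exceeds the 1.0 s threshold, so K8SApiServerLatency fires once this persists for 10m.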
assets/prometheus/rules/kube-controller-manager.rules (new file, 10 lines)
@@ -0,0 +1,10 @@
ALERT K8SControllerManagerDown
  IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
  FOR 5m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Controller manager is down",
    description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
  }
assets/prometheus/rules/kube-scheduler.rules (new file, 10 lines)
@@ -0,0 +1,10 @@
ALERT K8SSchedulerDown
  IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
  FOR 5m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Scheduler is down",
    description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
  }
assets/prometheus/rules/kubelet.rules (new file, 60 lines)
@@ -0,0 +1,60 @@
ALERT K8SNodeNotReady
  IF kube_node_status_ready{condition="true"} == 0
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node status is NotReady",
    description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
  }

ALERT K8SManyNodesNotReady
  IF
    count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
    AND
      (
        count by (cluster) (kube_node_status_ready{condition="true"} == 0)
      /
        count by (cluster) (kube_node_status_ready{condition="true"})
      ) > 0.2
  FOR 1m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many K8s nodes are Not Ready",
    description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
  }

ALERT K8SKubeletDown
  IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
  }

ALERT K8SKubeletDown
  IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
  FOR 1h
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
  }

ALERT K8SKubeletTooManyPods
  IF kubelet_running_pod_count > 100
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubelet is close to pod limit",
    description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
  }
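An illustration of the two K8SKubeletDown thresholds (cluster size invented for the example): with 100 kubelet targets, 4 unscrapeable kubelets give 4/100 = 0.04 > 0.03, which raises the warning after an hour; 12 down give 0.12 > 0.1 and escalate to critical, as does absent(up{job="kubelet"}) when every kubelet has vanished from service discovery.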
@@ -169,220 +169,3 @@ cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
|
||||
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
|
||||
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
|
||||
ALERT K8SNodeDown
|
||||
IF up{job="kubelet"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
|
||||
}
|
||||
|
||||
ALERT K8SNodeNotReady
|
||||
IF kube_node_status_ready{condition="true"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node status is NotReady",
|
||||
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
|
||||
}
|
||||
|
||||
ALERT K8SManyNodesNotReady
|
||||
IF
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
|
||||
AND
|
||||
(
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0)
|
||||
/
|
||||
count by (cluster) (kube_node_status_ready{condition="true"})
|
||||
) > 0.2
|
||||
FOR 1m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many K8s nodes are Not Ready",
|
||||
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletNodeExporterDown
|
||||
IF up{job="node-exporter"} == 0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet node_exporter cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletDown
|
||||
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many Kubelets cannot be scraped",
|
||||
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
ALERT K8SApiserverDown
|
||||
IF up{job="kubernetes"} == 0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "API server unreachable",
|
||||
description = "An API server could not be scraped.",
|
||||
}
|
||||
|
||||
# Disable for non HA kubernetes setups.
|
||||
ALERT K8SApiserverDown
|
||||
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "API server unreachable",
|
||||
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
ALERT K8SSchedulerDown
|
||||
IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Scheduler is down",
|
||||
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
|
||||
}
|
||||
|
||||
ALERT K8SControllerManagerDown
|
||||
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Controller manager is down",
|
||||
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
|
||||
}
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
}
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
}
|
||||
|
||||
# To catch the conntrack sysctl de-tuning when it happens
|
||||
ALERT K8SConntrackTuningMissing
|
||||
IF node_nf_conntrack_udp_timeout > 10
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node does not have the correct conntrack tunings",
|
||||
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
# Some verbs excluded because they are expected to be long-lasting:
|
||||
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
|
||||
ALERT K8SApiServerLatency
|
||||
IF histogram_quantile(
|
||||
0.99,
|
||||
sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
|
||||
) / 1e6 > 1.0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubernetes apiserver latency is high",
|
||||
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
|
||||
}
|
||||
|
||||
ALERT K8SApiServerEtcdAccessLatency
|
||||
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Access to etcd is slow",
|
||||
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletTooManyPods
|
||||
IF kubelet_running_pod_count > 100
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet is close to pod limit",
|
||||
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
|
||||
}
|
||||
|
||||
|
assets/prometheus/rules/node.rules (new file, 10 lines)
@@ -0,0 +1,10 @@
ALERT NodeExporterDown
  IF up{job="node-exporter"} == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "node-exporter cannot be scraped",
    description = "Prometheus could not scrape a node-exporter for more than 10m.",
  }
assets/prometheus/rules/prometheus.rules (new file, 10 lines)
@@ -0,0 +1,10 @@
ALERT FailedReload
  IF prometheus_config_last_reload_successful == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Prometheus configuration reload has failed",
    description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
  }
@@ -3,4 +3,4 @@ kind: Secret
metadata:
  name: alertmanager-main
data:
  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==
  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
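For reference, the new base64 value decodes to global: resolve_timeout: 5m followed by exactly the route and receivers blocks sketched after the first hunk of this diff (a default 'null' receiver plus a DeadMansSwitch child route), while the old value encoded the previous 'webhook' receiver configuration; the Secret remains the source of truth.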
@@ -6,7 +6,7 @@ metadata:
|
||||
labels:
|
||||
prometheus: frontend
|
||||
spec:
|
||||
version: v1.6.3
|
||||
version: v1.7.0
|
||||
serviceMonitorSelector:
|
||||
matchLabels:
|
||||
tier: frontend
|
||||
|
@@ -0,0 +1,15 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: prometheus-operator
|
||||
labels:
|
||||
k8s-app: prometheus-operator
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: http
|
||||
port: 8080
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
selector:
|
||||
k8s-app: prometheus-operator
|
@@ -3,25 +3,28 @@ kind: Deployment
|
||||
metadata:
|
||||
name: prometheus-operator
|
||||
labels:
|
||||
operator: prometheus
|
||||
k8s-app: prometheus-operator
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
operator: prometheus
|
||||
k8s-app: prometheus-operator
|
||||
spec:
|
||||
serviceAccountName: prometheus-operator
|
||||
containers:
|
||||
- name: prometheus-operator
|
||||
image: quay.io/coreos/prometheus-operator:v0.9.1
|
||||
args:
|
||||
- "--kubelet-service=kube-system/kubelet"
|
||||
- "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 50Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 300Mi
|
||||
- name: prometheus-operator
|
||||
image: quay.io/coreos/prometheus-operator:v0.9.1
|
||||
args:
|
||||
- "--kubelet-service=kube-system/kubelet"
|
||||
- "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 50Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 300Mi
|
||||
|
@@ -6,76 +6,260 @@ metadata:
|
||||
role: prometheus-rulefiles
|
||||
prometheus: k8s
|
||||
data:
|
||||
etcd2.rules: |+
|
||||
### General cluster availability ###
|
||||
|
||||
# alert if another failed peer will result in an unavailable cluster
|
||||
ALERT InsufficientPeers
|
||||
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Etcd cluster small",
|
||||
description = "If one more etcd peer goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
### HTTP requests alerts ###
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
|
||||
alertmanager.rules: |+
|
||||
ALERT AlertmanagerConfigInconsistent
|
||||
IF count_values by (service) ("config_hash", alertmanager_config_hash)
|
||||
/ on(service) group_left
|
||||
label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
summary = "Alertmanager configurations are inconsistent",
|
||||
description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
|
||||
}
|
||||
|
||||
# alert if 50% of requests get a 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
|
||||
FOR 10m
|
||||
ALERT AlertmanagerDownOrMissing
|
||||
IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
|
||||
/ on(job) group_right
|
||||
sum by(job) (up) != 1
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
|
||||
summary = "Alertmanager down or not discovered",
|
||||
description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
|
||||
ALERT HTTPRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
|
||||
ALERT FailedReload
|
||||
IF alertmanager_config_last_reload_successful == 0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow HTTP requests",
|
||||
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
|
||||
summary = "Alertmanager configuration reload has failed",
|
||||
description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
|
||||
}
|
||||
etcd3.rules: |+
|
||||
# general cluster availability
|
||||
|
||||
# alert if another failed member will result in an unavailable cluster
|
||||
ALERT InsufficientMembers
|
||||
IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd cluster insufficient members",
|
||||
description = "If one more etcd member goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
# etcd leader alerts
|
||||
# ==================
|
||||
|
||||
# alert if any etcd instance has no leader
|
||||
ALERT NoLeader
|
||||
IF etcd_server_has_leader{job="etcd"} == 0
|
||||
FOR 1m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member has no leader",
|
||||
description = "etcd member {{ $labels.instance }} has no leader",
|
||||
}
|
||||
|
||||
# alert if there are lots of leader changes
|
||||
ALERT HighNumberOfLeaderChanges
|
||||
IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of leader changes within the etcd cluster are happening",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
|
||||
}
|
||||
|
||||
# gRPC request alerts
|
||||
# ===================
|
||||
|
||||
# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of gRPC method calls take more than 150ms
|
||||
ALERT GRPCRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow gRPC requests",
|
||||
description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow",
|
||||
}
|
||||
|
||||
# HTTP requests alerts
|
||||
# ====================
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
|
||||
ALERT HTTPRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow HTTP requests",
|
||||
description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
|
||||
}
|
||||
|
||||
# etcd member communication alerts
|
||||
# ================================
|
||||
|
||||
# alert if 99th percentile of round trips take 150ms
|
||||
ALERT EtcdMemberCommunicationSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member communication is slow",
|
||||
description = "etcd instance {{ $labels.instance }} member communication with {{ $label.To }} is slow",
|
||||
}
|
||||
|
||||
# etcd proposal alerts
|
||||
# ====================
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of proposals within the etcd cluster are failing",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
}
|
||||
|
||||
# etcd disk io latency alerts
|
||||
# ===========================
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "etcd instance {{ $labels.instance }} fync durations are high",
|
||||
}
|
||||
|
||||
# alert if 99th percentile of commit durations is higher than 250ms
|
||||
ALERT HighCommitDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high commit durations",
|
||||
description = "etcd instance {{ $labels.instance }} commit durations are high",
|
||||
}
|
||||
general.rules: |+
|
||||
### Up Alerting ###
|
||||
|
||||
ALERT TargetDown
|
||||
IF 100 * (count(up == 0) / count(up)) > 3
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Targets are down",
|
||||
description = "More than {{ $value }}% of targets are down."
|
||||
}
|
||||
|
||||
### Dead man's switch ###
|
||||
|
||||
ALERT DeadMansSwitch
|
||||
IF vector(1)
|
||||
LABELS {
|
||||
severity = "none",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Alerting DeadMansSwitch",
|
||||
description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
|
||||
}
|
||||
|
||||
### File descriptor alerts ###
|
||||
|
||||
ALERT TooManyOpenFileDescriptors
|
||||
IF 100 * (process_open_fds / process_max_fds) > 95
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "too many open file descriptors",
|
||||
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
instance:fd_utilization = process_open_fds / process_max_fds
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next 4 hours
|
||||
@@ -87,7 +271,7 @@ data:
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
|
||||
}
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next hour
|
||||
@@ -99,34 +283,108 @@ data:
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
|
||||
}
|
||||
|
||||
### etcd proposal alerts ###
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
|
||||
kube-apiserver.rules: |+
|
||||
ALERT K8SApiserverDown
|
||||
IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of failed proposals within the etcd cluster are happening",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
summary = "API server unreachable",
|
||||
description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
### etcd disk io latency alerts ###
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
|
||||
# Some verbs excluded because they are expected to be long-lasting:
|
||||
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
|
||||
#
|
||||
# apiserver_request_latencies' unit is microseconds
|
||||
ALERT K8SApiServerLatency
|
||||
IF histogram_quantile(
|
||||
0.99,
|
||||
sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
|
||||
) / 1e6 > 1.0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "ectd instance {{ $labels.instance }} fync durations are high",
|
||||
summary = "Kubernetes apiserver latency is high",
|
||||
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
|
||||
}
|
||||
kube-controller-manager.rules: |+
|
||||
ALERT K8SControllerManagerDown
|
||||
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Controller manager is down",
|
||||
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
|
||||
}
|
||||
kubelet.rules: |+
|
||||
ALERT K8SNodeNotReady
|
||||
IF kube_node_status_ready{condition="true"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node status is NotReady",
|
||||
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
|
||||
}
|
||||
|
||||
ALERT K8SManyNodesNotReady
|
||||
IF
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
|
||||
AND
|
||||
(
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0)
|
||||
/
|
||||
count by (cluster) (kube_node_status_ready{condition="true"})
|
||||
) > 0.2
|
||||
FOR 1m
|
||||
LABELS {
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many K8s nodes are Not Ready",
|
||||
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletDown
|
||||
IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03
|
||||
FOR 1h
|
||||
LABELS {
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many Kubelets cannot be scraped",
|
||||
description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletDown
|
||||
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
|
||||
FOR 1h
|
||||
LABELS {
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many Kubelets cannot be scraped",
|
||||
description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletTooManyPods
|
||||
IF kubelet_running_pod_count > 100
|
||||
LABELS {
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet is close to pod limit",
|
||||
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
|
||||
}
|
||||
kubernetes.rules: |+
|
||||
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
|
||||
@@ -300,220 +558,36 @@ data:
|
||||
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
|
||||
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
|
||||
ALERT K8SNodeDown
|
||||
IF up{job="kubelet"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
|
||||
}
|
||||
|
||||
ALERT K8SNodeNotReady
|
||||
IF kube_node_status_ready{condition="true"} == 0
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node status is NotReady",
|
||||
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
|
||||
}
|
||||
|
||||
ALERT K8SManyNodesNotReady
|
||||
IF
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
|
||||
AND
|
||||
(
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0)
|
||||
/
|
||||
count by (cluster) (kube_node_status_ready{condition="true"})
|
||||
) > 0.2
|
||||
FOR 1m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many K8s nodes are Not Ready",
|
||||
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletNodeExporterDown
|
||||
IF up{job="node-exporter"} == 0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet node_exporter cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletDown
|
||||
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
|
||||
FOR 1h
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Many Kubelets cannot be scraped",
|
||||
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
ALERT K8SApiserverDown
|
||||
IF up{job="kubernetes"} == 0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "API server unreachable",
|
||||
description = "An API server could not be scraped.",
|
||||
}
|
||||
|
||||
# Disable for non HA kubernetes setups.
|
||||
ALERT K8SApiserverDown
|
||||
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "API server unreachable",
|
||||
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
kube-scheduler.rules: |+
|
||||
ALERT K8SSchedulerDown
|
||||
IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Scheduler is down",
|
||||
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
|
||||
}
|
||||
|
||||
ALERT K8SControllerManagerDown
|
||||
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
|
||||
FOR 5m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Controller manager is down",
|
||||
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
|
||||
}
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
|
||||
node.rules: |+
|
||||
ALERT NodeExporterDown
|
||||
IF up{job="node-exporter"} == 0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
summary = "node-exporter cannot be scraped",
|
||||
description = "Prometheus could not scrape a node-exporter for more than 10m.",
|
||||
}
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
}
|
||||
|
||||
# To catch the conntrack sysctl de-tuning when it happens
|
||||
ALERT K8SConntrackTuningMissing
|
||||
IF node_nf_conntrack_udp_timeout > 10
|
||||
prometheus.rules: |+
|
||||
ALERT FailedReload
|
||||
IF prometheus_config_last_reload_successful == 0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Node does not have the correct conntrack tunings",
|
||||
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
summary = "Prometheus configuration reload has failed",
|
||||
description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
# Some verbs excluded because they are expected to be long-lasting:
|
||||
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
|
||||
ALERT K8SApiServerLatency
|
||||
IF histogram_quantile(
|
||||
0.99,
|
||||
sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
|
||||
) / 1e6 > 1.0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubernetes apiserver latency is high",
|
||||
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
|
||||
}
|
||||
|
||||
ALERT K8SApiServerEtcdAccessLatency
|
||||
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
|
||||
FOR 15m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Access to etcd is slow",
|
||||
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
|
||||
}
|
||||
|
||||
ALERT K8SKubeletTooManyPods
|
||||
IF kubelet_running_pod_count > 100
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning",
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubelet is close to pod limit",
|
||||
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
|
||||
}
|
||||
|
||||
|
@@ -1,12 +1,16 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
labels:
|
||||
alertmanager: main
|
||||
name: alertmanager
|
||||
labels:
|
||||
app: alertmanager
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
alertmanager: main
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- monitoring
|
||||
endpoints:
|
||||
- port: web
|
||||
selector:
|
||||
matchExpressions:
|
||||
- {key: alertmanager, operator: In, values: [main]}
|
||||
interval: 30s
|
||||
|
@@ -3,9 +3,9 @@ kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kube-apiserver
|
||||
labels:
|
||||
k8s-apps: https
|
||||
k8s-app: apiserver
|
||||
spec:
|
||||
jobLabel: provider
|
||||
jobLabel: component
|
||||
selector:
|
||||
matchLabels:
|
||||
component: apiserver
|
||||
@@ -15,7 +15,7 @@ spec:
|
||||
- default
|
||||
endpoints:
|
||||
- port: https
|
||||
interval: 15s
|
||||
interval: 30s
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
|
@@ -1,23 +0,0 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: k8s-apps-http
|
||||
namespace: monitoring
|
||||
labels:
|
||||
k8s-apps: http
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
selector:
|
||||
matchExpressions:
|
||||
- {key: k8s-app, operator: Exists}
|
||||
- {key: k8s-app, operator: NotIn, values: [kubelet]}
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 15s
|
||||
- port: http-metrics-dnsmasq
|
||||
interval: 15s
|
||||
- port: http-metrics-skydns
|
||||
interval: 15s
|
@@ -0,0 +1,17 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kube-controller-manager
|
||||
labels:
|
||||
k8s-app: kube-controller-manager
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 30s
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: kube-controller-manager
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
@@ -0,0 +1,17 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kube-scheduler
|
||||
labels:
|
||||
k8s-app: kube-scheduler
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 30s
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: kube-scheduler
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
@@ -2,9 +2,8 @@ apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
labels:
|
||||
k8s-apps: http
|
||||
k8s-app: kube-state-metrics
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
selector:
|
||||
@@ -15,5 +14,5 @@ spec:
|
||||
- monitoring
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 15s
|
||||
interval: 30s
|
||||
honorLabels: true
|
||||
|
@@ -3,16 +3,16 @@ kind: ServiceMonitor
|
||||
metadata:
|
||||
name: kubelet
|
||||
labels:
|
||||
k8s-apps: http
|
||||
k8s-app: kubelet
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 30s
|
||||
honorLabels: true
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: kubelet
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 15s
|
||||
honorLabels: true
|
||||
|
@@ -2,9 +2,8 @@ apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: node-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
k8s-apps: http
|
||||
k8s-app: node-exporter
|
||||
spec:
|
||||
jobLabel: k8s-app
|
||||
selector:
|
||||
@@ -15,4 +14,4 @@ spec:
|
||||
- monitoring
|
||||
endpoints:
|
||||
- port: http-metrics
|
||||
interval: 15s
|
||||
interval: 30s
|
||||
|
@@ -0,0 +1,12 @@
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: prometheus-operator
|
||||
labels:
|
||||
k8s-app: prometheus-operator
|
||||
spec:
|
||||
endpoints:
|
||||
- port: http
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: prometheus-operator
|
@@ -3,10 +3,14 @@ kind: ServiceMonitor
|
||||
metadata:
|
||||
name: prometheus
|
||||
labels:
|
||||
prometheus: k8s
|
||||
app: prometheus
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
prometheus: k8s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- monitoring
|
||||
endpoints:
|
||||
- port: web
|
||||
selector:
|
||||
matchExpressions:
|
||||
- {key: prometheus, operator: In, values: [k8s]}
|
||||
interval: 30s
|
||||
|
@@ -6,7 +6,7 @@ metadata:
|
||||
prometheus: k8s
|
||||
spec:
|
||||
replicas: 2
|
||||
version: v1.6.3
|
||||
version: v1.7.0
|
||||
serviceAccountName: prometheus-k8s
|
||||
serviceMonitorSelector:
|
||||
matchExpression: