contrib/kube-prometheus: Run jb update and generate all manifests.

This commit is contained in:
David Lefever
2018-09-26 22:57:11 +02:00
parent 59fd4cd63e
commit 57a0e161ff
3 changed files with 78 additions and 60 deletions

View File

@@ -8,7 +8,7 @@
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
} }
}, },
"version": "bffc85d6e76f6341d5370af68ea980030ab402e8" "version": "2694cabc85ed89b3c8ac0865bcbc29d72e52eb2f"
}, },
{ {
"name": "ksonnet", "name": "ksonnet",
@@ -18,7 +18,7 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "83f20ee933bcd13fcf4ad1b49a40c92135c5569c" "version": "ed0796f3cb97ebc35ae54f543b1814a7c8dae305"
}, },
{ {
"name": "kubernetes-mixin", "name": "kubernetes-mixin",
@@ -28,7 +28,7 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "c70814dcafce1b51357938e09ee1192998a95706" "version": "19da1eb2f2558dad0f8d9e280cc1fe7bc835677b"
}, },
{ {
"name": "grafonnet", "name": "grafonnet",
@@ -38,7 +38,7 @@
"subdir": "grafonnet" "subdir": "grafonnet"
} }
}, },
"version": "7be7f8e4e8da37cac104d2655ca22fdb8a93ebcd" "version": "64147daa1267a2571ef95609550b782ec9807c52"
}, },
{ {
"name": "grafana-builder", "name": "grafana-builder",
@@ -48,7 +48,7 @@
"subdir": "grafana-builder" "subdir": "grafana-builder"
} }
}, },
"version": "e6fe81715dd802b4c9d9c64f2c44ba6ee56d2000" "version": "94aef231932810633416bfe596a41dbad2b1ebb9"
}, },
{ {
"name": "grafana", "name": "grafana",
@@ -78,7 +78,7 @@
"subdir": "Documentation/etcd-mixin" "subdir": "Documentation/etcd-mixin"
} }
}, },
"version": "001bbb97ccea05cb0d5f6e97c3939654244e8998" "version": "a3e242d80ae1a13ae57904fc12e91fe4c9ecf972"
} }
] ]
} }

View File

@@ -3643,7 +3643,7 @@ items:
}, },
"yaxes": [ "yaxes": [
{ {
"format": "short", "format": "decbytes",
"label": null, "label": null,
"logBase": 1, "logBase": 1,
"max": null, "max": null,

View File

@@ -564,8 +564,8 @@ spec:
rules: rules:
- alert: KubePodCrashLooping - alert: KubePodCrashLooping
annotations: annotations:
message: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} / second' }}) is restarting {{ printf "%.2f" $value }} times / second.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
expr: | expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0 rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0
@@ -574,7 +574,8 @@ spec:
severity: critical severity: critical
- alert: KubePodNotReady - alert: KubePodNotReady
annotations: annotations:
message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.' message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: | expr: |
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0 sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
@@ -583,8 +584,9 @@ spec:
severity: critical severity: critical
- alert: KubeDeploymentGenerationMismatch - alert: KubeDeploymentGenerationMismatch
annotations: annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
mismatch }} does not match, this indicates that the Deployment has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
expr: | expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"} kube_deployment_status_observed_generation{job="kube-state-metrics"}
@@ -595,8 +597,8 @@ spec:
severity: critical severity: critical
- alert: KubeDeploymentReplicasMismatch - alert: KubeDeploymentReplicasMismatch
annotations: annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
mismatch matched the expected number of replicas for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
expr: | expr: |
kube_deployment_spec_replicas{job="kube-state-metrics"} kube_deployment_spec_replicas{job="kube-state-metrics"}
@@ -607,8 +609,8 @@ spec:
severity: critical severity: critical
- alert: KubeStatefulSetReplicasMismatch - alert: KubeStatefulSetReplicasMismatch
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
mismatch not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
expr: | expr: |
kube_statefulset_status_replicas_ready{job="kube-state-metrics"} kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
@@ -619,8 +621,9 @@ spec:
severity: critical severity: critical
- alert: KubeStatefulSetGenerationMismatch - alert: KubeStatefulSetGenerationMismatch
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
mismatch }} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
expr: | expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"} kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@@ -629,10 +632,30 @@ spec:
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
expr: |
max without (revision) (
kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
)
for: 15m
labels:
severity: critical
- alert: KubeDaemonSetRolloutStuck - alert: KubeDaemonSetRolloutStuck
annotations: annotations:
message: Only {{$value}}% of desired pods scheduled and ready for daemon set message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
{{$labels.namespace}}/{{$labels.daemonset}} }}/{{ $labels.daemonset }} are scheduled and ready.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
expr: | expr: |
kube_daemonset_status_number_ready{job="kube-state-metrics"} kube_daemonset_status_number_ready{job="kube-state-metrics"}
@@ -643,8 +666,8 @@ spec:
severity: critical severity: critical
- alert: KubeDaemonSetNotScheduled - alert: KubeDaemonSetNotScheduled
annotations: annotations:
message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
are not scheduled. }} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
expr: | expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@@ -655,8 +678,8 @@ spec:
severity: warning severity: warning
- alert: KubeDaemonSetMisScheduled - alert: KubeDaemonSetMisScheduled
annotations: annotations:
message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
are running where they are not supposed to run. }} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
expr: | expr: |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@@ -676,7 +699,7 @@ spec:
- alert: KubeJobCompletion - alert: KubeJobCompletion
annotations: annotations:
message: Job {{ $labels.namespaces }}/{{ $labels.job }} is taking more than message: Job {{ $labels.namespaces }}/{{ $labels.job }} is taking more than
1h to complete. one hour to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
expr: | expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@@ -696,8 +719,8 @@ spec:
rules: rules:
- alert: KubeCPUOvercommit - alert: KubeCPUOvercommit
annotations: annotations:
message: Overcommited CPU resource requests on Pods, cannot tolerate node message: Cluster has overcommitted CPU resource requests for Pods and cannot
failure. tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: | expr: |
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
@@ -710,8 +733,8 @@ spec:
severity: warning severity: warning
- alert: KubeMemOvercommit - alert: KubeMemOvercommit
annotations: annotations:
message: Overcommited Memory resource requests on Pods, cannot tolerate node message: Cluster has overcommitted memory resource requests for Pods and cannot
failure. tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
expr: | expr: |
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
@@ -726,7 +749,7 @@ spec:
severity: warning severity: warning
- alert: KubeCPUOvercommit - alert: KubeCPUOvercommit
annotations: annotations:
message: Overcommited CPU resource request quota on Namespaces. message: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: | expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
@@ -738,7 +761,7 @@ spec:
severity: warning severity: warning
- alert: KubeMemOvercommit - alert: KubeMemOvercommit
annotations: annotations:
message: Overcommited Memory resource request quota on Namespaces. message: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
expr: | expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
@@ -750,13 +773,13 @@ spec:
severity: warning severity: warning
- alert: KubeQuotaExceeded - alert: KubeQuotaExceeded
annotations: annotations:
message: '{{ printf "%0.0f" $value }}% usage of {{ $labels.resource }} in message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
namespace {{ $labels.namespace }}.' }}% of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
expr: | expr: |
100 * kube_resourcequota{job="kube-state-metrics", type="used"} 100 * kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type) / ignoring(instance, job, type)
kube_resourcequota{job="kube-state-metrics", type="hard"} (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 90 > 90
for: 15m for: 15m
labels: labels:
@@ -765,9 +788,9 @@ spec:
rules: rules:
- alert: KubePersistentVolumeUsageCritical - alert: KubePersistentVolumeUsageCritical
annotations: annotations:
message: The persistent volume claimed by {{ $labels.persistentvolumeclaim message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in namespace {{ $labels.namespace }} has {{ printf "%0.0f" $value }}% }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value
free. }}% free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
expr: | expr: |
100 * kubelet_volume_stats_available_bytes{job="kubelet"} 100 * kubelet_volume_stats_available_bytes{job="kubelet"}
@@ -779,8 +802,8 @@ spec:
severity: critical severity: critical
- alert: KubePersistentVolumeFullInFourDays - alert: KubePersistentVolumeFullInFourDays
annotations: annotations:
message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in namespace {{ $labels.namespace }} is expected to fill up within four }} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ $value }} bytes are available. days. Currently {{ $value }} bytes are available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
expr: | expr: |
@@ -792,7 +815,7 @@ spec:
rules: rules:
- alert: KubeNodeNotReady - alert: KubeNodeNotReady
annotations: annotations:
message: '{{ $labels.node }} has been unready for more than an hour' message: '{{ $labels.node }} has been unready for more than an hour.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
expr: | expr: |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
@@ -825,7 +848,7 @@ spec:
- alert: KubeClientErrors - alert: KubeClientErrors
annotations: annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }} errors / sec.' }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: | expr: |
sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
@@ -834,8 +857,8 @@ spec:
severity: warning severity: warning
- alert: KubeletTooManyPods - alert: KubeletTooManyPods
annotations: annotations:
message: Kubelet {{$labels.instance}} is running {{$value}} pods, close to message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
the limit of 110. to the limit of 110.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: | expr: |
kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
@@ -845,7 +868,7 @@ spec:
- alert: KubeAPILatencyHigh - alert: KubeAPILatencyHigh
annotations: annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds message: The API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}. for {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: | expr: |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
@@ -855,7 +878,7 @@ spec:
- alert: KubeAPILatencyHigh - alert: KubeAPILatencyHigh
annotations: annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds message: The API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}. for {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: | expr: |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
@@ -864,18 +887,18 @@ spec:
severity: critical severity: critical
- alert: KubeAPIErrorsHigh - alert: KubeAPIErrorsHigh
annotations: annotations:
message: API server is erroring for {{ $value }}% of requests. message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: | expr: |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/ /
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
for: 10m for: 10m
labels: labels:
severity: critical severity: critical
- alert: KubeAPIErrorsHigh - alert: KubeAPIErrorsHigh
annotations: annotations:
message: API server is erroring for {{ $value }}% of requests. message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: | expr: |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
@@ -894,7 +917,7 @@ spec:
severity: warning severity: warning
- alert: KubeClientCertificateExpiration - alert: KubeClientCertificateExpiration
annotations: annotations:
message: Kubernetes API certificate is expiring in less than 1 day. message: Kubernetes API certificate is expiring in less than 24 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: | expr: |
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@@ -904,9 +927,8 @@ spec:
rules: rules:
- alert: AlertmanagerConfigInconsistent - alert: AlertmanagerConfigInconsistent
annotations: annotations:
description: The configuration of the instances of the Alertmanager cluster message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
`{{$labels.service}}` are out of sync. are out of sync.
summary: Configuration out of sync
expr: | expr: |
count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m for: 5m
@@ -914,9 +936,8 @@ spec:
severity: critical severity: critical
- alert: AlertmanagerDownOrMissing - alert: AlertmanagerDownOrMissing
annotations: annotations:
description: An unexpected number of Alertmanagers were scraped or disappeared message: An unexpected number of Alertmanagers were scraped or disappeared
from discovery. from discovery.
summary: Alertmanager down or missing
expr: | expr: |
label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1 label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1
for: 5m for: 5m
@@ -924,9 +945,8 @@ spec:
severity: warning severity: warning
- alert: AlertmanagerFailedReload - alert: AlertmanagerFailedReload
annotations: annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}. }}/{{ $labels.pod}}.
summary: Alertmanager's configuration reload failed
expr: | expr: |
alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0 alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
for: 10m for: 10m
@@ -936,17 +956,15 @@ spec:
rules: rules:
- alert: TargetDown - alert: TargetDown
annotations: annotations:
description: '{{ $value }}% of the {{ $labels.job }} targets are down.' message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
summary: Targets are down
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
- alert: DeadMansSwitch - alert: DeadMansSwitch
annotations: annotations:
description: This is a DeadMansSwitch meant to ensure that the entire alerting message: This is a DeadMansSwitch meant to ensure that the entire alerting
pipeline is functional. pipeline is functional.
summary: Alerting DeadMansSwitch
expr: vector(1) expr: vector(1)
labels: labels:
severity: none severity: none