Update dependencies and most importantly PromOp to v0.42.0

This commit is contained in:
Matthias Loibl
2020-09-11 13:14:22 +02:00
parent e55b6a8272
commit 2d3c10e3f7
14 changed files with 284 additions and 101 deletions

View File

@@ -26,7 +26,7 @@
"subdir": "jsonnet/prometheus-operator" "subdir": "jsonnet/prometheus-operator"
} }
}, },
"version": "release-0.41" "version": "release-0.42"
}, },
{ {
"source": { "source": {

View File

@@ -18,7 +18,7 @@
"subdir": "Documentation/etcd-mixin" "subdir": "Documentation/etcd-mixin"
} }
}, },
"version": "ed27d9d2de4313f5e766eecc79996d011d5de4a8", "version": "76e769ce95ca0d4d0e3486712d96956260db04b8",
"sum": "NhOkJWkO7ZO2DSE8Fvipcs7Hh2/GOCS0WjPPZU8OiaQ=" "sum": "NhOkJWkO7ZO2DSE8Fvipcs7Hh2/GOCS0WjPPZU8OiaQ="
}, },
{ {
@@ -28,8 +28,8 @@
"subdir": "grafonnet" "subdir": "grafonnet"
} }
}, },
"version": "8338be68387b5811355aa919d031882ead0c9c6e", "version": "cc1626a1b4dee45c99b78ddd9714dfd5f5d7816e",
"sum": "RixjNQccHT+UlCSvlR4HBiFcTRVdwDz5goWSHEXIf58=" "sum": "nkgrtMYPCq/YB4r3mKyToepaLhicwWnxDdGIodPpzz0="
}, },
{ {
"source": { "source": {
@@ -38,7 +38,7 @@
"subdir": "grafana-builder" "subdir": "grafana-builder"
} }
}, },
"version": "de586e2ac76e9bcee87d34f0042abe1a2ef7cdf3", "version": "797035a3d20100a6e8f7c973cee70e465f37b880",
"sum": "R5WJe6wW0R9vMpOAHaGFwcK8q4NmGZ0aLhdZGKDHeMU=" "sum": "R5WJe6wW0R9vMpOAHaGFwcK8q4NmGZ0aLhdZGKDHeMU="
}, },
{ {
@@ -59,8 +59,8 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "dc563cbb03da396d23bc49f33d4f7ae28db514a4", "version": "64aa37e837b0e93bfc6fab9430f57bd7366e5a83",
"sum": "ZBRziwnNo3LPC4XhIjpWahz+gT+w3i2+klIcHx2r7d0=" "sum": "1GDIeGMtvcen1PVF/XXLWv3JbCdV2ZbHcvecU9xZCFo="
}, },
{ {
"source": { "source": {
@@ -69,7 +69,7 @@
"subdir": "lib/promgrafonnet" "subdir": "lib/promgrafonnet"
} }
}, },
"version": "dc563cbb03da396d23bc49f33d4f7ae28db514a4", "version": "64aa37e837b0e93bfc6fab9430f57bd7366e5a83",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc=" "sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
}, },
{ {
@@ -79,7 +79,7 @@
"subdir": "jsonnet/kube-state-metrics" "subdir": "jsonnet/kube-state-metrics"
} }
}, },
"version": "44818d1538841379e1a1ea88d555f0249c1f464b", "version": "5fd7281987d57c02e65ca5479c90099814e68ee5",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA=" "sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
}, },
{ {
@@ -89,8 +89,8 @@
"subdir": "jsonnet/kube-state-metrics-mixin" "subdir": "jsonnet/kube-state-metrics-mixin"
} }
}, },
"version": "44818d1538841379e1a1ea88d555f0249c1f464b", "version": "5fd7281987d57c02e65ca5479c90099814e68ee5",
"sum": "o5avaguRsfFwYFNen00ZEsub1x4i8Z/ZZ2QoEjFMff8=" "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
}, },
{ {
"source": { "source": {
@@ -99,8 +99,8 @@
"subdir": "jsonnet/prometheus-operator" "subdir": "jsonnet/prometheus-operator"
} }
}, },
"version": "312d675008306b13c24d241bf4f0a882dbfa90d8", "version": "96094ad1ab039950537df448b95bbcc04c57bfc4",
"sum": "NPuLvqEmYZ+dCQ/9U4wXtobBD6hYreEx3jPpLQKS/ig=" "sum": "ReamRYoS2C39Of7KtXGqkSWdfHw5Fy/Ix6ujOmBLFAg="
}, },
{ {
"source": { "source": {
@@ -109,8 +109,8 @@
"subdir": "docs/node-mixin" "subdir": "docs/node-mixin"
} }
}, },
"version": "3b035c8fa1f75c4c00e57acc14fb71dfd62e31ee", "version": "d8a1585f59ef1169837d08979ecc92dcea8aa58a",
"sum": "3jFV2qsc/GZe2GADswTYqxxP2zGOiANTj73W/VNFGqc=" "sum": "EE+C+Krf518EGLjA/x3ZvKfenCI0J7YuwFJVBscypRw="
}, },
{ {
"source": { "source": {

View File

@@ -2030,6 +2030,9 @@ items:
"id": 5, "id": 5,
"lines": true, "lines": true,
"linewidth": 1, "linewidth": 1,
"links": [
],
"minSpan": 24, "minSpan": 24,
"nullPointMode": "null as zero", "nullPointMode": "null as zero",
"renderer": "flot", "renderer": "flot",
@@ -11381,7 +11384,7 @@ items:
"step": 10 "step": 10
}, },
{ {
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table", "format": "table",
"instant": true, "instant": true,
"intervalFactor": 2, "intervalFactor": 2,
@@ -11399,7 +11402,7 @@ items:
"step": 10 "step": 10
}, },
{ {
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table", "format": "table",
"instant": true, "instant": true,
"intervalFactor": 2, "intervalFactor": 2,
@@ -19273,6 +19276,9 @@ items:
"id": 5, "id": 5,
"lines": true, "lines": true,
"linewidth": 1, "linewidth": 1,
"links": [
],
"minSpan": 24, "minSpan": 24,
"nullPointMode": "null as zero", "nullPointMode": "null as zero",
"renderer": "flot", "renderer": "flot",
@@ -20644,6 +20650,9 @@ items:
"id": 5, "id": 5,
"lines": true, "lines": true,
"linewidth": 1, "linewidth": 1,
"links": [
],
"minSpan": 24, "minSpan": 24,
"nullPointMode": "null as zero", "nullPointMode": "null as zero",
"renderer": "flot", "renderer": "flot",

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1 app.kubernetes.io/version: v0.42.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring
spec: spec:
@@ -19,4 +19,4 @@ spec:
matchLabels: matchLabels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1 app.kubernetes.io/version: v0.42.0

View File

@@ -788,10 +788,11 @@ spec:
rules: rules:
- alert: KubeStateMetricsListErrors - alert: KubeStateMetricsListErrors
annotations: annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in description: kube-state-metrics is experiencing errors at an elevated rate
list operations. This is likely causing it to not be able to expose metrics in list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all. about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: | expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/ /
@@ -802,10 +803,11 @@ spec:
severity: critical severity: critical
- alert: KubeStateMetricsWatchErrors - alert: KubeStateMetricsWatchErrors
annotations: annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in description: kube-state-metrics is experiencing errors at an elevated rate
watch operations. This is likely causing it to not be able to expose metrics in watch operations. This is likely causing it to not be able to expose
about Kubernetes objects correctly or at all. metrics about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: | expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/ /
@@ -1020,13 +1022,36 @@ spec:
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is
in degraded state due to one or more disks failures. Number of spare drives
is insufficient to fix issue automatically.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded
summary: RAID Array is degraded
expr: |
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure
summary: Failed device in RAID array
expr: |
node_md_disks{state="fail"} > 0
labels:
severity: warning
- name: kubernetes-apps - name: kubernetes-apps
rules: rules:
- alert: KubePodCrashLooping - alert: KubePodCrashLooping
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
summary: Pod is crash looping.
expr: | expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
for: 15m for: 15m
@@ -1034,9 +1059,10 @@ spec:
severity: warning severity: warning
- alert: KubePodNotReady - alert: KubePodNotReady
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes. state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: | expr: |
sum by (namespace, pod) ( sum by (namespace, pod) (
max by(namespace, pod) ( max by(namespace, pod) (
@@ -1050,10 +1076,11 @@ spec:
severity: warning severity: warning
- alert: KubeDeploymentGenerationMismatch - alert: KubeDeploymentGenerationMismatch
annotations: annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has }} does not match, this indicates that the Deployment has failed but has
not been rolled back. not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back
expr: | expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"} kube_deployment_status_observed_generation{job="kube-state-metrics"}
!= !=
@@ -1063,9 +1090,10 @@ spec:
severity: warning severity: warning
- alert: KubeDeploymentReplicasMismatch - alert: KubeDeploymentReplicasMismatch
annotations: annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
matched the expected number of replicas for longer than 15 minutes. not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: | expr: |
( (
kube_deployment_spec_replicas{job="kube-state-metrics"} kube_deployment_spec_replicas{job="kube-state-metrics"}
@@ -1081,9 +1109,10 @@ spec:
severity: warning severity: warning
- alert: KubeStatefulSetReplicasMismatch - alert: KubeStatefulSetReplicasMismatch
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
not matched the expected number of replicas for longer than 15 minutes. has not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: | expr: |
( (
kube_statefulset_status_replicas_ready{job="kube-state-metrics"} kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
@@ -1099,10 +1128,11 @@ spec:
severity: warning severity: warning
- alert: KubeStatefulSetGenerationMismatch - alert: KubeStatefulSetGenerationMismatch
annotations: annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has }} does not match, this indicates that the StatefulSet has failed but has
not been rolled back. not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: | expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"} kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!= !=
@@ -1112,9 +1142,10 @@ spec:
severity: warning severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut - alert: KubeStatefulSetUpdateNotRolledOut
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
has not been rolled out. update has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: | expr: |
( (
max without (revision) ( max without (revision) (
@@ -1138,9 +1169,10 @@ spec:
severity: warning severity: warning
- alert: KubeDaemonSetRolloutStuck - alert: KubeDaemonSetRolloutStuck
annotations: annotations:
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has
finished or progressed for at least 15 minutes. not finished or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: | expr: |
( (
( (
@@ -1170,9 +1202,10 @@ spec:
severity: warning severity: warning
- alert: KubeContainerWaiting - alert: KubeContainerWaiting
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
has been in waiting state for longer than 1 hour. has been in waiting state for longer than 1 hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: | expr: |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
for: 1h for: 1h
@@ -1180,9 +1213,10 @@ spec:
severity: warning severity: warning
- alert: KubeDaemonSetNotScheduled - alert: KubeDaemonSetNotScheduled
annotations: annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.' }} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: | expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
- -
@@ -1192,9 +1226,10 @@ spec:
severity: warning severity: warning
- alert: KubeDaemonSetMisScheduled - alert: KubeDaemonSetMisScheduled
annotations: annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.' }} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr: | expr: |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
for: 15m for: 15m
@@ -1202,9 +1237,10 @@ spec:
severity: warning severity: warning
- alert: KubeJobCompletion - alert: KubeJobCompletion
annotations: annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
than 12 hours to complete. more than 12 hours to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
summary: Job did not complete in time
expr: | expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 12h for: 12h
@@ -1212,8 +1248,10 @@ spec:
severity: warning severity: warning
- alert: KubeJobFailed - alert: KubeJobFailed
annotations: annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
summary: Job failed to complete.
expr: | expr: |
kube_job_failed{job="kube-state-metrics"} > 0 kube_job_failed{job="kube-state-metrics"} > 0
for: 15m for: 15m
@@ -1221,9 +1259,10 @@ spec:
severity: warning severity: warning
- alert: KubeHpaReplicasMismatch - alert: KubeHpaReplicasMismatch
annotations: annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched
desired number of replicas for longer than 15 minutes. the desired number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
summary: HPA has not matched desired number of replicas.
expr: | expr: |
(kube_hpa_status_desired_replicas{job="kube-state-metrics"} (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
!= !=
@@ -1235,9 +1274,10 @@ spec:
severity: warning severity: warning
- alert: KubeHpaMaxedOut - alert: KubeHpaMaxedOut
annotations: annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running
max replicas for longer than 15 minutes. at max replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
summary: HPA is running at max replicas
expr: | expr: |
kube_hpa_status_current_replicas{job="kube-state-metrics"} kube_hpa_status_current_replicas{job="kube-state-metrics"}
== ==
@@ -1249,9 +1289,10 @@ spec:
rules: rules:
- alert: KubeCPUOvercommit - alert: KubeCPUOvercommit
annotations: annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot description: Cluster has overcommitted CPU resource requests for Pods and
tolerate node failure. cannot tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: | expr: |
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
/ /
@@ -1263,9 +1304,10 @@ spec:
severity: warning severity: warning
- alert: KubeMemoryOvercommit - alert: KubeMemoryOvercommit
annotations: annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot description: Cluster has overcommitted memory resource requests for Pods and
tolerate node failure. cannot tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: | expr: |
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
/ /
@@ -1279,8 +1321,9 @@ spec:
severity: warning severity: warning
- alert: KubeCPUQuotaOvercommit - alert: KubeCPUQuotaOvercommit
annotations: annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces. description: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: | expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
/ /
@@ -1291,8 +1334,9 @@ spec:
severity: warning severity: warning
- alert: KubeMemoryQuotaOvercommit - alert: KubeMemoryQuotaOvercommit
annotations: annotations:
message: Cluster has overcommitted memory resource requests for Namespaces. description: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
expr: | expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
/ /
@@ -1301,25 +1345,55 @@ spec:
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
- alert: KubeQuotaFullyUsed - alert: KubeQuotaAlmostFull
annotations: annotations:
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota. }} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull
summary: Namespace quota is going to be full.
expr: | expr: |
kube_resourcequota{job="kube-state-metrics", type="used"} kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type) / ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
>= 1 > 0.9 < 1
for: 15m for: 15m
labels: labels:
severity: info severity: info
- alert: KubeQuotaFullyUsed
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
summary: Namespace quota is fully used.
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
== 1
for: 15m
labels:
severity: info
- alert: KubeQuotaExceeded
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
summary: Namespace quota has exceeded the limits.
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 1
for: 15m
labels:
severity: warning
- alert: CPUThrottlingHigh - alert: CPUThrottlingHigh
annotations: annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ {{ $labels.namespace }} for container {{ $labels.container }} in pod {{
$labels.pod }}.' $labels.pod }}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: | expr: |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
/ /
@@ -1332,10 +1406,11 @@ spec:
rules: rules:
- alert: KubePersistentVolumeFillingUp - alert: KubePersistentVolumeFillingUp
annotations: annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}} free. }} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: | expr: |
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/ /
@@ -1346,10 +1421,12 @@ spec:
severity: critical severity: critical
- alert: KubePersistentVolumeFillingUp - alert: KubePersistentVolumeFillingUp
annotations: annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim description: Based on recent sampling, the PersistentVolume claimed by {{
}} in Namespace {{ $labels.namespace }} is expected to fill up within four $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is
days. Currently {{ $value | humanizePercentage }} is available. expected to fill up within four days. Currently {{ $value | humanizePercentage
}} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: | expr: |
( (
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
@@ -1363,9 +1440,10 @@ spec:
severity: warning severity: warning
- alert: KubePersistentVolumeErrors - alert: KubePersistentVolumeErrors
annotations: annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{ description: The persistent volume {{ $labels.persistentvolume }} has status
$labels.phase }}. {{ $labels.phase }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
summary: PersistentVolume is having issues with provisioning.
expr: | expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
for: 5m for: 5m
@@ -1375,9 +1453,10 @@ spec:
rules: rules:
- alert: KubeVersionMismatch - alert: KubeVersionMismatch
annotations: annotations:
message: There are {{ $value }} different semantic versions of Kubernetes description: There are {{ $value }} different semantic versions of Kubernetes
components running. components running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: | expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m for: 15m
@@ -1385,9 +1464,10 @@ spec:
severity: warning severity: warning
- alert: KubeClientErrors - alert: KubeClientErrors
annotations: annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ $value | humanizePercentage }} errors.' }}' is experiencing {{ $value | humanizePercentage }} errors.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: | expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
/ /
@@ -1400,8 +1480,9 @@ spec:
rules: rules:
- alert: KubeAPIErrorBudgetBurn - alert: KubeAPIErrorBudgetBurn
annotations: annotations:
message: The API server is burning too much error budget description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: | expr: |
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
and and
@@ -1413,8 +1494,9 @@ spec:
short: 5m short: 5m
- alert: KubeAPIErrorBudgetBurn - alert: KubeAPIErrorBudgetBurn
annotations: annotations:
message: The API server is burning too much error budget description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: | expr: |
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
and and
@@ -1426,8 +1508,9 @@ spec:
short: 30m short: 30m
- alert: KubeAPIErrorBudgetBurn - alert: KubeAPIErrorBudgetBurn
annotations: annotations:
message: The API server is burning too much error budget description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: | expr: |
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
and and
@@ -1439,8 +1522,9 @@ spec:
short: 2h short: 2h
- alert: KubeAPIErrorBudgetBurn - alert: KubeAPIErrorBudgetBurn
annotations: annotations:
message: The API server is burning too much error budget description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: | expr: |
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
and and
@@ -1454,47 +1538,52 @@ spec:
rules: rules:
- alert: KubeClientCertificateExpiration - alert: KubeClientCertificateExpiration
annotations: annotations:
message: A client certificate used to authenticate to the apiserver is expiring description: A client certificate used to authenticate to the apiserver is
in less than 7.0 days. expiring in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels: labels:
severity: warning severity: warning
- alert: KubeClientCertificateExpiration - alert: KubeClientCertificateExpiration
annotations: annotations:
message: A client certificate used to authenticate to the apiserver is expiring description: A client certificate used to authenticate to the apiserver is
in less than 24.0 hours. expiring in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels: labels:
severity: critical severity: critical
- alert: AggregatedAPIErrors - alert: AggregatedAPIErrors
annotations: annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
reported errors. The number of errors have increased for it in the past has reported errors. The number of errors have increased for it in the past
five minutes. High values indicate that the availability of the service five minutes. High values indicate that the availability of the service
changes too often. changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
summary: An aggregated API has reported errors.
expr: | expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels: labels:
severity: warning severity: warning
- alert: AggregatedAPIDown - alert: AggregatedAPIDown
annotations: annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
been only {{ $value | humanize }}% available over the last 5m. has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
summary: An aggregated API is down.
expr: | expr: |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90 (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
- alert: KubeAPIDown - alert: KubeAPIDown
annotations: annotations:
message: KubeAPI has disappeared from Prometheus target discovery. description: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
summary: Target disappeared from Prometheus target discovery.
expr: | expr: |
absent(up{job="apiserver"} == 1) absent(up{job="apiserver"} == 1)
for: 15m for: 15m
@@ -1504,8 +1593,9 @@ spec:
rules: rules:
- alert: KubeNodeNotReady - alert: KubeNodeNotReady
annotations: annotations:
message: '{{ $labels.node }} has been unready for more than 15 minutes.' description: '{{ $labels.node }} has been unready for more than 15 minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
summary: Node is not ready.
expr: | expr: |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 15m for: 15m
@@ -1513,17 +1603,21 @@ spec:
severity: warning severity: warning
- alert: KubeNodeUnreachable - alert: KubeNodeUnreachable
annotations: annotations:
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' description: '{{ $labels.node }} is unreachable and some workloads may be
rescheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
summary: Node is unreachable.
expr: | expr: |
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
for: 15m
labels: labels:
severity: warning severity: warning
- alert: KubeletTooManyPods - alert: KubeletTooManyPods
annotations: annotations:
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity. }} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
summary: Kubelet is running at capacity.
expr: | expr: |
count by(node) ( count by(node) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
@@ -1537,9 +1631,10 @@ spec:
severity: warning severity: warning
- alert: KubeNodeReadinessFlapping - alert: KubeNodeReadinessFlapping
annotations: annotations:
message: The readiness status of node {{ $labels.node }} has changed {{ $value description: The readiness status of node {{ $labels.node }} has changed {{
}} times in the last 15 minutes. $value }} times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: | expr: |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
for: 15m for: 15m
@@ -1547,9 +1642,10 @@ spec:
severity: warning severity: warning
- alert: KubeletPlegDurationHigh - alert: KubeletPlegDurationHigh
annotations: annotations:
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
of {{ $value }} seconds on node {{ $labels.node }}. duration of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
expr: | expr: |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m for: 5m
@@ -1557,18 +1653,82 @@ spec:
severity: warning severity: warning
- alert: KubeletPodStartUpLatencyHigh - alert: KubeletPodStartUpLatencyHigh
annotations: annotations:
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}. on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: | expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: |
kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: |
kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: |
kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: |
kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its client
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors
summary: Kubelet has failed to renew its client certificate.
expr: |
increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its server
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
expr: |
increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletDown - alert: KubeletDown
annotations: annotations:
message: Kubelet has disappeared from Prometheus target discovery. description: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
summary: Target disappeared from Prometheus target discovery.
expr: | expr: |
absent(up{job="kubelet", metrics_path="/metrics"} == 1) absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m for: 15m
@@ -1578,8 +1738,9 @@ spec:
rules: rules:
- alert: KubeSchedulerDown - alert: KubeSchedulerDown
annotations: annotations:
message: KubeScheduler has disappeared from Prometheus target discovery. description: KubeScheduler has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
summary: Target disappeared from Prometheus target discovery.
expr: | expr: |
absent(up{job="kube-scheduler"} == 1) absent(up{job="kube-scheduler"} == 1)
for: 15m for: 15m
@@ -1589,8 +1750,10 @@ spec:
rules: rules:
- alert: KubeControllerManagerDown - alert: KubeControllerManagerDown
annotations: annotations:
message: KubeControllerManager has disappeared from Prometheus target discovery. description: KubeControllerManager has disappeared from Prometheus target
discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
summary: Target disappeared from Prometheus target discovery.
expr: | expr: |
absent(up{job="kube-controller-manager"} == 1) absent(up{job="kube-controller-manager"} == 1)
for: 15m for: 15m

View File

@@ -1717,6 +1717,12 @@ spec:
under. This is necessary to generate correct URLs. This is necessary under. This is necessary to generate correct URLs. This is necessary
if Alertmanager is not served from root of a DNS name. if Alertmanager is not served from root of a DNS name.
type: string type: string
forceEnableClusterMode:
description: ForceEnableClusterMode ensures Alertmanager does not
deactivate the cluster mode when running with a single replica.
Use case is e.g. spanning an Alertmanager cluster across Kubernetes
clusters with a single replica in each.
type: boolean
image: image:
description: Image if specified has precedence over baseImage, tag description: Image if specified has precedence over baseImage, tag
and sha combinations. Specifying the version is still necessary and sha combinations. Specifying the version is still necessary

View File

@@ -757,6 +757,10 @@ spec:
scheme: scheme:
description: Scheme to use when firing alerts. description: Scheme to use when firing alerts.
type: string type: string
timeout:
description: Timeout is a per-target Alertmanager timeout
when pushing alerts.
type: string
tlsConfig: tlsConfig:
description: TLS Config to use for alertmanager connection. description: TLS Config to use for alertmanager connection.
properties: properties:

View File

@@ -238,8 +238,9 @@ spec:
anyOf: anyOf:
- type: integer - type: integer
- type: string - type: string
description: Name or number of the pod port this endpoint refers description: Name or number of the target port of the Pod behind
to. Mutually exclusive with port. the Service, the port must be specified with container port
property. Mutually exclusive with port.
x-kubernetes-int-or-string: true x-kubernetes-int-or-string: true
tlsConfig: tlsConfig:
description: TLS configuration to use when scraping the endpoint description: TLS configuration to use when scraping the endpoint

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1 app.kubernetes.io/version: v0.42.0
name: prometheus-operator name: prometheus-operator
rules: rules:
- apiGroups: - apiGroups:

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1 app.kubernetes.io/version: v0.42.0
name: prometheus-operator name: prometheus-operator
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1 app.kubernetes.io/version: v0.42.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring
spec: spec:
@@ -18,15 +18,15 @@ spec:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1 app.kubernetes.io/version: v0.42.0
spec: spec:
containers: containers:
- args: - args:
- --kubelet-service=kube-system/kubelet - --kubelet-service=kube-system/kubelet
- --logtostderr=true - --logtostderr=true
- --config-reloader-image=jimmidyson/configmap-reload:v0.4.0 - --config-reloader-image=jimmidyson/configmap-reload:v0.4.0
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.41.1 - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.0
image: quay.io/coreos/prometheus-operator:v0.41.1 image: quay.io/prometheus-operator/prometheus-operator:v0.42.0
name: prometheus-operator name: prometheus-operator
ports: ports:
- containerPort: 8080 - containerPort: 8080

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1 app.kubernetes.io/version: v0.42.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring
spec: spec:

View File

@@ -4,6 +4,6 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1 app.kubernetes.io/version: v0.42.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring