Update dependencies and most importantly PromOp to v0.42.0

Matthias Loibl
2020-09-11 13:14:22 +02:00
parent e55b6a8272
commit 2d3c10e3f7
14 changed files with 284 additions and 101 deletions

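The dependency bumps below touch jsonnetfile.json, jsonnetfile.lock.json and the generated manifests. A minimal sketch of the workflow that produces such a change, assuming the usual jsonnet-bundler setup (the entry-point file and output directory names are assumptions, not taken from this repository):

# refresh the pinned commits in jsonnetfile.lock.json and re-vendor the dependencies
jb update

# regenerate the YAML manifests from the jsonnet entry point (assumed to be example.jsonnet)
jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat {} | gojsontoyaml > {}.yaml' -- {}
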
View File

@@ -26,7 +26,7 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "release-0.41"
"version": "release-0.42"
},
{
"source": {

View File

@@ -18,7 +18,7 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "ed27d9d2de4313f5e766eecc79996d011d5de4a8",
"version": "76e769ce95ca0d4d0e3486712d96956260db04b8",
"sum": "NhOkJWkO7ZO2DSE8Fvipcs7Hh2/GOCS0WjPPZU8OiaQ="
},
{
@@ -28,8 +28,8 @@
"subdir": "grafonnet"
}
},
"version": "8338be68387b5811355aa919d031882ead0c9c6e",
"sum": "RixjNQccHT+UlCSvlR4HBiFcTRVdwDz5goWSHEXIf58="
"version": "cc1626a1b4dee45c99b78ddd9714dfd5f5d7816e",
"sum": "nkgrtMYPCq/YB4r3mKyToepaLhicwWnxDdGIodPpzz0="
},
{
"source": {
@@ -38,7 +38,7 @@
"subdir": "grafana-builder"
}
},
"version": "de586e2ac76e9bcee87d34f0042abe1a2ef7cdf3",
"version": "797035a3d20100a6e8f7c973cee70e465f37b880",
"sum": "R5WJe6wW0R9vMpOAHaGFwcK8q4NmGZ0aLhdZGKDHeMU="
},
{
@@ -59,8 +59,8 @@
"subdir": ""
}
},
"version": "dc563cbb03da396d23bc49f33d4f7ae28db514a4",
"sum": "ZBRziwnNo3LPC4XhIjpWahz+gT+w3i2+klIcHx2r7d0="
"version": "64aa37e837b0e93bfc6fab9430f57bd7366e5a83",
"sum": "1GDIeGMtvcen1PVF/XXLWv3JbCdV2ZbHcvecU9xZCFo="
},
{
"source": {
@@ -69,7 +69,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "dc563cbb03da396d23bc49f33d4f7ae28db514a4",
"version": "64aa37e837b0e93bfc6fab9430f57bd7366e5a83",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
},
{
@@ -79,7 +79,7 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "44818d1538841379e1a1ea88d555f0249c1f464b",
"version": "5fd7281987d57c02e65ca5479c90099814e68ee5",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
},
{
@@ -89,8 +89,8 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "44818d1538841379e1a1ea88d555f0249c1f464b",
"sum": "o5avaguRsfFwYFNen00ZEsub1x4i8Z/ZZ2QoEjFMff8="
"version": "5fd7281987d57c02e65ca5479c90099814e68ee5",
"sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
},
{
"source": {
@@ -99,8 +99,8 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "312d675008306b13c24d241bf4f0a882dbfa90d8",
"sum": "NPuLvqEmYZ+dCQ/9U4wXtobBD6hYreEx3jPpLQKS/ig="
"version": "96094ad1ab039950537df448b95bbcc04c57bfc4",
"sum": "ReamRYoS2C39Of7KtXGqkSWdfHw5Fy/Ix6ujOmBLFAg="
},
{
"source": {
@@ -109,8 +109,8 @@
"subdir": "docs/node-mixin"
}
},
"version": "3b035c8fa1f75c4c00e57acc14fb71dfd62e31ee",
"sum": "3jFV2qsc/GZe2GADswTYqxxP2zGOiANTj73W/VNFGqc="
"version": "d8a1585f59ef1169837d08979ecc92dcea8aa58a",
"sum": "EE+C+Krf518EGLjA/x3ZvKfenCI0J7YuwFJVBscypRw="
},
{
"source": {

View File

@@ -2030,6 +2030,9 @@ items:
"id": 5,
"lines": true,
"linewidth": 1,
"links": [
],
"minSpan": 24,
"nullPointMode": "null as zero",
"renderer": "flot",
@@ -11381,7 +11384,7 @@ items:
"step": 10
},
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11399,7 +11402,7 @@ items:
"step": 10
},
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -19273,6 +19276,9 @@ items:
"id": 5,
"lines": true,
"linewidth": 1,
"links": [
],
"minSpan": 24,
"nullPointMode": "null as zero",
"renderer": "flot",
@@ -20644,6 +20650,9 @@ items:
"id": 5,
"lines": true,
"linewidth": 1,
"links": [
],
"minSpan": 24,
"nullPointMode": "null as zero",
"renderer": "flot",

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1
app.kubernetes.io/version: v0.42.0
name: prometheus-operator
namespace: monitoring
spec:
@@ -19,4 +19,4 @@ spec:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1
app.kubernetes.io/version: v0.42.0

View File

@@ -788,10 +788,11 @@ spec:
rules:
- alert: KubeStateMetricsListErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in
list operations. This is likely causing it to not be able to expose metrics
description: kube-state-metrics is experiencing errors at an elevated rate
in list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/
@@ -802,10 +803,11 @@ spec:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in
watch operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate
in watch operations. This is likely causing it to not be able to expose
metrics about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/
@@ -1020,13 +1022,36 @@ spec:
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is
in degraded state due to one or more disks failures. Number of spare drives
is insufficient to fix issue automatically.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded
summary: RAID Array is degraded
expr: |
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure
summary: Failed device in RAID array
expr: |
node_md_disks{state="fail"} > 0
labels:
severity: warning
- name: kubernetes-apps
rules:
- alert: KubePodCrashLooping
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
summary: Pod is crash looping.
expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
for: 15m
@@ -1034,9 +1059,10 @@ spec:
severity: warning
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |
sum by (namespace, pod) (
max by(namespace, pod) (
@@ -1050,10 +1076,11 @@ spec:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back
expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"}
!=
@@ -1063,9 +1090,10 @@ spec:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
matched the expected number of replicas for longer than 15 minutes.
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |
(
kube_deployment_spec_replicas{job="kube-state-metrics"}
@@ -1081,9 +1109,10 @@ spec:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
has not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
@@ -1099,10 +1128,11 @@ spec:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!=
@@ -1112,9 +1142,10 @@ spec:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
update has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: |
(
max without (revision) (
@@ -1138,9 +1169,10 @@ spec:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
finished or progressed for at least 15 minutes.
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has
not finished or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: |
(
(
@@ -1170,9 +1202,10 @@ spec:
severity: warning
- alert: KubeContainerWaiting
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
has been in waiting state for longer than 1 hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
for: 1h
@@ -1180,9 +1213,10 @@ spec:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
-
@@ -1192,9 +1226,10 @@ spec:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr: |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
for: 15m
@@ -1202,9 +1237,10 @@ spec:
severity: warning
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
than 12 hours to complete.
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
more than 12 hours to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
summary: Job did not complete in time
expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 12h
@@ -1212,8 +1248,10 @@ spec:
severity: warning
- alert: KubeJobFailed
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
summary: Job failed to complete.
expr: |
kube_job_failed{job="kube-state-metrics"} > 0
for: 15m
@@ -1221,9 +1259,10 @@ spec:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
desired number of replicas for longer than 15 minutes.
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched
the desired number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
summary: HPA has not matched descired number of replicas.
expr: |
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
!=
@@ -1235,9 +1274,10 @@ spec:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at
max replicas for longer than 15 minutes.
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running
at max replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
summary: HPA is running at max replicas
expr: |
kube_hpa_status_current_replicas{job="kube-state-metrics"}
==
@@ -1249,9 +1289,10 @@ spec:
rules:
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot
tolerate node failure.
description: Cluster has overcommitted CPU resource requests for Pods and
cannot tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
/
@@ -1263,9 +1304,10 @@ spec:
severity: warning
- alert: KubeMemoryOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
description: Cluster has overcommitted memory resource requests for Pods and
cannot tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
/
@@ -1279,8 +1321,9 @@ spec:
severity: warning
- alert: KubeCPUQuotaOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces.
description: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
/
@@ -1291,8 +1334,9 @@ spec:
severity: warning
- alert: KubeMemoryQuotaOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Namespaces.
description: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
/
@@ -1301,25 +1345,55 @@ spec:
for: 5m
labels:
severity: warning
- alert: KubeQuotaFullyUsed
- alert: KubeQuotaAlmostFull
annotations:
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull
summary: Namespace quota is going to be full.
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
>= 1
> 0.9 < 1
for: 15m
labels:
severity: info
- alert: KubeQuotaFullyUsed
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
summary: Namespace quota is fully used.
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
== 1
for: 15m
labels:
severity: info
- alert: KubeQuotaExceeded
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
summary: Namespace quota has exceeded the limits.
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 1
for: 15m
labels:
severity: warning
- alert: CPUThrottlingHigh
annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{
$labels.pod }}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
/
@@ -1332,10 +1406,11 @@ spec:
rules:
- alert: KubePersistentVolumeFillingUp
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/
@@ -1346,10 +1421,12 @@ spec:
severity: critical
- alert: KubePersistentVolumeFillingUp
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ $value | humanizePercentage }} is available.
description: Based on recent sampling, the PersistentVolume claimed by {{
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is
expected to fill up within four days. Currently {{ $value | humanizePercentage
}} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |
(
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
@@ -1363,9 +1440,10 @@ spec:
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{
$labels.phase }}.
description: The persistent volume {{ $labels.persistentvolume }} has status
{{ $labels.phase }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
summary: PersistentVolume is having issues with provisioning.
expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
for: 5m
@@ -1375,9 +1453,10 @@ spec:
rules:
- alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different semantic versions of Kubernetes
description: There are {{ $value }} different semantic versions of Kubernetes
components running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
@@ -1385,9 +1464,10 @@ spec:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ $value | humanizePercentage }} errors.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
/
@@ -1400,8 +1480,9 @@ spec:
rules:
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
and
@@ -1413,8 +1494,9 @@ spec:
short: 5m
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
and
@@ -1426,8 +1508,9 @@ spec:
short: 30m
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
and
@@ -1439,8 +1522,9 @@ spec:
short: 2h
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
and
@@ -1454,47 +1538,52 @@ spec:
rules:
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
description: A client certificate used to authenticate to the apiserver is
expiring in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
description: A client certificate used to authenticate to the apiserver is
expiring in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels:
severity: critical
- alert: AggregatedAPIErrors
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
reported errors. The number of errors have increased for it in the past
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
has reported errors. The number of errors have increased for it in the past
five minutes. High values indicate that the availability of the service
changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
summary: An aggregated API has reported errors.
expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels:
severity: warning
- alert: AggregatedAPIDown
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
been only {{ $value | humanize }}% available over the last 5m.
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
summary: An aggregated API is down.
expr: |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
- alert: KubeAPIDown
annotations:
message: KubeAPI has disappeared from Prometheus target discovery.
description: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="apiserver"} == 1)
for: 15m
@@ -1504,8 +1593,9 @@ spec:
rules:
- alert: KubeNodeNotReady
annotations:
message: '{{ $labels.node }} has been unready for more than 15 minutes.'
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
summary: Node is not ready.
expr: |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 15m
@@ -1513,17 +1603,21 @@ spec:
severity: warning
- alert: KubeNodeUnreachable
annotations:
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
description: '{{ $labels.node }} is unreachable and some workloads may be
rescheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
summary: Node is unreachable.
expr: |
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
for: 15m
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
summary: Kubelet is running at capacity.
expr: |
count by(node) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
@@ -1537,9 +1631,10 @@ spec:
severity: warning
- alert: KubeNodeReadinessFlapping
annotations:
message: The readiness status of node {{ $labels.node }} has changed {{ $value
}} times in the last 15 minutes.
description: The readiness status of node {{ $labels.node }} has changed {{
$value }} times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
for: 15m
@@ -1547,9 +1642,10 @@ spec:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
of {{ $value }} seconds on node {{ $labels.node }}.
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
duration of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
expr: |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
@@ -1557,18 +1653,82 @@ spec:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
for: 15m
labels:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: |
kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: |
kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: |
kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: |
kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its client
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors
summary: Kubelet has failed to renew its client certificate.
expr: |
increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its server
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
expr: |
increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletDown
annotations:
message: Kubelet has disappeared from Prometheus target discovery.
description: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
@@ -1578,8 +1738,9 @@ spec:
rules:
- alert: KubeSchedulerDown
annotations:
message: KubeScheduler has disappeared from Prometheus target discovery.
description: KubeScheduler has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="kube-scheduler"} == 1)
for: 15m
@@ -1589,8 +1750,10 @@ spec:
rules:
- alert: KubeControllerManagerDown
annotations:
message: KubeControllerManager has disappeared from Prometheus target discovery.
description: KubeControllerManager has disappeared from Prometheus target
discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="kube-controller-manager"} == 1)
for: 15m

View File

@@ -1717,6 +1717,12 @@ spec:
under. This is necessary to generate correct URLs. This is necessary
if Alertmanager is not served from root of a DNS name.
type: string
forceEnableClusterMode:
description: ForceEnableClusterMode ensures Alertmanager does not
deactivate the cluster mode when running with a single replica.
Use case is e.g. spanning an Alertmanager cluster across Kubernetes
clusters with a single replica in each.
type: boolean
image:
description: Image if specified has precedence over baseImage, tag
and sha combinations. Specifying the version is still necessary

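The forceEnableClusterMode field documented above is set on the Alertmanager custom resource. A minimal, hypothetical example (resource name and namespace are assumptions):

apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: main
  namespace: monitoring
spec:
  replicas: 1
  # keep the cluster mode active despite running a single replica,
  # e.g. when peering Alertmanagers across several Kubernetes clusters
  forceEnableClusterMode: true
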
View File

@@ -757,6 +757,10 @@ spec:
scheme:
description: Scheme to use when firing alerts.
type: string
timeout:
description: Timeout is a per-target Alertmanager timeout
when pushing alerts.
type: string
tlsConfig:
description: TLS Config to use for alertmanager connection.
properties:

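The new per-target timeout field documented above sits under the Prometheus resource's alerting configuration. A hypothetical sketch (resource and service names are assumptions):

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: k8s
  namespace: monitoring
spec:
  alerting:
    alertmanagers:
    - namespace: monitoring
      name: alertmanager-main
      port: web
      # per-target Alertmanager timeout when pushing alerts
      timeout: 10s
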
View File

@@ -238,8 +238,9 @@ spec:
anyOf:
- type: integer
- type: string
description: Name or number of the pod port this endpoint refers
to. Mutually exclusive with port.
description: Name or number of the target port of the Pod behind
the Service, the port must be specified with container port
property. Mutually exclusive with port.
x-kubernetes-int-or-string: true
tlsConfig:
description: TLS configuration to use when scraping the endpoint

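Assuming the hunk above comes from the ServiceMonitor CRD, the reworded targetPort field is used on an endpoint like this (hypothetical labels and port name, not part of this commit):

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: example-app
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: example-app
  endpoints:
  # targetPort refers to a named (or numbered) containerPort on the Pod behind
  # the Service; it is mutually exclusive with port (the Service port name)
  - targetPort: http-metrics
    interval: 30s
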
View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1
app.kubernetes.io/version: v0.42.0
name: prometheus-operator
rules:
- apiGroups:

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1
app.kubernetes.io/version: v0.42.0
name: prometheus-operator
roleRef:
apiGroup: rbac.authorization.k8s.io

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1
app.kubernetes.io/version: v0.42.0
name: prometheus-operator
namespace: monitoring
spec:
@@ -18,15 +18,15 @@ spec:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1
app.kubernetes.io/version: v0.42.0
spec:
containers:
- args:
- --kubelet-service=kube-system/kubelet
- --logtostderr=true
- --config-reloader-image=jimmidyson/configmap-reload:v0.4.0
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.41.1
image: quay.io/coreos/prometheus-operator:v0.41.1
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.0
image: quay.io/prometheus-operator/prometheus-operator:v0.42.0
name: prometheus-operator
ports:
- containerPort: 8080

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1
app.kubernetes.io/version: v0.42.0
name: prometheus-operator
namespace: monitoring
spec:

View File

@@ -4,6 +4,6 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.41.1
app.kubernetes.io/version: v0.42.0
name: prometheus-operator
namespace: monitoring