@@ -27,7 +27,7 @@
                     "subdir": ""
                 }
             },
-            "version": "e3d6d8ebb1789af0e17fb1f60171aaf64926a3a1"
+            "version": "d9b461b0692ddfff6c5d2a189443cfe4beefb3b2"
         },
         {
             "name": "grafonnet",
@@ -47,7 +47,7 @@
                     "subdir": "grafana-builder"
                 }
             },
-            "version": "3c44dfa9bfe2b66985733d4b16e0afd29094b4a0"
+            "version": "2b9b14d0d91adf8781e5b2c9b62dc8cb180a9886"
         },
         {
             "name": "grafana",
@@ -57,7 +57,7 @@
                     "subdir": "grafana"
                 }
             },
-            "version": "c27d2792764867cdaf6484f067cc875cb8aef2f6"
+            "version": "5df496bc1199b40bd066a8c228d94d9653173645"
         },
         {
             "name": "prometheus-operator",
@@ -77,7 +77,7 @@
                     "subdir": "Documentation/etcd-mixin"
                 }
             },
-            "version": "7948f39790fbbc979729ca6f990740a20d4a2a76"
+            "version": "efd1fc634b58a629903990e605f2cb9d5633706d"
         },
         {
             "name": "prometheus",
@@ -87,7 +87,7 @@
                     "subdir": "documentation/prometheus-mixin"
                 }
             },
-            "version": "3638e4ab18ac320c3ed0b607f07aea309dadee45"
+            "version": "08c55c119f39093e18b2bb9cba5c5619dc4ea0e1"
         },
         {
             "name": "node-mixin",
@@ -97,7 +97,7 @@
                     "subdir": "docs/node-mixin"
                 }
             },
-            "version": "e7c2dbed4e0278731b59e9870eb9a9d046047aa8"
+            "version": "27b8c93a5afc21632239890c4558c7300cca17d2"
         },
         {
             "name": "promgrafonnet",
@@ -107,7 +107,7 @@
                     "subdir": "lib/promgrafonnet"
                 }
             },
-            "version": "24ea0d6e33a415e07ec7b675d74dea3cf01fde73"
+            "version": "d9b461b0692ddfff6c5d2a189443cfe4beefb3b2"
         }
     ]
 }

@@ -5485,7 +5485,7 @@ items:
         "unit": "bytes"
       },
       {
-        "alias": "Memory Usage (Swap",
+        "alias": "Memory Usage (Swap)",
         "colorMode": null,
         "colors": [

@@ -19485,7 +19485,7 @@ items:
         "options": [

         ],
-        "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}, cluster=\"$cluster\", namespace)",
+        "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
         "refresh": 2,
         "regex": "",
         "sort": 0,

@@ -1,4 +1,4 @@
-apiVersion: apps/v1beta2
+apiVersion: apps/v1
 kind: Deployment
 metadata:
   labels:

@@ -497,7 +497,7 @@ spec:
           state for longer than 15 minutes.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
       expr: |
-        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0
+        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0
       for: 15m
       labels:
         severity: critical
@@ -630,7 +630,33 @@ spec:
         message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
       expr: |
-        kube_job_status_failed{job="kube-state-metrics"} > 0
+        kube_job_failed{job="kube-state-metrics"} > 0
+      for: 15m
+      labels:
+        severity: warning
+    - alert: KubeHpaReplicasMismatch
+      annotations:
+        message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
+          desired number of replicas for longer than 15 minutes.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
+      expr: |
+        (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
+          !=
+        kube_hpa_status_current_replicas{job="kube-state-metrics"})
+          and
+        changes(kube_hpa_status_current_replicas[15m]) == 0
+      for: 15m
+      labels:
+        severity: warning
+    - alert: KubeHpaMaxedOut
+      annotations:
+        message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at
+          max replicas for longer than 15 minutes.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
+      expr: |
+        kube_hpa_status_current_replicas{job="kube-state-metrics"}
+          ==
+        kube_hpa_spec_max_replicas{job="kube-state-metrics"}
       for: 15m
       labels:
         severity: warning
@@ -761,7 +787,7 @@ spec:
     rules:
     - alert: KubeNodeNotReady
       annotations:
-        message: '{{ $labels.node }} has been unready for more than an hour.'
+        message: '{{ $labels.node }} has been unready for more than 15 minutes.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
       expr: |
         kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
@@ -791,23 +817,13 @@ spec:
       for: 15m
       labels:
         severity: warning
-    - alert: KubeClientErrors
-      annotations:
-        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
-          }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
-        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
-      expr: |
-        sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
-      for: 15m
-      labels:
-        severity: warning
     - alert: KubeletTooManyPods
       annotations:
-        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
-          to the limit of 110.
+        message: Kubelet '{{ $labels.node }}' is running at {{ printf "%.4g" $value
+          }}% of its Pod capacity.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
-        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
+        100 * max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 95
       for: 15m
       labels:
         severity: warning
@@ -991,17 +1007,6 @@ spec:
       for: 4h
       labels:
         severity: warning
-    - alert: PrometheusTSDBWALCorruptions
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
-          {{$value | humanize}} corruptions of the write-ahead log (WAL) over the
-          last 3h.
-        summary: Prometheus is detecting WAL corruptions.
-      expr: |
-        increase(tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
-      for: 4h
-      labels:
-        severity: warning
     - alert: PrometheusNotIngestingSamples
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
@@ -1015,7 +1020,8 @@ spec:
     - alert: PrometheusDuplicateTimestamps
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
-          {{$value | humanize}} samples/s with different values but duplicated timestamp.
+          {{ printf "%.4g" $value }} samples/s with different values but duplicated
+          timestamp.
         summary: Prometheus is dropping samples with duplicate timestamps.
       expr: |
         rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -1025,7 +1031,7 @@ spec:
    - alert: PrometheusOutOfOrderTimestamps
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
-          {{$value | humanize}} samples/s with timestamps arriving out of order.
+          {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
         summary: Prometheus drops samples with out-of-order timestamps.
       expr: |
         rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -1069,6 +1075,25 @@ spec:
       for: 15m
       labels:
         severity: critical
+    - alert: PrometheusRemoteWriteDesiredShards
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
+          desired shards calculation wants to run {{ printf $value }} shards, which
+          is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}`
+          $labels.instance | query | first | value }}.
+        summary: Prometheus remote write desired shards calculation wants to run more
+          than configured max shards.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        (
+          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
+        > on(job, instance) group_right
+          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+      for: 15m
+      labels:
+        severity: warning
     - alert: PrometheusRuleFailures
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to