Grafana dashboards update

This commit is contained in:
Karol Pucynski
2019-10-14 16:00:06 +02:00
parent 1ebce4955a
commit e7c3ca314d
4 changed files with 65 additions and 57 deletions

View File

@@ -3093,7 +3093,7 @@ items:
"decimals": 0,
"link": true,
"linkTooltip": "Drill down to pods",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"pattern": "Value #A",
"thresholds": [
@@ -3111,7 +3111,7 @@ items:
"decimals": 0,
"link": true,
"linkTooltip": "Drill down to workloads",
"linkUrl": "/d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"pattern": "Value #B",
"thresholds": [
@@ -3219,7 +3219,7 @@ items:
"decimals": 2,
"link": true,
"linkTooltip": "Drill down to pods",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"pattern": "namespace",
"thresholds": [
@@ -3512,7 +3512,7 @@ items:
"decimals": 0,
"link": true,
"linkTooltip": "Drill down to pods",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"pattern": "Value #A",
"thresholds": [
@@ -3530,7 +3530,7 @@ items:
"decimals": 0,
"link": true,
"linkTooltip": "Drill down to workloads",
"linkUrl": "/d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"pattern": "Value #B",
"thresholds": [
@@ -3638,7 +3638,7 @@ items:
"decimals": 2,
"link": true,
"linkTooltip": "Drill down to pods",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"pattern": "namespace",
"thresholds": [
@@ -4130,7 +4130,7 @@ items:
"decimals": 2,
"link": true,
"linkTooltip": "Drill down",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [
@@ -4549,7 +4549,7 @@ items:
"decimals": 2,
"link": true,
"linkTooltip": "Drill down",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [
@@ -6067,7 +6067,7 @@ items:
"decimals": 2,
"link": true,
"linkTooltip": "Drill down",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [
@@ -6432,7 +6432,7 @@ items:
"decimals": 2,
"link": true,
"linkTooltip": "Drill down",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [
@@ -7005,7 +7005,7 @@ items:
"decimals": 2,
"link": true,
"linkTooltip": "Drill down",
"linkUrl": "/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2",
"linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2",
"pattern": "workload",
"thresholds": [
@@ -7415,7 +7415,7 @@ items:
"decimals": 2,
"link": true,
"linkTooltip": "Drill down",
"linkUrl": "/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2",
"linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2",
"pattern": "workload",
"thresholds": [

View File

@@ -242,7 +242,7 @@ spec:
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} < 0.4
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
@@ -260,7 +260,7 @@ spec:
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} < 0.2
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
@@ -308,7 +308,7 @@ spec:
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} < 0.4
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
@@ -326,7 +326,7 @@ spec:
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} < 0.2
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
@@ -573,13 +573,13 @@ spec:
severity: critical
- alert: KubeDaemonSetRolloutStuck
annotations:
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
}}/{{ $labels.daemonset }} are scheduled and ready.
message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet
{{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
expr: |
kube_daemonset_status_number_ready{job="kube-state-metrics"}
/
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
for: 15m
labels:
severity: critical
@@ -718,25 +718,28 @@ spec:
severity: warning
- alert: KubeQuotaExceeded
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
}}% of its {{ $labels.resource }} quota.
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
expr: |
100 * kube_resourcequota{job="kube-state-metrics", type="used"}
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 90
> 0.90
for: 15m
labels:
severity: warning
- alert: CPUThrottlingHigh
annotations:
message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace
}} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{
$labels.pod }}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\",
}[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m]))
by (container, pod, namespace)\n > 25 \n"
expr: |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
> ( 25 / 100 )
for: 15m
labels:
severity: warning
@@ -745,14 +748,14 @@ spec:
- alert: KubePersistentVolumeUsageCritical
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value
}}% free.
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
expr: |
100 * kubelet_volume_stats_available_bytes{job="kubelet"}
kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
< 3
< 0.03
for: 1m
labels:
severity: critical
@@ -760,14 +763,14 @@ spec:
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ printf "%0.2f" $value }}% is available.
days. Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
expr: |
100 * (
(
kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
) < 15
) < 0.15
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 5m
@@ -807,23 +810,23 @@ spec:
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }}% errors.'
}}' is experiencing {{ $value | humanizePercentage }} errors.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
/
sum(rate(rest_client_requests_total[5m])) by (instance, job))
* 100 > 1
> 0.01
for: 15m
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
message: Kubelet '{{ $labels.node }}' is running at {{ printf "%.4g" $value
}}% of its Pod capacity.
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: |
100 * max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 95
max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
for: 15m
labels:
severity: warning
@@ -849,47 +852,51 @@ spec:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
message: API server is returning errors for {{ $value | humanizePercentage
}} of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3
sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
message: API server is returning errors for {{ $value | humanizePercentage
}} of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1
sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01
for: 10m
labels:
severity: warning
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests for
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
message: API server is returning errors for {{ $value | humanizePercentage
}} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource
}}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.10
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests for
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
message: API server is returning errors for {{ $value | humanizePercentage
}} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource
}}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.05
for: 10m
labels:
severity: warning