Merge pull request #191 from paulfantom/node-mixins

Add node_exporter mixin
This commit is contained in:
Paweł Krupa
2019-08-26 13:46:01 +02:00
committed by GitHub
8 changed files with 2174 additions and 96 deletions

View File

@@ -1,37 +1,6 @@
{
prometheusAlerts+:: {
groups+: [
// Alert group with two NodeDiskRunningFull rules built on the
// node:node_filesystem_usage: and node:node_filesystem_avail: series.
// Both expressions are passed through `% $._config`, although neither text
// block contains a %(...)s placeholder.
{
name: 'kube-prometheus-node-alerting.rules',
rules: [
// Warning: usage above 0.85 and the 6h linear trend of available space
// drops below zero within 3600 * 24 seconds; must hold for 30m.
{
alert: 'NodeDiskRunningFull',
annotations: {
message: 'Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 24 hours.',
},
expr: |||
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
||| % $._config,
'for': '30m',
labels: {
severity: 'warning',
},
},
// Critical: same usage threshold, but the trend is taken over 30m and
// projected 3600 * 2 seconds ahead; must hold for 10m.
{
alert: 'NodeDiskRunningFull',
annotations: {
message: 'Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 2 hours.',
},
expr: |||
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
},
],
},
{
name: 'node-time',
rules: [
@@ -53,32 +22,6 @@
{
name: 'node-network',
rules: [
// Warning when any interface matched by the configured node-exporter and
// host-network-interface selectors shows a non-zero receive error rate over
// 2m, held for 2m.
// The message previously ended with a stray double quote after
// {{ $labels.pod }}, which left the quoting unbalanced in the rendered text.
{
  alert: 'NetworkReceiveErrors',
  annotations: {
    message: 'Network interface "{{ $labels.device }}" showing receive errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}',
  },
  expr: |||
    rate(node_network_receive_errs_total{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 0
  ||| % $._config,
  'for': '2m',
  labels: {
    severity: 'warning',
  },
},
// Warning when any interface matched by the configured node-exporter and
// host-network-interface selectors shows a non-zero transmit error rate over
// 2m, held for 2m.
// The message previously ended with a stray double quote after
// {{ $labels.pod }}, which left the quoting unbalanced in the rendered text.
{
  alert: 'NetworkTransmitErrors',
  annotations: {
    message: 'Network interface "{{ $labels.device }}" showing transmit errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}',
  },
  expr: |||
    rate(node_network_transmit_errs_total{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 0
  ||| % $._config,
  'for': '2m',
  labels: {
    severity: 'warning',
  },
},
{
alert: 'NodeNetworkInterfaceFlapping',
annotations: {

View File

@@ -59,6 +59,16 @@
}
},
"version": "master"
},
{
"name": "node-mixin",
"source": {
"git": {
"remote": "https://github.com/prometheus/node_exporter",
"subdir": "docs/node-mixin"
}
},
"version": "master"
}
]
}

View File

@@ -5,6 +5,7 @@ local configMapList = k3.core.v1.configMapList;
(import 'grafana/grafana.libsonnet') +
(import 'kube-state-metrics/kube-state-metrics.libsonnet') +
(import 'node-exporter/node-exporter.libsonnet') +
(import 'node-mixin/mixin.libsonnet') +
(import 'alertmanager/alertmanager.libsonnet') +
(import 'prometheus-operator/prometheus-operator.libsonnet') +
(import 'prometheus/prometheus.libsonnet') +

View File

@@ -88,6 +88,16 @@
}
},
"version": "3638e4ab18ac320c3ed0b607f07aea309dadee45"
},
{
"name": "node-mixin",
"source": {
"git": {
"remote": "https://github.com/prometheus/node_exporter",
"subdir": "docs/node-mixin"
}
},
"version": "154d59dee72b894f7245d8d78c9344d1211d521f"
}
]
}

File diff suppressed because it is too large Load Diff

View File

@@ -72,6 +72,12 @@ spec:
# Dashboard mounts: each named volume is exposed read-write under
# /grafana-dashboard-definitions/0/<dashboard-name>.
- mountPath: /grafana-dashboard-definitions/0/kubelet
name: grafana-dashboard-kubelet
readOnly: false
# Cluster-level node resource-usage dashboard.
- mountPath: /grafana-dashboard-definitions/0/node-cluster-rsrc-use
name: grafana-dashboard-node-cluster-rsrc-use
readOnly: false
# Per-node resource-usage dashboard.
- mountPath: /grafana-dashboard-definitions/0/node-rsrc-use
name: grafana-dashboard-node-rsrc-use
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/nodes
name: grafana-dashboard-nodes
readOnly: false
@@ -141,6 +147,12 @@ spec:
# Dashboard volumes: each entry is backed by a ConfigMap and the volume name
# is identical to the ConfigMap name.
- configMap:
name: grafana-dashboard-kubelet
name: grafana-dashboard-kubelet
# Volume for the cluster-level node resource-usage dashboard.
- configMap:
name: grafana-dashboard-node-cluster-rsrc-use
name: grafana-dashboard-node-cluster-rsrc-use
# Volume for the per-node resource-usage dashboard.
- configMap:
name: grafana-dashboard-node-rsrc-use
name: grafana-dashboard-node-rsrc-use
- configMap:
name: grafana-dashboard-nodes
name: grafana-dashboard-nodes

View File

@@ -26,7 +26,7 @@ spec:
name: node-exporter
resources:
limits:
cpu: 102m
cpu: 250m
memory: 180Mi
requests:
cpu: 102m

View File

@@ -8,6 +8,67 @@ metadata:
namespace: monitoring
spec:
groups:
# Recording rules derived from node-exporter metrics; every selector is
# restricted to job="node-exporter".
- name: node-exporter.rules
rules:
# CPU count per instance: collapse the mode label first, then count the
# remaining per-cpu series.
- expr: |
count without (cpu) (
count without (mode) (
node_cpu_seconds_total{job="node-exporter"}
)
)
record: instance:node_num_cpu:sum
# CPU utilisation: 1 minus the 1m idle rate averaged across cpu and mode.
- expr: |
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
)
record: instance:node_cpu_utilisation:rate1m
# 1-minute load divided by the CPU count recorded as
# instance:node_num_cpu:sum.
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
# Memory utilisation: 1 - MemAvailable / MemTotal.
- expr: |
1 - (
node_memory_MemAvailable_bytes{job="node-exporter"}
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
# Sum of the 1m page-in and page-out rates.
# NOTE(review): pgpgin/pgpgout look like general paging counters rather than
# swap-specific ones; confirm the "swap_io" record name is intended.
- expr: |
(
rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+
rate(node_vmstat_pgpgout{job="node-exporter"}[1m])
)
record: instance:node_memory_swap_io_pages:rate1m
# 1m rate of disk I/O time, limited to devices matching the
# nvme/rbd/sd/vd/xvd/dm- name patterns.
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
# 1m rate of weighted disk I/O time for the same device patterns.
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
# Received bytes per second summed over all devices except "lo".
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate1m
# Transmitted bytes per second summed over all devices except "lo".
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
# Receive drops per second summed over all devices except "lo".
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_drop_excluding_lo:rate1m
# Transmit drops per second summed over all devices except "lo".
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate1m
- name: k8s.rules
rules:
- expr: |
@@ -345,6 +406,162 @@ spec:
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: node-exporter
rules:
# Warning: a writable filesystem (node_filesystem_readonly == 0) has less than
# 40% of its space available and its 6h linear trend reaches zero within
# 24 hours; the condition must hold for 1h.
# The available/size ratio is scaled by 100 so that $value is a percentage,
# matching the `{{ printf "%.2f" $value }}%` text in the description. The
# unscaled ratio rendered e.g. 35% free as "0.35%". The firing condition is
# unchanged (ratio * 100 < 40 is the same as ratio < 0.4).
- alert: NodeFilesystemSpaceFillingUp
  annotations:
    description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
      has only {{ printf "%.2f" $value }}% available space left and is filling
      up.
    runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
    summary: Filesystem is predicted to run out of space within the next 24 hours.
  expr: |
    (
      node_filesystem_avail_bytes{job="node-exporter",} / node_filesystem_size_bytes{job="node-exporter",} * 100 < 40
    and
      predict_linear(node_filesystem_avail_bytes{job="node-exporter",}[6h], 24*60*60) < 0
    and
      node_filesystem_readonly{job="node-exporter",} == 0
    )
  for: 1h
  labels:
    severity: warning
# Critical: a writable filesystem (node_filesystem_readonly == 0) has less
# than 20% of its space available and its 6h linear trend reaches zero within
# 4 hours; the condition must hold for 1h.
# The available/size ratio is scaled by 100 so that $value is a percentage,
# matching the `{{ printf "%.2f" $value }}%` text in the description. The
# unscaled ratio rendered e.g. 15% free as "0.15%". The firing condition is
# unchanged (ratio * 100 < 20 is the same as ratio < 0.2).
- alert: NodeFilesystemSpaceFillingUp
  annotations:
    description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
      has only {{ printf "%.2f" $value }}% available space left and is filling
      up fast.
    runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
    summary: Filesystem is predicted to run out of space within the next 4 hours.
  expr: |
    (
      node_filesystem_avail_bytes{job="node-exporter",} / node_filesystem_size_bytes{job="node-exporter",} * 100 < 20
    and
      predict_linear(node_filesystem_avail_bytes{job="node-exporter",}[6h], 4*60*60) < 0
    and
      node_filesystem_readonly{job="node-exporter",} == 0
    )
  for: 1h
  labels:
    severity: critical
# Warning: a writable filesystem (node_filesystem_readonly == 0) has less than
# 5% of its space available; the condition must hold for 1h.
# The ratio is multiplied by 100, so $value in the description is a percentage.
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",} / node_filesystem_size_bytes{job="node-exporter",} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",} == 0
)
for: 1h
labels:
severity: warning
# Critical: a writable filesystem (node_filesystem_readonly == 0) has less
# than 3% of its space available; the condition must hold for 1h.
# The ratio is multiplied by 100, so $value in the description is a percentage.
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",} / node_filesystem_size_bytes{job="node-exporter",} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",} == 0
)
for: 1h
labels:
severity: critical
# Warning: a writable filesystem (node_filesystem_readonly == 0) has less than
# 40% of its inodes free and the 6h linear trend of free inodes reaches zero
# within 24 hours; the condition must hold for 1h.
# The free/total ratio is scaled by 100 so that $value is a percentage,
# matching the `{{ printf "%.2f" $value }}%` text in the description. The
# unscaled ratio rendered e.g. 35% free as "0.35%". The firing condition is
# unchanged (ratio * 100 < 40 is the same as ratio < 0.4).
- alert: NodeFilesystemFilesFillingUp
  annotations:
    description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
      has only {{ printf "%.2f" $value }}% available inodes left and is filling
      up.
    runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
    summary: Filesystem is predicted to run out of inodes within the next 24 hours.
  expr: |
    (
      node_filesystem_files_free{job="node-exporter",} / node_filesystem_files{job="node-exporter",} * 100 < 40
    and
      predict_linear(node_filesystem_files_free{job="node-exporter",}[6h], 24*60*60) < 0
    and
      node_filesystem_readonly{job="node-exporter",} == 0
    )
  for: 1h
  labels:
    severity: warning
# Critical: a writable filesystem (node_filesystem_readonly == 0) has less
# than 20% of its inodes free and the 6h linear trend of free inodes reaches
# zero within 4 hours; the condition must hold for 1h.
# The free/total ratio is scaled by 100 so that $value is a percentage,
# matching the `{{ printf "%.2f" $value }}%` text in the description. The
# unscaled ratio rendered e.g. 15% free as "0.15%". The firing condition is
# unchanged (ratio * 100 < 20 is the same as ratio < 0.2).
- alert: NodeFilesystemFilesFillingUp
  annotations:
    description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
      has only {{ printf "%.2f" $value }}% available inodes left and is filling
      up fast.
    runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
    summary: Filesystem is predicted to run out of inodes within the next 4 hours.
  expr: |
    (
      node_filesystem_files_free{job="node-exporter",} / node_filesystem_files{job="node-exporter",} * 100 < 20
    and
      predict_linear(node_filesystem_files_free{job="node-exporter",}[6h], 4*60*60) < 0
    and
      node_filesystem_readonly{job="node-exporter",} == 0
    )
  for: 1h
  labels:
    severity: critical
# Warning: a writable filesystem (node_filesystem_readonly == 0) has less than
# 5% of its inodes free; the condition must hold for 1h.
# The ratio is multiplied by 100, so $value in the description is a percentage.
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",} / node_filesystem_files{job="node-exporter",} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",} == 0
)
for: 1h
labels:
severity: warning
# Critical: a writable filesystem (node_filesystem_readonly == 0) has less
# than 3% of its inodes free; the condition must hold for 1h.
# The ratio is multiplied by 100, so $value in the description is a percentage.
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",} / node_filesystem_files{job="node-exporter",} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",} == 0
)
for: 1h
labels:
severity: critical
# Warning: an interface accumulated more than 10 receive errors within a
# 2-minute window, and that condition has held for 1h.
# NOTE(review): unlike the filesystem alerts in this group, the selector has
# no job or device matcher, so every series of this metric is evaluated.
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
increase(node_network_receive_errs_total[2m]) > 10
for: 1h
labels:
severity: warning
# Warning: an interface accumulated more than 10 transmit errors within a
# 2-minute window, and that condition has held for 1h.
# NOTE(review): unlike the filesystem alerts in this group, the selector has
# no job or device matcher, so every series of this metric is evaluated.
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
increase(node_network_transmit_errs_total[2m]) > 10
for: 1h
labels:
severity: warning
- name: kubernetes-absent
rules:
- alert: AlertmanagerDown
@@ -1097,26 +1314,6 @@ spec:
expr: vector(1)
labels:
severity: none
# Alert group with two NodeDiskRunningFull rules built on the
# node:node_filesystem_usage: and node:node_filesystem_avail: series.
- name: kube-prometheus-node-alerting.rules
rules:
# Warning: usage above 0.85 and the 6h linear trend of available space drops
# below zero within 3600 * 24 seconds; must hold for 30m.
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} on node {{ $labels.instance }} will be
full within the next 24 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
for: 30m
labels:
severity: warning
# Critical: same usage threshold, but the trend is taken over 30m and
# projected 3600 * 2 seconds ahead; must hold for 10m.
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} on node {{ $labels.instance }} will be
full within the next 2 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
for: 10m
labels:
severity: critical
- name: node-time
rules:
- alert: ClockSkewDetected
@@ -1130,24 +1327,6 @@ spec:
severity: warning
- name: node-network
rules:
# Warning when a non-veth interface scraped by the node-exporter job shows a
# non-zero receive error rate over 2m, held for 2m.
# The message previously ended with a stray double quote after
# {{ $labels.pod }}, which left the quoting unbalanced in the rendered text.
- alert: NetworkReceiveErrors
  annotations:
    message: Network interface "{{ $labels.device }}" showing receive errors on
      node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
  expr: |
    rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
  for: 2m
  labels:
    severity: warning
# Warning when a non-veth interface scraped by the node-exporter job shows a
# non-zero transmit error rate over 2m, held for 2m.
# The message previously ended with a stray double quote after
# {{ $labels.pod }}, which left the quoting unbalanced in the rendered text.
- alert: NetworkTransmitErrors
  annotations:
    message: Network interface "{{ $labels.device }}" showing transmit errors
      on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
  expr: |
    rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
  for: 2m
  labels:
    severity: warning
- alert: NodeNetworkInterfaceFlapping
annotations:
message: Network interface "{{ $labels.device }}" changing it's up status