Fix node-mixin import

This commit is contained in:
Matthias Loibl
2020-09-11 18:09:05 +02:00
parent 09293866ae
commit e6111e58c4
4 changed files with 3196 additions and 24 deletions

View File

@@ -6,7 +6,7 @@ local configMapList = k3.core.v1.configMapList;
(import './kube-state-metrics/kube-state-metrics.libsonnet') + (import './kube-state-metrics/kube-state-metrics.libsonnet') +
(import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') + (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') +
(import './node-exporter/node-exporter.libsonnet') + (import './node-exporter/node-exporter.libsonnet') +
(import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') + (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') +
(import './alertmanager/alertmanager.libsonnet') + (import './alertmanager/alertmanager.libsonnet') +
(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet') + (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet') +
(import './prometheus/prometheus.libsonnet') + (import './prometheus/prometheus.libsonnet') +

File diff suppressed because it is too large Load Diff

View File

@@ -81,6 +81,15 @@ spec:
- mountPath: /grafana-dashboard-definitions/0/namespace-by-workload - mountPath: /grafana-dashboard-definitions/0/namespace-by-workload
name: grafana-dashboard-namespace-by-workload name: grafana-dashboard-namespace-by-workload
readOnly: false readOnly: false
- mountPath: /grafana-dashboard-definitions/0/node-cluster-rsrc-use
name: grafana-dashboard-node-cluster-rsrc-use
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/node-rsrc-use
name: grafana-dashboard-node-rsrc-use
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/nodes
name: grafana-dashboard-nodes
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage - mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage
name: grafana-dashboard-persistentvolumesusage name: grafana-dashboard-persistentvolumesusage
readOnly: false readOnly: false
@@ -157,6 +166,15 @@ spec:
- configMap: - configMap:
name: grafana-dashboard-namespace-by-workload name: grafana-dashboard-namespace-by-workload
name: grafana-dashboard-namespace-by-workload name: grafana-dashboard-namespace-by-workload
- configMap:
name: grafana-dashboard-node-cluster-rsrc-use
name: grafana-dashboard-node-cluster-rsrc-use
- configMap:
name: grafana-dashboard-node-rsrc-use
name: grafana-dashboard-node-rsrc-use
- configMap:
name: grafana-dashboard-nodes
name: grafana-dashboard-nodes
- configMap: - configMap:
name: grafana-dashboard-persistentvolumesusage name: grafana-dashboard-persistentvolumesusage
name: grafana-dashboard-persistentvolumesusage name: grafana-dashboard-persistentvolumesusage

View File

@@ -8,6 +8,63 @@ metadata:
namespace: monitoring namespace: monitoring
spec: spec:
groups: groups:
- name: node-exporter.rules
rules:
- expr: |
count without (cpu) (
count without (mode) (
node_cpu_seconds_total{job="node-exporter"}
)
)
record: instance:node_num_cpu:sum
- expr: |
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
)
record: instance:node_cpu_utilisation:rate1m
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |
1 - (
node_memory_MemAvailable_bytes{job="node-exporter"}
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: |
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_drop_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate1m
- name: kube-apiserver.rules - name: kube-apiserver.rules
rules: rules:
- expr: | - expr: |
@@ -759,38 +816,234 @@ spec:
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
- name: kube-state-metrics - name: node-exporter
rules: rules:
- alert: KubeStateMetricsListErrors - alert: NodeFilesystemSpaceFillingUp
annotations: annotations:
description: kube-state-metrics is experiencing errors at an elevated rate description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
in list operations. This is likely causing it to not be able to expose metrics has only {{ printf "%.2f" $value }}% available space left and is filling
about Kubernetes objects correctly or at all. up.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
summary: kube-state-metrics is experiencing errors in list operations. summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: | expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) (
/ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) and
> 0.01 predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up fast.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up fast.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
increase(node_network_receive_errs_total[2m]) > 10
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
increase(node_network_transmit_errs_total[2m]) > 10
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s.
Ensure NTP is configured correctly on this host.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected
summary: Clock skew detected.
expr: |
(
node_timex_offset_seconds > 0.05
and
deriv(node_timex_offset_seconds[5m]) >= 0
)
or
(
node_timex_offset_seconds < -0.05
and
deriv(node_timex_offset_seconds[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP
is configured on this host.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is
in degraded state due to one or more disks failures. Number of spare drives
is insufficient to fix issue automatically.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded
summary: RAID Array is degraded
expr: |
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
- alert: KubeStateMetricsWatchErrors - alert: NodeRAIDDiskFailure
annotations: annotations:
description: kube-state-metrics is experiencing errors at an elevated rate description: At least one device in RAID array on {{ $labels.instance }} failed.
in watch operations. This is likely causing it to not be able to expose Array '{{ $labels.device }}' needs attention and possibly a disk swap.
metrics about Kubernetes objects correctly or at all. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors summary: Failed device in RAID array
summary: kube-state-metrics is experiencing errors in watch operations.
expr: | expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) node_md_disks{state="fail"} > 0
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels: labels:
severity: critical severity: warning
- name: kubernetes-apps - name: kubernetes-apps
rules: rules:
- alert: KubePodCrashLooping - alert: KubePodCrashLooping