Merge pull request #1268 from paulfantom/alerts-best-practices
Alerts best practices
This commit is contained in:
@@ -7,7 +7,8 @@
|
|||||||
{
|
{
|
||||||
alert: 'NodeNetworkInterfaceFlapping',
|
alert: 'NodeNetworkInterfaceFlapping',
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}',
|
summary: "Network interface is often changing its status",
|
||||||
|
description: 'Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}',
|
||||||
},
|
},
|
||||||
expr: |||
|
expr: |||
|
||||||
changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2
|
changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2
|
||||||
|
|||||||
@@ -1,157 +0,0 @@
|
|||||||
# TODO(metalmatze): This file is temporarily saved here for later reference
|
|
||||||
# until we find out how to integrate the tests into our jsonnet stack.
|
|
||||||
|
|
||||||
rule_files:
|
|
||||||
- rules.yaml
|
|
||||||
|
|
||||||
evaluation_interval: 1m
|
|
||||||
|
|
||||||
tests:
|
|
||||||
- interval: 1m
|
|
||||||
input_series:
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0'
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
|
|
||||||
alert_rule_test:
|
|
||||||
- eval_time: 5m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
- eval_time: 11m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- eval_time: 17m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- eval_time: 23m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- interval: 1m
|
|
||||||
input_series:
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
|
|
||||||
alert_rule_test:
|
|
||||||
- eval_time: 5m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
- eval_time: 11m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.1
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-1
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.2
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-2
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- eval_time: 17m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.1
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-1
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.2
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-2
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- eval_time: 23m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.1
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-1
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.2
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-2
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
@@ -97,10 +97,11 @@
|
|||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.',
|
summary: 'EKS CNI is running low on available IPs',
|
||||||
|
description: 'Instance {{ $labels.instance }} has only {{ $value }} IPs available which is lower than set threshold of %s' % $.values.eks.minimumAvailableIPs,
|
||||||
},
|
},
|
||||||
'for': $.values.eks.minimumAvailableIPsTime,
|
'for': $.values.eks.minimumAvailableIPsTime,
|
||||||
alert: 'EksAvailableIPs',
|
alert: 'EksCNILowAvailableIPs',
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -39,8 +39,9 @@ spec:
|
|||||||
rules:
|
rules:
|
||||||
- alert: NodeNetworkInterfaceFlapping
|
- alert: NodeNetworkInterfaceFlapping
|
||||||
annotations:
|
annotations:
|
||||||
message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
||||||
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping
|
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping
|
||||||
|
summary: Network interface is often changing its status
|
||||||
expr: |
|
expr: |
|
||||||
changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
|
changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
|
||||||
for: 2m
|
for: 2m
|
||||||
|
|||||||
Reference in New Issue
Block a user