Merge pull request #1268 from paulfantom/alerts-best-practices
Alerts best practices
This commit is contained in:
@@ -7,7 +7,8 @@
|
||||
{
|
||||
alert: 'NodeNetworkInterfaceFlapping',
|
||||
annotations: {
|
||||
message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}',
|
||||
summary: "Network interface is often changin it's status",
|
||||
description: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}',
|
||||
},
|
||||
expr: |||
|
||||
changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2
|
||||
|
||||
@@ -1,157 +0,0 @@
|
||||
# TODO(metalmatze): This file is temporarily saved here for later reference
|
||||
# until we find out how to integrate the tests into our jsonnet stack.
|
||||
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0'
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
|
||||
alert_rule_test:
|
||||
- eval_time: 5m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
- eval_time: 11m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- eval_time: 17m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- eval_time: 23m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
|
||||
alert_rule_test:
|
||||
- eval_time: 5m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
- eval_time: 11m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.1
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-1
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.2
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-2
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- eval_time: 17m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.1
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-1
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.2
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-2
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- eval_time: 23m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.1
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-1
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.2
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-2
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
@@ -97,10 +97,11 @@
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.',
|
||||
summary: 'EKS CNI is running low on available IPs',
|
||||
description: 'Instance {{ $labels.instance }} has only {{ $value }} IPs available which is lower than set threshold of %s' % $.values.eks.minimumAvailableIPs,
|
||||
},
|
||||
'for': $.values.eks.minimumAvailableIPsTime,
|
||||
alert: 'EksAvailableIPs',
|
||||
alert: 'EksCNILowAvailableIPs',
|
||||
},
|
||||
],
|
||||
},
|
||||
|
||||
@@ -39,8 +39,9 @@ spec:
|
||||
rules:
|
||||
- alert: NodeNetworkInterfaceFlapping
|
||||
annotations:
|
||||
message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
||||
description: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
||||
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping
|
||||
summary: Network interface is often changin it's status
|
||||
expr: |
|
||||
changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
|
||||
for: 2m
|
||||
|
||||
Reference in New Issue
Block a user