From 8c357c6bde4bd287dbfdd04fe756860705397dcc Mon Sep 17 00:00:00 2001 From: paulfantom Date: Tue, 20 Jul 2021 10:53:49 +0200 Subject: [PATCH 1/3] jsonnet: align alert annotations with best practices Signed-off-by: paulfantom --- .../kube-prometheus/components/mixin/alerts/node.libsonnet | 3 ++- jsonnet/kube-prometheus/platforms/eks.libsonnet | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet b/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet index 74cfb4f2..d022c599 100644 --- a/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet @@ -7,7 +7,8 @@ { alert: 'NodeNetworkInterfaceFlapping', annotations: { - message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}', + summary: "Network interface is often changing its status", + description: 'Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}', }, expr: ||| changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2 diff --git a/jsonnet/kube-prometheus/platforms/eks.libsonnet b/jsonnet/kube-prometheus/platforms/eks.libsonnet index 76eeb385..b81d1cce 100644 --- a/jsonnet/kube-prometheus/platforms/eks.libsonnet +++ b/jsonnet/kube-prometheus/platforms/eks.libsonnet @@ -97,10 +97,11 @@ severity: 'critical', }, annotations: { - message: 'Instance {{ $labels.instance }} has less than 10 IPs available.', + summary: 'EKS CNI is running low on available IPs', + description: 'Instance {{ $labels.instance }} has only {{ $value }} IPs available which is lower than set threshold of %s' % $.values.eks.minimumAvailableIPs, }, 'for': $.values.eks.minimumAvailableIPsTime, - alert: 'EksAvailableIPs', + alert: 'EksCNILowAvailableIPs', }, ], }, From 
02454b3f53ed83190ea5557a6beef894526dce92 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Tue, 20 Jul 2021 11:14:28 +0200 Subject: [PATCH 2/3] manifests: regenerate --- manifests/kube-prometheus-prometheusRule.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/manifests/kube-prometheus-prometheusRule.yaml b/manifests/kube-prometheus-prometheusRule.yaml index e3ee47fa..d203dd91 100644 --- a/manifests/kube-prometheus-prometheusRule.yaml +++ b/manifests/kube-prometheus-prometheusRule.yaml @@ -39,8 +39,9 @@ spec: rules: - alert: NodeNetworkInterfaceFlapping annotations: - message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} + description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping + summary: Network interface is often changing its status expr: | changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m From 46eb1713a5b0235099fb0ae099e6bdf363ee4b14 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Tue, 20 Jul 2021 11:14:38 +0200 Subject: [PATCH 3/3] jsonnet: remove unused alert unit tests as those are moved to alertmanager repository --- .../components/mixin/alerts/tests.yaml | 157 ------------------ 1 file changed, 157 deletions(-) delete mode 100644 jsonnet/kube-prometheus/components/mixin/alerts/tests.yaml diff --git a/jsonnet/kube-prometheus/components/mixin/alerts/tests.yaml b/jsonnet/kube-prometheus/components/mixin/alerts/tests.yaml deleted file mode 100644 index 532bb895..00000000 --- a/jsonnet/kube-prometheus/components/mixin/alerts/tests.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# TODO(metalmatze): This file is temporarily saved here for later reference -# until we find out how to integrate the tests into our jsonnet stack. 
- -rule_files: - - rules.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' - values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' - values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' - alert_rule_test: - - eval_time: 5m - alertname: AlertmanagerMembersInconsistent - - eval_time: 11m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 17m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 23m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' 
- - interval: 1m - input_series: - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' - values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' - alert_rule_test: - - eval_time: 5m - alertname: AlertmanagerMembersInconsistent - - eval_time: 11m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 17m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' 
- - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 23m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.'