From 909f51b3bd67e548c2a0ecf78364e6d676219cfd Mon Sep 17 00:00:00 2001 From: Lili Cosic Date: Fri, 16 Aug 2019 15:16:13 +0200 Subject: [PATCH 1/2] jsonnet/kube-prometheus: Prevent many-to-many matching If there is more than one prometheus-operator pod, which happens briefly when the prometheus-operator pod is deleted, the alert expression fails with many-to-many matching errors. This change aggregates over a whitelist of labels, which excludes the pod label. --- jsonnet/kube-prometheus/alerts/alertmanager.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index bf58862d..bda69d00 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -10,7 +10,7 @@ message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', }, expr: ||| - count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 + count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1 ||| % $._config, 'for': '5m', labels: { @@ -31,8 +31,8 @@ }, }, { - alert:'AlertmanagerMembersInconsistent', - annotations:{ + alert: 'AlertmanagerMembersInconsistent', + annotations: { message: 'Alertmanager has not found all other members of the cluster.', }, expr: ||| From c6e6f2e74f08420ae59180950b7bc58ba025acce Mon Sep 17 00:00:00 2001 From: Lili Cosic Date: Fri, 16 Aug 2019 16:13:43 +0200 Subject: [PATCH 2/2] 
manifests/prometheus-rules.yaml: Regenerate files --- manifests/prometheus-rules.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index f35510b5..2f0eaa75 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1054,7 +1054,7 @@ spec: message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. expr: | - count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 + count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1 for: 5m labels: severity: critical