From 850326d6e0e8b3a74f8096413c71ea60efa41693 Mon Sep 17 00:00:00 2001 From: Simon Pasquier Date: Fri, 19 Jun 2020 10:40:30 +0200 Subject: [PATCH] Fix AlertmanagerConfigInconsistent alert Previously, the alert would fire when the number of Alertmanager pods didn't match the number of replicas defined in the Alertmanager spec, even though all the running pods had the same configuration hash. This type of issue is already covered by KubeStatefulSetUpdateNotRolledOut (and possibly KubePodNotReady); having AlertmanagerConfigInconsistent also active in this situation creates unnecessary noise. With this change, the alert expression only returns a result when Alertmanager pods have different configuration hash values, irrespective of the number of pod replicas. The message annotation has also been enhanced to report the configuration hash for each pod. Signed-off-by: Simon Pasquier --- jsonnet/kube-prometheus/alerts/alertmanager.libsonnet | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index bda69d00..bcabf4d9 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -7,10 +7,15 @@ { alert: 'AlertmanagerConfigInconsistent', annotations: { - message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', + message: ||| + The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync. 
+ {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }} + Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}" + {{ end }} + |||, }, expr: ||| - count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1 + count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1 ||| % $._config, 'for': '5m', labels: {