Fix AlertmanagerConfigInconsistent alert

Previously, the alert would fire when the number of Alertmanager pods didn't match the number of replicas defined in the Alertmanager spec, even though all the running pods had the same configuration hash. This type of issue is already covered by KubeStatefulSetUpdateNotRolledOut (and possibly KubePodNotReady), so having AlertmanagerConfigInconsistent also active in this situation creates unnecessary noise.

With this change, the alert expression only returns a result when Alertmanager pods have different configuration hash values, irrespective of the number of pod replicas. The message annotation has also been enhanced to report the configuration hash for each pod.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
2020-06-19 10:40:30 +02:00
parent 98c8346efe
commit 850326d6e0
1 changed file with 7 additions and 2 deletions
--- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet
+++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet
@@ -7,10 +7,15 @@
          {
            alert: 'AlertmanagerConfigInconsistent',
            annotations: {
-              message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.',
+              message: |||
+                The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
+                {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
+                Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
+                {{ end }}
+              |||,
            },
            expr: |||
-              count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
+              count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1
            ||| % $._config,
            'for': '5m',
            labels: {