kube-prometheus: Migrate kube-prometheus alerts to jsonnet

Frederic Branczyk
2018-05-28 16:54:39 +02:00
parent 309974fadb
commit 64db049d3a
13 changed files with 497 additions and 258 deletions


@@ -0,0 +1,53 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'alertmanager.rules',
rules: [
{
alert: 'AlertmanagerConfigInconsistent',
annotations: {
description: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` is out of sync.',
summary: 'Configuration out of sync',
},
expr: |||
count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
||| % $._config,
'for': '5m',
labels: {
severity: 'critical',
},
},
{
alert: 'AlertmanagerDownOrMissing',
annotations: {
description: 'An unexpected number of Alertmanagers is being scraped, or Alertmanagers have disappeared from discovery.',
summary: 'Alertmanager down or missing',
},
expr: |||
label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{%(alertmanagerSelector)s}) BY (job) != 1
||| % $._config,
'for': '5m',
labels: {
severity: 'warning',
},
},
{
alert: 'AlertmanagerFailedReload',
annotations: {
description: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.",
summary: "Alertmanager's configuration reload failed",
},
expr: |||
alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
},
],
},
],
},
}
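The `%(alertmanagerSelector)s` and `%(prometheusOperatorSelector)s` placeholders are filled in by jsonnet's `%` formatting operator against `$._config`, which is defined elsewhere in kube-prometheus. A minimal sketch of what that object needs to provide for this file (the concrete selector values are assumptions for illustration, not part of this commit):

{
  // Hypothetical selector values; kube-prometheus supplies the real ones.
  _config+:: {
    alertmanagerSelector: 'job="alertmanager-main"',
    prometheusOperatorSelector: 'job="prometheus-operator"',
  },
}

With values like these, the first rule's selector renders to `alertmanager_config_hash{job="alertmanager-main"}`, i.e. the label matchers are spliced into the PromQL before the rule groups are manifested.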


@@ -0,0 +1,4 @@
(import 'alertmanager.libsonnet') +
(import 'general.libsonnet') +
(import 'node.libsonnet') +
(import 'prometheus.libsonnet')
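This aggregate file merges the four alert libraries into a single `prometheusAlerts` object. A hypothetical rendering entry point (the file names, directory layout, and selector values below are assumptions, not part of this commit) that turns the merged object into a plain Prometheus rules file:

// build.jsonnet (hypothetical): render the combined alerts to YAML.
local alerts = (import 'alerts.libsonnet') + {
  _config+:: {
    alertmanagerSelector: 'job="alertmanager-main"',
    prometheusSelector: 'job="prometheus-k8s"',
    prometheusOperatorSelector: 'job="prometheus-operator"',
    nodeExporterSelector: 'job="node-exporter"',
  },
};

// Rendered with:  jsonnet -S build.jsonnet > prometheus.rules.yaml
std.manifestYamlDoc(alerts.prometheusAlerts)

The resulting file can then be validated with `promtool check rules prometheus.rules.yaml` and loaded through Prometheus' `rule_files` configuration.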


@@ -0,0 +1,34 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'general.rules',
rules: [
{
alert: 'TargetDown',
annotations: {
description: '{{ $value }}% of {{ $labels.job }} targets are down.',
summary: 'Targets are down',
},
expr: '100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10',
'for': '10m',
labels: {
severity: 'warning',
},
},
{
alert: 'DeadMansSwitch',
annotations: {
description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.',
summary: 'Alerting DeadMansSwitch',
},
expr: 'vector(1)',
labels: {
severity: 'none',
},
},
],
},
],
},
}
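As a concrete check of the TargetDown arithmetic: if a job has 20 targets and 3 of them report `up == 0`, the expression evaluates to 100 * 3 / 20 = 15, which is above the threshold of 10, so the alert fires once the condition has held for 10 minutes. DeadMansSwitch, by contrast, always fires by construction (`vector(1)`), so a downstream receiver can alert on its absence to detect a broken alerting pipeline.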


@@ -0,0 +1,39 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'kube-prometheus-node-alerting.rules',
rules: [
{
alert: 'NodeDiskRunningFull',
annotations: {
description: 'Device {{$labels.device}} on node {{$labels.instance}} is predicted to run full within the next 24 hours (mounted at {{$labels.mountpoint}}).',
summary: 'Node disk is running full within 24 hours',
},
expr: |||
predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[6h], 3600 * 24) < 0
||| % $._config,
'for': '30m',
labels: {
severity: 'warning',
},
},
{
alert: 'NodeDiskRunningFull',
annotations: {
description: 'Device {{$labels.device}} on node {{$labels.instance}} is predicted to run full within the next 2 hours (mounted at {{$labels.mountpoint}}).',
summary: 'Node disk is running full within 2 hours',
},
expr: |||
predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[30m], 3600 * 2) < 0
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
},
],
},
],
},
}
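Both rules use `predict_linear`, which fits a linear regression over the sampled values and extrapolates it forward: the first looks at the last 6h of `node_filesystem_free` and projects 3600 * 24 seconds (24 hours) ahead, warning when the projection drops below zero; the second uses a 30m window with a 2h horizon so that fast fill-ups escalate to critical severity before the filesystem actually runs out of space.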


@@ -0,0 +1,151 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'prometheus.rules',
rules: [
{
alert: 'PrometheusConfigReloadFailed',
annotations: {
description: "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}",
summary: "Reloading Promehteus' configuration failed",
},
expr: |||
prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
},
{
alert: 'PrometheusNotificationQueueRunningFull',
annotations: {
description: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}",
summary: "Prometheus' alert notification queue is running full",
},
expr: |||
predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > prometheus_notifications_queue_capacity{%(prometheusSelector)s}
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
},
{
alert: 'PrometheusErrorSendingAlerts',
annotations: {
description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}',
summary: 'Errors while sending alerts from Prometheus',
},
expr: |||
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.01
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
},
{
alert: 'PrometheusErrorSendingAlerts',
annotations: {
description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}',
summary: 'Errors while sending alerts from Prometheus',
},
expr: |||
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.03
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
},
{
alert: 'PrometheusNotConnectedToAlertmanagers',
annotations: {
description: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers',
summary: 'Prometheus is not connected to any Alertmanagers',
},
expr: |||
prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
},
{
alert: 'PrometheusTSDBReloadsFailing',
annotations: {
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.',
summary: 'Prometheus has issues reloading data blocks from disk',
},
expr: |||
increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0
||| % $._config,
'for': '12h',
labels: {
severity: 'warning',
},
},
{
alert: 'PrometheusTSDBCompactionsFailing',
annotations: {
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.',
summary: 'Prometheus has issues compacting sample blocks',
},
expr: |||
increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0
||| % $._config,
'for': '12h',
labels: {
severity: 'warning',
},
},
{
alert: 'PrometheusTSDBWALCorruptions',
annotations: {
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).',
summary: 'Prometheus write-ahead log is corrupted',
},
expr: |||
tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
||| % $._config,
'for': '4h',
labels: {
severity: 'warning',
},
},
{
alert: 'PrometheusNotIngestingSamples',
annotations: {
description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.",
summary: "Prometheus isn't ingesting samples",
},
expr: |||
rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
},
{
alert: 'PrometheusTargetScrapesDuplicate',
annotations: {
description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values',
summary: 'Prometheus has many samples rejected',
},
expr: |||
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
},
],
},
],
},
}
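The two PrometheusErrorSendingAlerts rules form a tiered threshold on the same ratio of failed to sent notifications over 5m windows: above 1% the warning-severity rule fires, above 3% the critical one. At a steady 100 notifications per minute, for example, more than one failure per minute sustains the warning tier and more than three the critical tier. Because every file extends `prometheusAlerts+::` with `groups+:`, importing these libraries in any combination simply concatenates their rule groups into one alert set.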