Merge pull request #1891 from metalmatze/prometheus-operator-alerts

kube-prometheus: Add Prometheus Operator alerts
Lucas Servén Marín
2018-09-14 14:22:10 +02:00
committed by GitHub
6 changed files with 92 additions and 14 deletions

View File

@@ -1,4 +1,5 @@
 (import 'alertmanager.libsonnet') +
 (import 'general.libsonnet') +
 (import 'node.libsonnet') +
-(import 'prometheus.libsonnet')
+(import 'prometheus.libsonnet') +
+(import 'prometheus-operator.libsonnet')
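In jsonnet, the + operator merges objects, so each imported file can contribute alert groups through the prometheusAlerts+:: / groups+: pattern used in the new prometheus-operator.libsonnet below. A minimal sketch of that merge, with purely illustrative stand-in group names:

// Illustrative only: two stand-in mixins merged the same way the imports above are.
local a = { prometheusAlerts+:: { groups+: [{ name: 'demo-a', rules: [] }] } };
local b = { prometheusAlerts+:: { groups+: [{ name: 'demo-b', rules: [] }] } };
// Object addition concatenates the groups arrays, yielding both 'demo-a' and 'demo-b'.
(a + b).prometheusAlerts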

View File

@@ -0,0 +1,50 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'prometheus-operator',
+        rules: [
+          {
+            alert: 'PrometheusOperatorAlertmanagerReconcileErrors',
+            expr: |||
+              rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Errors while reconciling Alertmanager in {{ $labels.namespace }} Namespace.',
+            },
+            'for': '10m',
+          },
+          {
+            alert: 'PrometheusOperatorPrometheusReconcileErrors',
+            expr: |||
+              rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
+            },
+            'for': '10m',
+          },
+          {
+            alert: 'PrometheusOperatorNodeLookupErrors',
+            expr: |||
+              rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
+            },
+            'for': '10m',
+          },
+        ],
+      },
+    ],
+  },
+}
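The ||| ... ||| % $._config blocks above are jsonnet string formatting: the %(prometheusOperatorSelector)s placeholder is filled in from the mixin's _config object, which is how the rendered rules further down end up with job="prometheus-operator". A rough sketch of that substitution follows; the standalone config object is an assumption for illustration, since the actual default lives in the kube-prometheus _config:

// Sketch of the selector templating; the selector value is inferred from the
// rendered manifest in this commit, not defined in this file.
local config = { prometheusOperatorSelector: 'job="prometheus-operator"' };
{
  expr: |||
    rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
  ||| % config,
}
// expr renders as:
//   rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1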

View File

@@ -11,4 +11,4 @@
"version": "."
}
]
}
}

View File

@@ -8,7 +8,7 @@
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
}
},
"version": "ce4ab08d6791161267204d9a61588e64f1b57e05"
"version": "bffc85d6e76f6341d5370af68ea980030ab402e8"
},
{
"name": "ksonnet",
@@ -28,7 +28,7 @@
"subdir": ""
}
},
"version": "d445c4d98fdf88fd3c59bb34ca4b0f82536f878c"
"version": "c70814dcafce1b51357938e09ee1192998a95706"
},
{
"name": "grafonnet",
@@ -78,7 +78,7 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "1df1ddff4361ed7f2c0f33571923511889a115ce"
"version": "001bbb97ccea05cb0d5f6e97c3939654244e8998"
}
]
}
}

View File

@@ -4707,7 +4707,7 @@ items:
"step": 10
},
{
"expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container_name!=\"\"}) by (container)",
"expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,

View File

@@ -838,7 +838,7 @@ spec:
           the limit of 110.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
-        kubelet_running_pod_count{job="kubelet"} > 100
+        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
       for: 15m
       labels:
         severity: warning
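The new threshold works out to 110 * 0.9 = 99 running pods, so the alert fires at 90% of the documented 110-pod limit rather than at the previous fixed value of 100.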
@@ -914,8 +914,8 @@ spec:
         severity: critical
     - alert: AlertmanagerDownOrMissing
       annotations:
-        description: An unexpected number of Alertmanagers are scraped or Alertmanagers
-          disappeared from discovery.
+        description: An unexpected number of Alertmanagers were scraped or disappeared
+          from discovery.
         summary: Alertmanager down or missing
       expr: |
         label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1
@@ -936,7 +936,7 @@ spec:
     rules:
     - alert: TargetDown
       annotations:
-        description: '{{ $value }}% of {{ $labels.job }} targets are down.'
+        description: '{{ $value }}% of the {{ $labels.job }} targets are down.'
         summary: Targets are down
       expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
       for: 10m
@@ -944,7 +944,7 @@ spec:
         severity: warning
     - alert: DeadMansSwitch
       annotations:
-        description: This is a DeadMansSwitch meant to ensure that the entire Alerting
+        description: This is a DeadMansSwitch meant to ensure that the entire alerting
           pipeline is functional.
         summary: Alerting DeadMansSwitch
       expr: vector(1)
@@ -955,7 +955,7 @@ spec:
     - alert: NodeDiskRunningFull
       annotations:
         message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
-          }}/{{ $labels.pod }} is running full within the next 24 hours.
+          }}/{{ $labels.pod }} will be full within the next 24 hours.
       expr: |
         (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
       for: 30m
@@ -964,7 +964,7 @@ spec:
     - alert: NodeDiskRunningFull
       annotations:
         message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
-          }}/{{ $labels.pod }} is running full within the next 2 hours.
+          }}/{{ $labels.pod }} will be full within the next 2 hours.
       expr: |
         (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
       for: 10m
@@ -1071,3 +1071,30 @@ spec:
       for: 10m
       labels:
         severity: warning
+  - name: prometheus-operator
+    rules:
+    - alert: PrometheusOperatorAlertmanagerReconcileErrors
+      annotations:
+        message: Errors while reconciling Alertmanager in {{ $labels.namespace }}
+          Namespace.
+      expr: |
+        rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorPrometheusReconcileErrors
+      annotations:
+        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
+      expr: |
+        rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorNodeLookupErrors
+      annotations:
+        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
+      expr: |
+        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning