Add alert if it no samples are ingested

2018-02-22 13:46:27 +01:00
parent 80b2a511be
commit 0ae6c98a48
2 changed files with 32 additions and 0 deletions
--- a/manifests/prometheus/prometheus-k8s-rules.yaml
+++ b/manifests/prometheus/prometheus-k8s-rules.yaml
@@ -539,6 +539,7 @@ data:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+    
      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
@@ -547,6 +548,7 @@ data:
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
            $labels.pod}}
+    
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.01
@@ -556,6 +558,7 @@ data:
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+    
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.03
@@ -565,6 +568,7 @@ data:
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+    
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
@@ -573,6 +577,7 @@ data:
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
            to any Alertmanagers
+    
      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
@@ -582,6 +587,7 @@ data:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            reload failures over the last four hours.'
          summary: Prometheus has issues reloading data blocks from disk
+    
      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
@@ -591,6 +597,7 @@ data:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            compaction failures over the last four hours.'
          summary: Prometheus has issues compacting sample blocks
+    
      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
@@ -600,3 +607,12 @@ data:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
            log (WAL).'
          summary: Prometheus write-ahead log is corrupted
+    
+      - alert: PrometheusNotIngestingSamples
+        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+          summary: "Prometheus isn't ingesting samples"