Merge pull request #1010 from coreos/no_ingest_alert
Add an alert if samples aren't ingested
@@ -8,6 +8,7 @@ groups:
       severity: warning
     annotations:
       description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+
   - alert: PrometheusNotificationQueueRunningFull
     expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
     for: 10m
@@ -16,6 +17,7 @@ groups:
     annotations:
       description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
         $labels.pod}}
+
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.01
@@ -25,6 +27,7 @@ groups:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.03
@@ -34,6 +37,7 @@ groups:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
   - alert: PrometheusNotConnectedToAlertmanagers
     expr: prometheus_notifications_alertmanagers_discovered < 1
     for: 10m
@@ -42,6 +46,7 @@ groups:
     annotations:
       description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
         to any Alertmanagers
+
   - alert: PrometheusTSDBReloadsFailing
     expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
     for: 12h
@@ -51,6 +56,7 @@ groups:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         reload failures over the last four hours.'
       summary: Prometheus has issues reloading data blocks from disk
+
   - alert: PrometheusTSDBCompactionsFailing
     expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
     for: 12h
@@ -60,6 +66,7 @@ groups:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         compaction failures over the last four hours.'
       summary: Prometheus has issues compacting sample blocks
+
   - alert: PrometheusTSDBWALCorruptions
     expr: tsdb_wal_corruptions_total > 0
     for: 4h
@@ -69,3 +76,12 @@ groups:
       description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
         log (WAL).'
       summary: Prometheus write-ahead log is corrupted
+
+  - alert: PrometheusNotIngestingSamples
+    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+      summary: "Prometheus isn't ingesting samples"
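The new PrometheusNotIngestingSamples rule fires when the per-second rate of prometheus_tsdb_head_samples_appended_total over 5 minutes stays at or below zero for 10 minutes, i.e. the TSDB head has stopped appending samples. Below is a minimal unit-test sketch for the rule, assuming a promtool version that supports `promtool test rules` and illustrative file names and label values that are not part of this PR:

# tests.yaml -- run with: promtool test rules tests.yaml   (file names assumed)
rule_files:
  - prometheus.rules.yaml            # assumed path to the rule file shipped above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # A head-appender counter that never increases, so rate(...[5m]) evaluates to 0.
      - series: 'prometheus_tsdb_head_samples_appended_total{namespace="monitoring",pod="prometheus-k8s-0"}'
        values: '0+0x30'
    alert_rule_test:
      - eval_time: 15m               # pending from ~1m, firing after the 10m "for" window
        alertname: PrometheusNotIngestingSamples
        exp_alerts:
          - exp_labels:
              severity: warning
              namespace: monitoring
              pod: prometheus-k8s-0
            exp_annotations:
              description: "Prometheus monitoring/prometheus-k8s-0 isn't ingesting samples."
              summary: "Prometheus isn't ingesting samples"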
@@ -539,6 +539,7 @@ data:
       severity: warning
     annotations:
       description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+
   - alert: PrometheusNotificationQueueRunningFull
     expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
     for: 10m
@@ -547,6 +548,7 @@ data:
     annotations:
       description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
         $labels.pod}}
+
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.01
@@ -556,6 +558,7 @@ data:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.03
@@ -565,6 +568,7 @@ data:
     annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
   - alert: PrometheusNotConnectedToAlertmanagers
     expr: prometheus_notifications_alertmanagers_discovered < 1
     for: 10m
@@ -573,6 +577,7 @@ data:
     annotations:
       description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
         to any Alertmanagers
+
   - alert: PrometheusTSDBReloadsFailing
     expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
     for: 12h
@@ -582,6 +587,7 @@ data:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         reload failures over the last four hours.'
       summary: Prometheus has issues reloading data blocks from disk
+
   - alert: PrometheusTSDBCompactionsFailing
     expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
     for: 12h
@@ -591,6 +597,7 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks
+
   - alert: PrometheusTSDBWALCorruptions
     expr: tsdb_wal_corruptions_total > 0
     for: 4h
@@ -600,3 +607,12 @@ data:
       description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
         log (WAL).'
       summary: Prometheus write-ahead log is corrupted
+
+  - alert: PrometheusNotIngestingSamples
+    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+      summary: "Prometheus isn't ingesting samples"
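The second set of hunks (context: data:) applies the same addition to the generated manifest, where the rule file is embedded as a string under a ConfigMap data key. A minimal sketch of that shape follows; the ConfigMap name, namespace, data key, and group name are assumed for illustration and are not taken from this diff:

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules          # assumed name
  namespace: monitoring               # assumed namespace
data:
  prometheus.rules.yaml: |            # assumed data key
    groups:
    - name: prometheus.rules          # assumed group name
      rules:
      - alert: PrometheusNotIngestingSamples
        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
          summary: "Prometheus isn't ingesting samples"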