Merge pull request #1010 from coreos/no_ingest_alert

Add an alert if samples aren't ingested
Antoine Legrand, 2018-02-28 10:43:23 +01:00, committed by GitHub
2 changed files with 32 additions and 0 deletions

@@ -8,6 +8,7 @@ groups:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
@@ -16,6 +17,7 @@ groups:
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
        $labels.pod}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.01
@@ -25,6 +27,7 @@ groups:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.03
@@ -34,6 +37,7 @@ groups:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
@@ -42,6 +46,7 @@ groups:
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
        to any Alertmanagers
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
@@ -51,6 +56,7 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        reload failures over the last four hours.'
      summary: Prometheus has issues reloading data blocks from disk
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
@@ -60,6 +66,7 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
@@ -69,3 +76,12 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
        log (WAL).'
      summary: Prometheus write-ahead log is corrupted
  - alert: PrometheusNotIngestingSamples
    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
      summary: "Prometheus isn't ingesting samples"

@@ -539,6 +539,7 @@ data:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
@@ -547,6 +548,7 @@ data:
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
        $labels.pod}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.01
@@ -556,6 +558,7 @@ data:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.03
@@ -565,6 +568,7 @@ data:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
@@ -573,6 +577,7 @@ data:
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
        to any Alertmanagers
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
@@ -582,6 +587,7 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        reload failures over the last four hours.'
      summary: Prometheus has issues reloading data blocks from disk
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
@@ -591,6 +597,7 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
@@ -600,3 +607,12 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
        log (WAL).'
      summary: Prometheus write-ahead log is corrupted
  - alert: PrometheusNotIngestingSamples
    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
      summary: "Prometheus isn't ingesting samples"