Merge pull request #1010 from coreos/no_ingest_alert
Add an alert if samples aren't ingested
@@ -8,6 +8,7 @@ groups:
       severity: warning
     annotations:
       description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+
   - alert: PrometheusNotificationQueueRunningFull
     expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
     for: 10m
@@ -16,6 +17,7 @@ groups:
     annotations:
       description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
         $labels.pod}}
+
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.01
@@ -25,6 +27,7 @@ groups:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.03
@@ -34,6 +37,7 @@ groups:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
   - alert: PrometheusNotConnectedToAlertmanagers
     expr: prometheus_notifications_alertmanagers_discovered < 1
     for: 10m
@@ -42,6 +46,7 @@ groups:
     annotations:
       description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
         to any Alertmanagers
+
   - alert: PrometheusTSDBReloadsFailing
     expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
     for: 12h
@@ -51,6 +56,7 @@ groups:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         reload failures over the last four hours.'
       summary: Prometheus has issues reloading data blocks from disk
+
   - alert: PrometheusTSDBCompactionsFailing
     expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
     for: 12h
@@ -60,6 +66,7 @@ groups:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         compaction failures over the last four hours.'
       summary: Prometheus has issues compacting sample blocks
+
   - alert: PrometheusTSDBWALCorruptions
     expr: tsdb_wal_corruptions_total > 0
     for: 4h
@@ -69,3 +76,12 @@ groups:
       description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
         log (WAL).'
       summary: Prometheus write-ahead log is corrupted
+
+  - alert: PrometheusNotIngestingSamples
+    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+      summary: "Prometheus isn't ingesting samples"
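The new PrometheusNotIngestingSamples rule fires when the per-second rate of prometheus_tsdb_head_samples_appended_total over 5 minutes stays at or below zero for 10 minutes, i.e. the TSDB head has stopped appending samples. Below is a minimal unit-test sketch for the rule, assuming a promtool version that supports `promtool test rules` and illustrative file names and label values that are not part of this PR:

# tests.yaml -- run with: promtool test rules tests.yaml   (file names assumed)
rule_files:
  - prometheus.rules.yaml            # assumed path to the rule file shipped above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # A head-appender counter that never increases, so rate(...[5m]) evaluates to 0.
      - series: 'prometheus_tsdb_head_samples_appended_total{namespace="monitoring",pod="prometheus-k8s-0"}'
        values: '0+0x30'
    alert_rule_test:
      - eval_time: 15m               # pending from ~1m, firing after the 10m "for" window
        alertname: PrometheusNotIngestingSamples
        exp_alerts:
          - exp_labels:
              severity: warning
              namespace: monitoring
              pod: prometheus-k8s-0
            exp_annotations:
              description: "Prometheus monitoring/prometheus-k8s-0 isn't ingesting samples."
              summary: "Prometheus isn't ingesting samples"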
@@ -539,6 +539,7 @@ data:
       severity: warning
     annotations:
       description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+
   - alert: PrometheusNotificationQueueRunningFull
     expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
     for: 10m
@@ -547,6 +548,7 @@ data:
     annotations:
       description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
         $labels.pod}}
+
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.01
@@ -556,6 +558,7 @@ data:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.03
@@ -565,6 +568,7 @@ data:
     annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
   - alert: PrometheusNotConnectedToAlertmanagers
     expr: prometheus_notifications_alertmanagers_discovered < 1
     for: 10m
@@ -573,6 +577,7 @@ data:
     annotations:
       description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
         to any Alertmanagers
+
   - alert: PrometheusTSDBReloadsFailing
     expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
     for: 12h
@@ -582,6 +587,7 @@ data:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         reload failures over the last four hours.'
       summary: Prometheus has issues reloading data blocks from disk
+
   - alert: PrometheusTSDBCompactionsFailing
     expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
     for: 12h
@@ -591,6 +597,7 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks
+
   - alert: PrometheusTSDBWALCorruptions
     expr: tsdb_wal_corruptions_total > 0
     for: 4h
@@ -600,3 +607,12 @@ data:
       description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
         log (WAL).'
       summary: Prometheus write-ahead log is corrupted
+
+  - alert: PrometheusNotIngestingSamples
+    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+      summary: "Prometheus isn't ingesting samples"
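The second set of hunks (context: data:) applies the same addition to the generated manifest, where the rule file is embedded as a string under a ConfigMap data key. A minimal sketch of that shape follows; the ConfigMap name, namespace, data key, and group name are assumed for illustration and are not taken from this diff:

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules          # assumed name
  namespace: monitoring               # assumed namespace
data:
  prometheus.rules.yaml: |            # assumed data key
    groups:
    - name: prometheus.rules          # assumed group name
      rules:
      - alert: PrometheusNotIngestingSamples
        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
          summary: "Prometheus isn't ingesting samples"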