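# Alerting rules for monitoring Prometheus itself (the prometheus.rules group).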
groups:
- name: prometheus.rules
  rules:
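  # Warn if the last configuration reload has been failing for 10 minutes.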
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
      summary: Reloading Prometheus' configuration failed
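  # predict_linear extrapolates the queue length 30 minutes ahead from the last 5m of data; warn if that projection exceeds the queue capacity.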
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{$labels.pod}}
      summary: Prometheus' alert notification queue is running full
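  # Warn when more than 1% of alert notifications sent to Alertmanager are failing.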
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
      summary: Errors while sending alerts from Prometheus
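  # Escalate to critical when the notification error ratio exceeds 3%.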
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
    for: 10m
    labels:
      severity: critical
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
      summary: Errors while sending alerts from Prometheus
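  # Warn when service discovery reports no Alertmanager instances for 10 minutes.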
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
      summary: Prometheus is not connected to any Alertmanagers
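  # Warn on any TSDB block-reload failures within a 2h window, sustained for 12h.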
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
      summary: Prometheus has issues reloading data blocks from disk
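  # Warn on any TSDB compaction failures within a 2h window, sustained for 12h.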
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.'
      summary: Prometheus has issues compacting sample blocks
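  # Warn once any write-ahead log corruption has been recorded and the condition persists for 4h.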
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
      summary: Prometheus write-ahead log is corrupted
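  # Warn when no new samples have been appended to the TSDB head for 10 minutes.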
  - alert: PrometheusNotIngestingSamples
    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
      summary: "Prometheus isn't ingesting samples"
  - alert: PrometheusTargetScrapesDuplicate
    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
      summary: Prometheus has many samples rejected