groups:
- name: prometheus.rules
  rules:
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{$labels.pod}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
    for: 10m
    labels:
      severity: critical
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
      summary: Prometheus has issues reloading data blocks from disk
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.'
      summary: Prometheus has issues compacting sample blocks
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
      summary: Prometheus write-ahead log is corrupted
  - alert: PrometheusNotIngestingSamples
    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "Prometheus {{$labels.namespace}}/{{$labels.pod}} isn't ingesting samples."
      summary: "Prometheus isn't ingesting samples"
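# Usage sketch (not part of the rule group above; the file name and paths are
# assumptions): if this group is saved as prometheus.rules.yaml next to
# prometheus.yml, a rule_files entry like the commented snippet below would load
# it, and `promtool check rules prometheus.rules.yaml` can validate the syntax
# before triggering a configuration reload. When Prometheus is managed by the
# Prometheus Operator, the same group would typically be wrapped in a
# PrometheusRule resource instead of being referenced from prometheus.yml.
#
# rule_files:
#   - "prometheus.rules.yaml"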