groups:
- name: prometheus.rules
  rules:
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{$labels.pod}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
    for: 10m
    labels:
      severity: critical
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
      summary: Prometheus has issues reloading data blocks from disk
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.'
      summary: Prometheus has issues compacting sample blocks
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
      summary: Prometheus write-ahead log is corrupted
  - alert: PrometheusNotIngestingSamples
    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "Prometheus {{$labels.namespace}}/{{$labels.pod}} isn't ingesting samples."
      summary: "Prometheus isn't ingesting samples"
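# Usage sketch (not part of the rule group above; the file name and paths are
# assumptions): if this group is saved as prometheus.rules.yaml next to
# prometheus.yml, a rule_files entry like the commented snippet below would load
# it, and `promtool check rules prometheus.rules.yaml` can validate the syntax
# before triggering a configuration reload. When Prometheus is managed by the
# Prometheus Operator, the same group would typically be wrapped in a
# PrometheusRule resource instead of being referenced from prometheus.yml.
#
# rule_files:
#   - "prometheus.rules.yaml"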