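# Alerting rules for monitoring Prometheus itself (the prometheus.rules group).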
groups:
- name: prometheus.rules
  rules:
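  # Warn if the last configuration reload has been failing for 10 minutes.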
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
      summary: Reloading Prometheus' configuration failed
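  # predict_linear extrapolates the queue length 30 minutes ahead from the last 5m of data; warn if that projection exceeds the queue capacity.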
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{$labels.pod}}
      summary: Prometheus' alert notification queue is running full
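  # Warn when more than 1% of alert notifications sent to Alertmanager are failing.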
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
      summary: Errors while sending alerts from Prometheus
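  # Escalate to critical when the notification error ratio exceeds 3%.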
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
    for: 10m
    labels:
      severity: critical
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
      summary: Errors while sending alerts from Prometheus
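  # Warn when service discovery reports no Alertmanager instances for 10 minutes.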
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
      summary: Prometheus is not connected to any Alertmanagers
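  # Warn on any TSDB block-reload failures within a 2h window, sustained for 12h.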
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
      summary: Prometheus has issues reloading data blocks from disk
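  # Warn on any TSDB compaction failures within a 2h window, sustained for 12h.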
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.'
      summary: Prometheus has issues compacting sample blocks
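  # Warn once any write-ahead log corruption has been recorded and the condition persists for 4h.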
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
      summary: Prometheus write-ahead log is corrupted
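  # Warn when no new samples have been appended to the TSDB head for 10 minutes.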
  - alert: PrometheusNotIngestingSamples
    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
      summary: "Prometheus isn't ingesting samples"
  - alert: PrometheusTargetScrapesDuplicate
    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
      summary: Prometheus has many samples rejected