* The metric name `tsdb_wal_corruptions_total` should be `prometheus_tsdb_wal_corruptions_total` (as used in the PrometheusTSDBWALCorruptions rule below).
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus.rules',
        rules: [
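          // Fires when a Prometheus instance has failed to reload its
          // configuration for 10 minutes straight.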
          {
            alert: 'PrometheusConfigReloadFailed',
            annotations: {
              description: "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}",
              summary: "Reloading Prometheus' configuration failed",
            },
            expr: |||
              prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
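          // predict_linear extrapolates the queue length 30 minutes ahead;
          // the alert fires if that projection exceeds the queue's capacity.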
          {
            alert: 'PrometheusNotificationQueueRunningFull',
            annotations: {
              description: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{$labels.pod}}",
              summary: "Prometheus' alert notification queue is running full",
            },
            expr: |||
              predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > prometheus_notifications_queue_capacity{%(prometheusSelector)s}
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
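          // Warning tier: more than 1% of alert notifications to an
          // Alertmanager are failing.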
          {
            alert: 'PrometheusErrorSendingAlerts',
            annotations: {
              description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}',
              summary: 'Errors while sending alerts from Prometheus',
            },
            expr: |||
              rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.01
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
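          // Critical tier: same alert name as above, but fires once the
          // notification error ratio exceeds 3%.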
          {
            alert: 'PrometheusErrorSendingAlerts',
            annotations: {
              description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}',
              summary: 'Errors while sending alerts from Prometheus',
            },
            expr: |||
              rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.03
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
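          // No Alertmanagers discovered via service discovery means firing
          // alerts have nowhere to go.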
          {
            alert: 'PrometheusNotConnectedToAlertmanagers',
            annotations: {
              description: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers',
              summary: 'Prometheus is not connected to any Alertmanagers',
            },
            expr: |||
              prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
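          // Any TSDB block-reload failure within the 2h window, sustained
          // for 12 hours, indicates problems reading persisted blocks.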
          {
            alert: 'PrometheusTSDBReloadsFailing',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.',
              summary: 'Prometheus has issues reloading data blocks from disk',
            },
            expr: |||
              increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
          },
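          // Same pattern for compactions: any failure in the 2h window,
          // sustained for 12 hours.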
          {
            alert: 'PrometheusTSDBCompactionsFailing',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.',
              summary: 'Prometheus has issues compacting sample blocks',
            },
            expr: |||
              increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
          },
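          // The corruption counter only ever increases, so any non-zero value
          // means the write-ahead log has been corrupted since startup.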
          {
            alert: 'PrometheusTSDBWALCorruptions',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).',
              summary: 'Prometheus write-ahead log is corrupted',
            },
            expr: |||
              prometheus_tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
            ||| % $._config,
            'for': '4h',
            labels: {
              severity: 'warning',
            },
          },
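          // A zero append rate on the head block means sample ingestion has
          // stalled entirely.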
          {
            alert: 'PrometheusNotIngestingSamples',
            annotations: {
              description: "Prometheus {{$labels.namespace}}/{{$labels.pod}} isn't ingesting samples.",
              summary: "Prometheus isn't ingesting samples",
            },
            expr: |||
              rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
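          // Samples rejected because a target exposed the same timestamp with
          // different values.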
          {
            alert: 'PrometheusTargetScrapesDuplicate',
            annotations: {
              description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values',
              summary: 'Prometheus has many samples rejected',
            },
            expr: |||
              increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
        ],
      },
    ],
  },
}