* The metric name `tsdb_wal_corruptions_total` should be `prometheus_tsdb_wal_corruptions_total` (as used in the PrometheusTSDBWALCorruptions rule below).
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus.rules',
        rules: [
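          // Fires when a Prometheus instance has failed to reload its
          // configuration for 10 minutes straight.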
          {
            alert: 'PrometheusConfigReloadFailed',
            annotations: {
              description: "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}",
              summary: "Reloading Prometheus' configuration failed",
            },
            expr: |||
              prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
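          // predict_linear extrapolates the queue length 30 minutes ahead;
          // the alert fires if that projection exceeds the queue's capacity.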
          {
            alert: 'PrometheusNotificationQueueRunningFull',
            annotations: {
              description: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{$labels.pod}}",
              summary: "Prometheus' alert notification queue is running full",
            },
            expr: |||
              predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > prometheus_notifications_queue_capacity{%(prometheusSelector)s}
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
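          // Warning tier: more than 1% of alert notifications to an
          // Alertmanager are failing.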
          {
            alert: 'PrometheusErrorSendingAlerts',
            annotations: {
              description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}',
              summary: 'Errors while sending alerts from Prometheus',
            },
            expr: |||
              rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.01
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
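          // Critical tier: same alert name as above, but fires once the
          // notification error ratio exceeds 3%.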
          {
            alert: 'PrometheusErrorSendingAlerts',
            annotations: {
              description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}',
              summary: 'Errors while sending alerts from Prometheus',
            },
            expr: |||
              rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.03
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
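          // No Alertmanagers discovered via service discovery means firing
          // alerts have nowhere to go.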
          {
            alert: 'PrometheusNotConnectedToAlertmanagers',
            annotations: {
              description: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers',
              summary: 'Prometheus is not connected to any Alertmanagers',
            },
            expr: |||
              prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
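          // Any TSDB block-reload failure within the 2h window, sustained
          // for 12 hours, indicates problems reading persisted blocks.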
          {
            alert: 'PrometheusTSDBReloadsFailing',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.',
              summary: 'Prometheus has issues reloading data blocks from disk',
            },
            expr: |||
              increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
          },
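          // Same pattern for compactions: any failure in the 2h window,
          // sustained for 12 hours.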
          {
            alert: 'PrometheusTSDBCompactionsFailing',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.',
              summary: 'Prometheus has issues compacting sample blocks',
            },
            expr: |||
              increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
          },
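          // The corruption counter only ever increases, so any non-zero value
          // means the write-ahead log has been corrupted since startup.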
          {
            alert: 'PrometheusTSDBWALCorruptions',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).',
              summary: 'Prometheus write-ahead log is corrupted',
            },
            expr: |||
              prometheus_tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
            ||| % $._config,
            'for': '4h',
            labels: {
              severity: 'warning',
            },
          },
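          // A zero append rate on the head block means sample ingestion has
          // stalled entirely.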
          {
            alert: 'PrometheusNotIngestingSamples',
            annotations: {
              description: "Prometheus {{$labels.namespace}}/{{$labels.pod}} isn't ingesting samples.",
              summary: "Prometheus isn't ingesting samples",
            },
            expr: |||
              rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
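          // Samples rejected because a target exposed the same timestamp with
          // different values.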
          {
            alert: 'PrometheusTargetScrapesDuplicate',
            annotations: {
              description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values',
              summary: 'Prometheus has many samples rejected',
            },
            expr: |||
              increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
        ],
      },
    ],
  },
}