Add runbook_url annotation to all alerts

Signed-off-by: ArthurSens <arthursens2005@gmail.com>
2021-03-05 12:54:03 +00:00
parent ebd4b28b91
commit e586afb280
10 changed files with 72 additions and 13 deletions
--- a/manifests/prometheus-prometheusRule.yaml
+++ b/manifests/prometheus-prometheusRule.yaml
@@ -17,6 +17,7 @@ spec:
    - alert: PrometheusBadConfig
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusbadconfig
        summary: Failed Prometheus configuration reload.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
@@ -28,6 +29,7 @@ spec:
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotificationqueuerunningfull
        summary: Prometheus alert notification queue predicted to run full in less than 30m.
      expr: |
        # Without min_over_time, failed scrapes could create false negatives, see
@@ -43,6 +45,7 @@ spec:
    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
      annotations:
        description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstosomealertmanagers
        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
      expr: |
        (
@@ -58,6 +61,7 @@ spec:
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotconnectedtoalertmanagers
        summary: Prometheus is not connected to any Alertmanagers.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
@@ -69,6 +73,7 @@ spec:
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbreloadsfailing
        summary: Prometheus has issues reloading blocks from disk.
      expr: |
        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
@@ -78,6 +83,7 @@ spec:
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbcompactionsfailing
        summary: Prometheus has issues compacting blocks.
      expr: |
        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
@@ -87,6 +93,7 @@ spec:
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotingestingsamples
        summary: Prometheus is not ingesting samples.
      expr: |
        (
@@ -104,6 +111,7 @@ spec:
    - alert: PrometheusDuplicateTimestamps
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value  }} samples/s with different values but duplicated timestamp.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusduplicatetimestamps
        summary: Prometheus is dropping samples with duplicate timestamps.
      expr: |
        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -113,6 +121,7 @@ spec:
    - alert: PrometheusOutOfOrderTimestamps
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value  }} samples/s with timestamps arriving out of order.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoutofordertimestamps
        summary: Prometheus drops samples with out-of-order timestamps.
      expr: |
        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -122,6 +131,7 @@ spec:
    - alert: PrometheusRemoteStorageFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotestoragefailures
        summary: Prometheus fails to send samples to remote storage.
      expr: |
        (
@@ -141,6 +151,7 @@ spec:
    - alert: PrometheusRemoteWriteBehind
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritebehind
        summary: Prometheus remote write is behind.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
@@ -157,6 +168,7 @@ spec:
    - alert: PrometheusRemoteWriteDesiredShards
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritedesiredshards
        summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
@@ -172,6 +184,7 @@ spec:
    - alert: PrometheusRuleFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusrulefailures
        summary: Prometheus is failing rule evaluations.
      expr: |
        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -181,6 +194,7 @@ spec:
    - alert: PrometheusMissingRuleEvaluations
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusmissingruleevaluations
        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
      expr: |
        increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -190,6 +204,7 @@ spec:
    - alert: PrometheusTargetLimitHit
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustargetlimithit
        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
      expr: |
        increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -199,6 +214,7 @@ spec:
    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
      annotations:
        description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
+        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstoanyalertmanager
        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
      expr: |
        min without (alertmanager) (