Add summary to Alertmanager rules where missing - updated according to guidelines

Alexander Holte-Davidsen
2018-03-05 09:52:51 +01:00
parent c54468ab7b
commit 8b6ee5c18b
6 changed files with 34 additions and 0 deletions
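Each hunk below adds a single summary line to the annotations block of an existing alerting rule, next to its longer description, presumably to satisfy the guidelines referenced in the commit message. As a minimal sketch of the resulting rule shape (the alert name, expr, for, and labels here are placeholders for illustration, not taken from the diff):

groups:
- name: example.rules
  rules:
  - alert: ExampleAlert            # placeholder name, not part of this commit
    expr: vector(1)                # placeholder expression
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Longer, templated text describing what fired and where.
      summary: Short one-line headline for the alert   # the kind of line this commit adds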

View File

@@ -11,6 +11,7 @@ groups:
annotations:
description: The configuration of the instances of the Alertmanager cluster
`{{$labels.service}}` are out of sync.
summary: Configuration out of sync
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
@@ -20,6 +21,7 @@ groups:
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
disappeared from discovery.
summary: Alertmanager down or missing
- alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
@@ -28,3 +30,4 @@ groups:
annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
summary: Alertmanager's configuration reload failed

View File

@@ -26,6 +26,7 @@ groups:
severity: warning
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: Prometheus failed to scrape
- alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
* 100 > 10

View File

@@ -51,6 +51,7 @@ groups:
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
summary: API server high latency
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
@@ -60,6 +61,7 @@ groups:
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
summary: API server high latency
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 2
@@ -68,6 +70,7 @@ groups:
severity: warning
annotations:
description: API server returns errors for {{ $value }}% of requests
summary: API server request errors
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 5
@@ -84,12 +87,14 @@ groups:
annotations:
description: No API servers are reachable or all have disappeared from service
discovery
summary: No API servers are reachable
- alert: K8sCertificateExpirationNotice
labels:
severity: warning
annotations:
description: Kubernetes API Certificate is expiring soon (less than 7 days)
summary: Kubernetes API Certificate is expiring soon
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
- alert: K8sCertificateExpirationNotice
@@ -97,4 +102,5 @@ groups:
severity: critical
annotations:
description: Kubernetes API Certificate is expiring in less than 1 day
summary: Kubernetes API Certificate is expiring
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0

View File

@@ -26,6 +26,7 @@ groups:
annotations:
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery
summary: Prometheus could not scrape a node-exporter
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
@@ -42,3 +43,4 @@ groups:
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running
full within the next 2 hours (mounted at {{$labels.mountpoint}})
summary: Node disk is running full

View File

@@ -8,6 +8,7 @@ groups:
severity: warning
annotations:
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
summary: Reloading Prometheus' configuration failed
- alert: PrometheusNotificationQueueRunningFull
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
@@ -17,6 +18,7 @@ groups:
annotations:
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
$labels.pod}}
summary: Prometheus' alert notification queue is running full
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
@@ -27,6 +29,7 @@ groups:
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
summary: Errors while sending alerts from Prometheus
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
@@ -37,6 +40,7 @@ groups:
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
summary: Errors while sending alerts from Prometheus
- alert: PrometheusNotConnectedToAlertmanagers
expr: prometheus_notifications_alertmanagers_discovered < 1
@@ -46,6 +50,7 @@ groups:
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
to any Alertmanagers
summary: Prometheus is not connected to any Alertmanagers
- alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0

View File

@@ -20,6 +20,7 @@ data:
annotations:
description: The configuration of the instances of the Alertmanager cluster
`{{$labels.service}}` are out of sync.
summary: Configuration out of sync
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
@@ -29,6 +30,7 @@ data:
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
disappeared from discovery.
summary: Alertmanager down or missing
- alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
@@ -37,6 +39,7 @@ data:
annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
summary: Alertmanager's configuration reload failed
etcd3.rules.yaml: |+
groups:
- name: ./etcd3.rules
@@ -363,6 +366,7 @@ data:
severity: warning
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: Prometheus failed to scrape
- alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
* 100 > 10
@@ -436,6 +440,7 @@ data:
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
summary: API server high latency
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
@@ -445,6 +450,7 @@ data:
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
summary: API server high latency
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 2
@@ -453,6 +459,7 @@ data:
severity: warning
annotations:
description: API server returns errors for {{ $value }}% of requests
summary: API server request errors
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 5
@@ -469,12 +476,14 @@ data:
annotations:
description: No API servers are reachable or all have disappeared from service
discovery
summary: No API servers are reachable
- alert: K8sCertificateExpirationNotice
labels:
severity: warning
annotations:
description: Kubernetes API Certificate is expiring soon (less than 7 days)
summary: Kubernetes API Certificate is expiring soon
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
- alert: K8sCertificateExpirationNotice
@@ -482,6 +491,7 @@ data:
severity: critical
annotations:
description: Kubernetes API Certificate is expiring in less than 1 day
summary: Kubernetes API Certificate is expiring
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
node.rules.yaml: |+
groups:
@@ -512,6 +522,7 @@ data:
annotations:
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery
summary: Prometheus could not scrape a node-exporter
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
@@ -528,6 +539,7 @@ data:
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running
full within the next 2 hours (mounted at {{$labels.mountpoint}})
summary: Node disk is running full
prometheus.rules.yaml: |+
groups:
- name: prometheus.rules
@@ -539,6 +551,7 @@ data:
severity: warning
annotations:
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
summary: Reloading Prometheus' configuration failed
- alert: PrometheusNotificationQueueRunningFull
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
@@ -548,6 +561,7 @@ data:
annotations:
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
$labels.pod}}
summary: Prometheus' alert notification queue is running full
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
@@ -558,6 +572,7 @@ data:
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
summary: Errors while sending alerts from Prometheus
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
@@ -568,6 +583,7 @@ data:
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
summary: Errors while sending alerts from Prometheus
- alert: PrometheusNotConnectedToAlertmanagers
expr: prometheus_notifications_alertmanagers_discovered < 1
@@ -577,6 +593,7 @@ data:
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
to any Alertmanagers
summary: Prometheus is not connected to any Alertmanagers
- alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0