Update component versions in docs and jsonnets

Julius Volz
2019-02-04 15:45:52 +01:00
parent 19dc4088a6
commit a09da4faf5
8 changed files with 94 additions and 39 deletions
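Every hunk below makes the same change: each generated PromQL expression gains a namespace="monitoring" matcher next to its existing job matcher. In kube-prometheus these matchers typically come from selector strings in the jsonnet _config rather than being edited by hand. A minimal sketch of such an override, assuming field names like alertmanagerSelector and prometheusSelector (illustrative, not taken from this commit):

    // Sketch only: pin all generated rule selectors to a single namespace.
    // The field names are assumptions; check the kube-prometheus version in use.
    local namespace = 'monitoring';

    {
      _config+:: {
        namespace: namespace,
        alertmanagerSelector: 'job="alertmanager-main",namespace="' + namespace + '"',
        prometheusSelector: 'job="prometheus-k8s",namespace="' + namespace + '"',
        prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + namespace + '"',
      },
    }

Regenerating the manifests from a config of this shape would produce the kind of repetitive expr changes shown in the hunks below.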


@@ -314,7 +314,7 @@ spec:
message: Alertmanager has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
expr: |
- absent(up{job="alertmanager-main"} == 1)
+ absent(up{job="alertmanager-main",namespace="monitoring"} == 1)
for: 15m
labels:
severity: critical
@@ -386,7 +386,7 @@ spec:
message: Prometheus has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
expr: |
- absent(up{job="prometheus-k8s"} == 1)
+ absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
for: 15m
labels:
severity: critical
@@ -395,7 +395,7 @@ spec:
message: PrometheusOperator has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
expr: |
- absent(up{job="prometheus-operator"} == 1)
+ absent(up{job="prometheus-operator",namespace="monitoring"} == 1)
for: 15m
labels:
severity: critical
@@ -799,7 +799,7 @@ spec:
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
are out of sync.
expr: |
- count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
+ count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
for: 5m
labels:
severity: critical
@@ -808,7 +808,7 @@ spec:
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
expr: |
- alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
+ alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
for: 10m
labels:
severity: warning
@@ -816,9 +816,9 @@ spec:
annotations:
message: Alertmanager has not found all other members of the cluster.
expr: |
- alertmanager_cluster_members{job="alertmanager-main"}
+ alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
!= on (service) GROUP_LEFT()
- count by (service) (alertmanager_cluster_members{job="alertmanager-main"})
+ count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
for: 5m
labels:
severity: critical
@@ -865,7 +865,7 @@ spec:
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
summary: Reloading Prometheus' configuration failed
expr: |
- prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0
+ prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0
for: 10m
labels:
severity: warning
@@ -875,7 +875,7 @@ spec:
$labels.pod}}
summary: Prometheus' alert notification queue is running full
expr: |
- predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"}
+ predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}
for: 10m
labels:
severity: warning
@@ -885,7 +885,7 @@ spec:
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
summary: Errors while sending alert from Prometheus
expr: |
- rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01
+ rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01
for: 10m
labels:
severity: warning
@@ -895,7 +895,7 @@ spec:
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
summary: Errors while sending alerts from Prometheus
expr: |
- rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03
+ rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03
for: 10m
labels:
severity: critical
@@ -905,7 +905,7 @@ spec:
to any Alertmanagers
summary: Prometheus is not connected to any Alertmanagers
expr: |
- prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1
+ prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1
for: 10m
labels:
severity: warning
@@ -915,7 +915,7 @@ spec:
reload failures over the last four hours.'
summary: Prometheus has issues reloading data blocks from disk
expr: |
- increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0
+ increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
for: 12h
labels:
severity: warning
@@ -925,7 +925,7 @@ spec:
compaction failures over the last four hours.'
summary: Prometheus has issues compacting sample blocks
expr: |
- increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0
+ increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
for: 12h
labels:
severity: warning
@@ -935,7 +935,7 @@ spec:
log (WAL).'
summary: Prometheus write-ahead log is corrupted
expr: |
- tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0
+ tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
for: 4h
labels:
severity: warning
@@ -945,7 +945,7 @@ spec:
samples.
summary: Prometheus isn't ingesting samples
expr: |
- rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0
+ rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
for: 10m
labels:
severity: warning
@@ -955,7 +955,7 @@ spec:
due to duplicate timestamps but different values'
summary: Prometheus has many samples rejected
expr: |
- increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0
+ increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
severity: warning
@@ -966,7 +966,7 @@ spec:
message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
}} Namespace.
expr: |
- rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+ rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
severity: warning
@@ -974,7 +974,7 @@ spec:
annotations:
message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
expr: |
- rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
+ rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
severity: warning
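
For context on the recurring change: absent(up{job="alertmanager-main"} == 1) stays silent as long as any namespace exposes a healthy target under that job name, so a same-named job elsewhere in the cluster could mask an outage of this stack. Adding the namespace matcher pins each alert to the namespace the stack is deployed in. A before/after illustration using values from the hunks above:

    # Suppressed if a healthy alertmanager-main target exists in any namespace:
    absent(up{job="alertmanager-main"} == 1)

    # Suppressed only by a healthy target in the monitoring namespace:
    absent(up{job="alertmanager-main",namespace="monitoring"} == 1)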