Merge pull request #1891 from metalmatze/prometheus-operator-alerts

kube-prometheus: Add Prometheus Operator alerts
Authored by Lucas Servén Marín on 2018-09-14 14:22:10 +02:00, committed by GitHub
6 changed files with 92 additions and 14 deletions

View File

@@ -1,4 +1,5 @@
 (import 'alertmanager.libsonnet') +
 (import 'general.libsonnet') +
 (import 'node.libsonnet') +
-(import 'prometheus.libsonnet')
+(import 'prometheus.libsonnet') +
+(import 'prometheus-operator.libsonnet')
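
For context, these alert files compose through jsonnet object merging: each file adds its rule groups through a +: field, so chaining the imports concatenates the groups. A minimal standalone sketch (illustrative names only, not files in this repository):

// Two toy mixins, each contributing one alert group.
local a = { prometheusAlerts+:: { groups+: [{ name: 'a', rules: [] }] } };
local b = { prometheusAlerts+:: { groups+: [{ name: 'b', rules: [] }] } };

// Adding the objects appends b's groups after a's, just as chaining the
// import expressions above does for the real alert files.
(a + b).prometheusAlerts  // => { groups: [{ name: 'a', ... }, { name: 'b', ... }] }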

View File

@@ -0,0 +1,50 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus-operator',
        rules: [
          {
            alert: 'PrometheusOperatorAlertmanagerReconcileErrors',
            expr: |||
              rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Errors while reconciling Alertmanager in {{ $labels.namespace }} Namespace.',
            },
            'for': '10m',
          },
          {
            alert: 'PrometheusOperatorPrometheusReconcileErrors',
            expr: |||
              rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
            },
            'for': '10m',
          },
          {
            alert: 'PrometheusOperatorNodeLookupErrors',
            expr: |||
              rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
            },
            'for': '10m',
          },
        ],
      },
    ],
  },
}
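
The %(prometheusOperatorSelector)s placeholders above are substituted from $._config when the mixin is rendered. A minimal sketch of the expected configuration; the actual default lives elsewhere in kube-prometheus, and job="prometheus-operator" is assumed here because it matches the rendered rules at the end of this diff:

{
  _config+:: {
    // Assumed value; the generated manifest below uses this selector.
    prometheusOperatorSelector: 'job="prometheus-operator"',
  },
}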

View File

@@ -8,7 +8,7 @@
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
} }
}, },
"version": "ce4ab08d6791161267204d9a61588e64f1b57e05" "version": "bffc85d6e76f6341d5370af68ea980030ab402e8"
}, },
{ {
"name": "ksonnet", "name": "ksonnet",
@@ -28,7 +28,7 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "d445c4d98fdf88fd3c59bb34ca4b0f82536f878c" "version": "c70814dcafce1b51357938e09ee1192998a95706"
}, },
{ {
"name": "grafonnet", "name": "grafonnet",
@@ -78,7 +78,7 @@
"subdir": "Documentation/etcd-mixin" "subdir": "Documentation/etcd-mixin"
} }
}, },
"version": "1df1ddff4361ed7f2c0f33571923511889a115ce" "version": "001bbb97ccea05cb0d5f6e97c3939654244e8998"
} }
] ]
} }

View File

@@ -4707,7 +4707,7 @@ items:
"step": 10 "step": 10
}, },
{ {
"expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container_name!=\"\"}) by (container)", "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)",
"format": "table", "format": "table",
"instant": true, "instant": true,
"intervalFactor": 2, "intervalFactor": 2,

View File

@@ -838,7 +838,7 @@ spec:
           the limit of 110.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
-        kubelet_running_pod_count{job="kubelet"} > 100
+        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
       for: 15m
       labels:
         severity: warning
@@ -914,8 +914,8 @@ spec:
         severity: critical
     - alert: AlertmanagerDownOrMissing
       annotations:
-        description: An unexpected number of Alertmanagers are scraped or Alertmanagers
-          disappeared from discovery.
+        description: An unexpected number of Alertmanagers were scraped or disappeared
+          from discovery.
         summary: Alertmanager down or missing
       expr: |
         label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1
@@ -936,7 +936,7 @@ spec:
     rules:
     - alert: TargetDown
       annotations:
-        description: '{{ $value }}% of {{ $labels.job }} targets are down.'
+        description: '{{ $value }}% of the {{ $labels.job }} targets are down.'
         summary: Targets are down
       expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
       for: 10m
@@ -944,7 +944,7 @@ spec:
         severity: warning
     - alert: DeadMansSwitch
       annotations:
-        description: This is a DeadMansSwitch meant to ensure that the entire Alerting
+        description: This is a DeadMansSwitch meant to ensure that the entire alerting
           pipeline is functional.
         summary: Alerting DeadMansSwitch
       expr: vector(1)
@@ -955,7 +955,7 @@ spec:
     - alert: NodeDiskRunningFull
       annotations:
         message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
-          }}/{{ $labels.pod }} is running full within the next 24 hours.
+          }}/{{ $labels.pod }} will be full within the next 24 hours.
       expr: |
         (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
       for: 30m
@@ -964,7 +964,7 @@ spec:
     - alert: NodeDiskRunningFull
       annotations:
         message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
-          }}/{{ $labels.pod }} is running full within the next 2 hours.
+          }}/{{ $labels.pod }} will be full within the next 2 hours.
       expr: |
         (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
       for: 10m
@@ -1071,3 +1071,30 @@ spec:
       for: 10m
       labels:
         severity: warning
+  - name: prometheus-operator
+    rules:
+    - alert: PrometheusOperatorAlertmanagerReconcileErrors
+      annotations:
+        message: Errors while reconciling Alertmanager in {{ $labels.namespace }}
+          Namespace.
+      expr: |
+        rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorPrometheusReconcileErrors
+      annotations:
+        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
+      expr: |
+        rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorNodeLookupErrors
+      annotations:
+        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
+      expr: |
+        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning