From 8d38e81521b9ef1f51bdca72be2fa7db041f2764 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 13 Sep 2018 14:43:46 +0200 Subject: [PATCH 1/7] contrib/kube-prometheus: Create missing Prometheus operator alerts --- .../kube-prometheus/alerts/alerts.libsonnet | 3 +- .../alerts/prometheus-operator.libsonnet | 50 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet diff --git a/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/jsonnet/kube-prometheus/alerts/alerts.libsonnet index 19568a24..1b2d94eb 100644 --- a/jsonnet/kube-prometheus/alerts/alerts.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alerts.libsonnet @@ -1,4 +1,5 @@ (import 'alertmanager.libsonnet') + (import 'general.libsonnet') + (import 'node.libsonnet') + -(import 'prometheus.libsonnet') +(import 'prometheus.libsonnet') + +(import 'prometheus-operator.libsonnet') diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet new file mode 100644 index 00000000..33dd97ce --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet @@ -0,0 +1,50 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'prometheus-operator', + rules: [ + { + alert: 'PrometheusOperatorAlertmanagerReconcileErrors', + expr: ||| + rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Errors while reconciling Alertmanager in {{$labels.namespace}} namespace.', + }, + 'for': '10m', + }, + { + alert: 'PrometheusOperatorPrometheusReconcileErrors', + expr: ||| + rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Errors while reconciling Prometheus in 
{{$labels.namespace}} namespace.', + }, + 'for': '10m', + }, + { + alert: 'PrometheusOperatorNodeLookupErrors', + expr: ||| + rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Errors while reconciling Prometheus in {{$labels.namespace}} namespace.', + }, + 'for': '10m', + }, + ], + }, + ], + }, +} From 5a935379d649ce0fd69f67497d64592929f977b8 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 13 Sep 2018 14:50:47 +0200 Subject: [PATCH 2/7] contrib/kube-prometheus: Run jb update and generate all manifests --- jsonnetfile.lock.json | 8 ++--- manifests/grafana-dashboardDefinitions.yaml | 2 +- manifests/prometheus-rules.yaml | 40 +++++++++++++++++---- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 613b0ad8..9817f7a9 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "ce4ab08d6791161267204d9a61588e64f1b57e05" + "version": "00c64bc438d2acf9c808388fe1e5d733e92b0c3b" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "d445c4d98fdf88fd3c59bb34ca4b0f82536f878c" + "version": "c70814dcafce1b51357938e09ee1192998a95706" }, { "name": "grafonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "1df1ddff4361ed7f2c0f33571923511889a115ce" + "version": "a7b1306ecfefeabe48286403b260513786289922" } ] -} \ No newline at end of file +} diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e8f39619..af68467a 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -4707,7 +4707,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", 
container_name!=\"\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 7958b926..17b0de1d 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -838,7 +838,7 @@ spec: the limit of 110. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods expr: | - kubelet_running_pod_count{job="kubelet"} > 100 + kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 for: 15m labels: severity: warning @@ -914,8 +914,8 @@ spec: severity: critical - alert: AlertmanagerDownOrMissing annotations: - description: An unexpected number of Alertmanagers are scraped or Alertmanagers - disappeared from discovery. + description: An unexpected number of Alertmanagers were scraped or disappeared + from discovery. summary: Alertmanager down or missing expr: | label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1 @@ -936,7 +936,7 @@ spec: rules: - alert: TargetDown annotations: - description: '{{ $value }}% of {{ $labels.job }} targets are down.' + description: '{{ $value }}% of the {{ $labels.job }} targets are down.' summary: Targets are down expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 for: 10m @@ -944,7 +944,7 @@ spec: severity: warning - alert: DeadMansSwitch annotations: - description: This is a DeadMansSwitch meant to ensure that the entire Alerting + description: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional. 
summary: Alerting DeadMansSwitch expr: vector(1) @@ -955,7 +955,7 @@ spec: - alert: NodeDiskRunningFull annotations: message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace - }}/{{ $labels.pod }} is running full within the next 24 hours. + }}/{{ $labels.pod }} will be full within the next 24 hours. expr: | (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) for: 30m @@ -964,7 +964,7 @@ spec: - alert: NodeDiskRunningFull annotations: message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace - }}/{{ $labels.pod }} is running full within the next 2 hours. + }}/{{ $labels.pod }} will be full within the next 2 hours. expr: | (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) for: 10m @@ -1071,3 +1071,29 @@ spec: for: 10m labels: severity: warning + - name: prometheus-operator + rules: + - alert: PrometheusOperatorAlertmanagerReconcileErrors + annotations: + message: Errors while reconciling Alertmanager in {{$labels.namespace}} namespace. + expr: | + rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.01 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorPrometheusReconcileErrors + annotations: + message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. + expr: | + rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.01 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. 
+ expr: | + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.01 + for: 10m + labels: + severity: warning From 8965c3e7b60ab3767c15d7c948096e493e8c8c47 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 13 Sep 2018 17:44:47 +0200 Subject: [PATCH 3/7] *: Add missing newline at the end of jsonnetfile.json --- jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.json b/jsonnetfile.json index b4ebb0f2..619586b2 100644 --- a/jsonnetfile.json +++ b/jsonnetfile.json @@ -11,4 +11,4 @@ "version": "." } ] -} \ No newline at end of file +} From 407aaa5e2feb5a56c270091c44bf7b4440549d4b Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 14 Sep 2018 11:08:15 +0200 Subject: [PATCH 4/7] contrib/kube-prometheus: Alert in 10% errors when reconciling Prom & Alertmanager --- .../kube-prometheus/alerts/prometheus-operator.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet index 33dd97ce..dd176271 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet @@ -7,7 +7,7 @@ { alert: 'PrometheusOperatorAlertmanagerReconcileErrors', expr: ||| - rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 ||| % $._config, labels: { severity: 'warning', @@ -20,7 +20,7 @@ { alert: 'PrometheusOperatorPrometheusReconcileErrors', expr: ||| - rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 ||| % $._config, labels: { severity: 'warning', @@ -33,7 +33,7 @@ { alert:
'PrometheusOperatorNodeLookupErrors', expr: ||| - rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 ||| % $._config, labels: { severity: 'warning', From b308b25accfd0ad32aed0d90f8f4fb76e249f10a Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 14 Sep 2018 11:29:45 +0200 Subject: [PATCH 5/7] contrib/kube-prometheus: Generate new rules based on latest jsonnet changes --- jsonnetfile.lock.json | 4 ++-- manifests/prometheus-rules.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 9817f7a9..035f78be 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "00c64bc438d2acf9c808388fe1e5d733e92b0c3b" + "version": "34035de0f6c20ed3d84ba9a28e23765f11cb0b9f" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "a7b1306ecfefeabe48286403b260513786289922" + "version": "001bbb97ccea05cb0d5f6e97c3939654244e8998" } ] } diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 17b0de1d..4f4de5d4 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1077,7 +1077,7 @@ spec: annotations: message: Errors while reconciling Alertmanager in {{$labels.namespace}} namespace. expr: | - rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.01 + rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m labels: severity: warning @@ -1085,7 +1085,7 @@ spec: annotations: message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. 
expr: | - rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.01 + rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m labels: severity: warning @@ -1093,7 +1093,7 @@ spec: annotations: message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. expr: | - rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.01 + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m labels: severity: warning From 24141c464ff8be00b495e13750730ca072f6297e Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 14 Sep 2018 13:33:49 +0200 Subject: [PATCH 6/7] contrib/kube-prometheus: Improve consistency of Prometheus Operator alerts --- .../kube-prometheus/alerts/prometheus-operator.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet index dd176271..f851caa0 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet @@ -13,7 +13,7 @@ severity: 'warning', }, annotations: { - message: 'Errors while reconciling Alertmanager in {{$labels.namespace}} namespace.', + message: 'Errors while reconciling Alertmanager in {{ $labels.namespace }} Namespace.', }, 'for': '10m', }, @@ -26,7 +26,7 @@ severity: 'warning', }, annotations: { - message: 'Errors while reconciling Prometheus in {{$labels.namespace}} namespace.', + message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.', }, 'for': '10m', }, @@ -39,7 +39,7 @@ severity: 'warning', }, annotations: { - message: 'Errors while reconciling Prometheus in {{$labels.namespace}} namespace.', + message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.', }, 'for': 
'10m', }, From df65f57fb48bb1564a73f911ccc1bf1d8535476d Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 14 Sep 2018 13:46:18 +0200 Subject: [PATCH 7/7] contrib/kube-prometheus: Generate new manifests after fixing typos in rules --- jsonnetfile.lock.json | 2 +- manifests/prometheus-rules.yaml | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 035f78be..e6904980 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "34035de0f6c20ed3d84ba9a28e23765f11cb0b9f" + "version": "bffc85d6e76f6341d5370af68ea980030ab402e8" }, { "name": "ksonnet", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 4f4de5d4..221fa726 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1075,7 +1075,8 @@ spec: rules: - alert: PrometheusOperatorAlertmanagerReconcileErrors annotations: - message: Errors while reconciling Alertmanager in {{$labels.namespace}} namespace. + message: Errors while reconciling Alertmanager in {{ $labels.namespace }} + Namespace. expr: | rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m @@ -1083,7 +1084,7 @@ spec: severity: warning - alert: PrometheusOperatorPrometheusReconcileErrors annotations: - message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. + message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. expr: | rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m @@ -1091,7 +1092,7 @@ spec: severity: warning - alert: PrometheusOperatorNodeLookupErrors annotations: - message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. + message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
expr: | rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m