From ba052559cf7acf7deaf4c045a94723a128956e97 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Thu, 14 Jan 2021 15:56:55 +0100 Subject: [PATCH 1/7] jsonnet: separate mixins into components Signed-off-by: paulfantom --- .../alertmanager/alertmanager.libsonnet | 28 +++++++++ .../kube-prometheus/kube-prometheus.libsonnet | 41 +++++++++---- .../kube-state-metrics.libsonnet | 31 +++++++++- .../mixins/monitoring-mixins.libsonnet | 42 ------------- .../node-exporter/node-exporter.libsonnet | 53 +++++++++++++---- .../prometheus-operator.libsonnet | 29 +++++++++ .../prometheus/prometheus.libsonnet | 59 +++++++++++-------- 7 files changed, 191 insertions(+), 92 deletions(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 2fee6e1e..452a2cc2 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -55,6 +55,14 @@ local defaults = { ], }, replicas: 3, + mixin: { + ruleLabels: {}, + _config: { + alertmanagerName: '{{ $labels.namespace }}/{{ $labels.pod}}', + alertmanagerClusterLabels: 'namespace,service', + alertmanagerSelector: 'job="alertmanager-' + defaults.name + '",namespace="' + defaults.namespace + '"', + }, + }, }; @@ -63,6 +71,26 @@ function(params) { config:: defaults + params, // Safety check assert std.isObject(am.config.resources), + assert std.isObject(am.config.mixin._config), + + mixin:: (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet') { + _config+:: am.config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: am.config.commonLabels + am.config.mixin.ruleLabels, + name: am.config.name + '-rules', + namespace: am.config.namespace, + }, + spec: { + local r = if std.objectHasAll(am.mixin, 'prometheusRules') then am.mixin.prometheusRules else {}, + local a = if std.objectHasAll(am.mixin, 'prometheusAlerts') then am.mixin.prometheusAlerts else {}, + groups: a + r, + }, + }, secret: { apiVersion: 'v1', diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index e97f07f1..6688f023 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -5,6 +5,7 @@ local nodeExporter = import './node-exporter/node-exporter.libsonnet'; local prometheusAdapter = import './prometheus-adapter/prometheus-adapter.libsonnet'; local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet'; local prometheus = import './prometheus/prometheus.libsonnet'; +local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet'; local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; @@ -15,6 +16,9 @@ local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; namespace: $._config.namespace, version: '0.21.0', image: 'quay.io/prometheus/alertmanager:v0.21.0', + mixin+: { + ruleLabels: $._config.ruleLabels, + }, }), blackboxExporter: blackboxExporter({ namespace: $._config.namespace, @@ -25,11 +29,17 @@ local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; namespace: $._config.namespace, version: '1.9.7', image: 'quay.io/coreos/kube-state-metrics:v1.9.7', + mixin+: { + ruleLabels: $._config.ruleLabels, + }, }), nodeExporter: nodeExporter({ namespace: $._config.namespace, version: '1.0.1', image: 
'quay.io/prometheus/node-exporter:v1.0.1', + mixin+: { + ruleLabels: $._config.ruleLabels, + }, }), prometheus: prometheus({ namespace: $._config.namespace, @@ -37,7 +47,9 @@ local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; image: 'quay.io/prometheus/prometheus:v2.24.0', name: $._config.prometheusName, alertmanagerName: $._config.alertmanagerName, - rules: $.allRules, + mixin+: { + ruleLabels: $._config.ruleLabels, + }, }), prometheusAdapter: prometheusAdapter({ namespace: $._config.namespace, @@ -53,25 +65,26 @@ local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; commonLabels+: { 'app.kubernetes.io/part-of': 'kube-prometheus', }, + mixin+: { + ruleLabels: $._config.ruleLabels, + }, }), mixins+:: monitoringMixins({ namespace: $._config.namespace, - alertmanagerName: $._config.alertmanagerName, - prometheusName: $._config.prometheusName, }), // FIXME(paulfantom) Remove this variable by moving each mixin to its own component // Example: node_exporter mixin could be added in ./node-exporter/node-exporter.libsonnet allRules:: - $.mixins.nodeExporter.prometheusRules + + //$.mixins.nodeExporter.prometheusRules + $.mixins.kubernetes.prometheusRules + $.mixins.base.prometheusRules + - $.mixins.kubeStateMetrics.prometheusAlerts + - $.mixins.nodeExporter.prometheusAlerts + - $.mixins.alertmanager.prometheusAlerts + - $.mixins.prometheusOperator.prometheusAlerts + + //$.mixins.kubeStateMetrics.prometheusAlerts + + //$.mixins.nodeExporter.prometheusAlerts + + //$.mixins.alertmanager.prometheusAlerts + + //$.mixins.prometheusOperator.prometheusAlerts + $.mixins.kubernetes.prometheusAlerts + - $.mixins.prometheus.prometheusAlerts + + //$.mixins.prometheus.prometheusAlerts + $.mixins.base.prometheusAlerts, kubePrometheus+:: { @@ -118,6 +131,10 @@ local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; namespace: 'default', prometheusName: 'k8s', alertmanagerName: 'main', + ruleLabels: { + role: 'alert-rules', + prometheus: $._config.prometheusName, + }, versions+:: { grafana: '7.3.5' }, @@ -131,9 +148,9 @@ local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; // FIXME(paulfantom): Same as with rules and alerts. // This should be gathering all dashboards from components without having to enumerate all dashboards. 
dashboards: - $.mixins.nodeExporter.grafanaDashboards + - $.mixins.kubernetes.grafanaDashboards + - $.mixins.prometheus.grafanaDashboards, + //$.mixins.nodeExporter.grafanaDashboards + + $.mixins.kubernetes.grafanaDashboards, + //$.mixins.prometheus.grafanaDashboards, }, }, } diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 037d023b..c3236589 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -2,6 +2,7 @@ local krp = (import '../kube-rbac-proxy/container.libsonnet'); local defaults = { local defaults = self, + name: 'kube-state-metrics', namespace: error 'must provide namespace', version: error 'must provide version', image: error 'must provide version', @@ -13,7 +14,7 @@ local defaults = { scrapeInterval: '30s', scrapeTimeout: '30s', commonLabels:: { - 'app.kubernetes.io/name': 'kube-state-metrics', + 'app.kubernetes.io/name': defaults.name, 'app.kubernetes.io/version': defaults.version, 'app.kubernetes.io/component': 'exporter', 'app.kubernetes.io/part-of': 'kube-prometheus', @@ -23,6 +24,12 @@ local defaults = { for labelName in std.objectFields(defaults.commonLabels) if !std.setMember(labelName, ['app.kubernetes.io/version']) }, + mixin: { + ruleLabels: {}, + _config: { + kubeStateMetricsSelector: 'job="' + defaults.name + '"', + }, + }, }; function(params) (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet') { @@ -30,14 +37,34 @@ function(params) (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube- config:: defaults + params, // Safety check assert std.isObject(ksm.config.resources), + assert std.isObject(ksm.config.mixin._config), - name:: 'kube-state-metrics', + name:: ksm.config.name, namespace:: ksm.config.namespace, version:: ksm.config.version, image:: ksm.config.image, commonLabels:: ksm.config.commonLabels, podLabels:: ksm.config.selectorLabels, + mixin:: (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') { + _config+:: ksm.config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: ksm.config.commonLabels + ksm.config.mixin.ruleLabels, + name: ksm.config.name + '-rules', + namespace: ksm.config.namespace, + }, + spec: { + local r = if std.objectHasAll(ksm.mixin, 'prometheusRules') then ksm.mixin.prometheusRules else {}, + local a = if std.objectHasAll(ksm.mixin, 'prometheusAlerts') then ksm.mixin.prometheusAlerts else {}, + groups: a + r, + }, + }, + service+: { spec+: { ports: [ diff --git a/jsonnet/kube-prometheus/mixins/monitoring-mixins.libsonnet b/jsonnet/kube-prometheus/mixins/monitoring-mixins.libsonnet index bb035837..95b88db6 100644 --- a/jsonnet/kube-prometheus/mixins/monitoring-mixins.libsonnet +++ b/jsonnet/kube-prometheus/mixins/monitoring-mixins.libsonnet @@ -1,7 +1,5 @@ local defaults = { namespace: error 'must provide namespace', - prometheusName: error 'must provide Prometheus resource name', - alertmanagerName: error 'must provide Alertmanager resource name', }; function(params) { @@ -34,44 +32,4 @@ function(params) { hostNetworkInterfaceSelector: 'device!~"veth.+"', }, }, - - kubeStateMetrics: - (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') { - _config+:: { - 
kubeStateMetricsSelector: 'job="kube-state-metrics"', - }, - }, - - prometheusOperator: - (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') { - _config+:: { - prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + m.config.namespace + '"', - }, - }, - - prometheus: - (import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') { - _config+:: { - prometheusSelector: 'job="prometheus-' + m.config.prometheusName + '",namespace="' + m.config.namespace + '"', - prometheusName: '{{$labels.namespace}}/{{$labels.pod}}', - }, - }, - - alertmanager: - (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet') { - _config+:: { - alertmanagerName: '{{ $labels.namespace }}/{{ $labels.pod}}', - alertmanagerClusterLabels: 'namespace,service', - alertmanagerSelector: 'job="alertmanager-' + m.config.alertmanagerName + '",namespace="' + m.config.namespace + '"', - }, - }, - - nodeExporter: - (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') { - _config+:: { - nodeExporterSelector: 'job="node-exporter"', - fsSpaceFillingUpCriticalThreshold: 15, - diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', - }, - }, } diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index bb16fc41..1c4321b1 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -2,6 +2,7 @@ local krp = (import '../kube-rbac-proxy/container.libsonnet'); local defaults = { local defaults = self, + name: 'node-exporter', namespace: error 'must provide namespace', version: error 'must provide version', image: error 'must provide version', @@ -12,7 +13,7 @@ local defaults = { listenAddress: '127.0.0.1', port: 9100, commonLabels:: { - 'app.kubernetes.io/name': 'node-exporter', + 'app.kubernetes.io/name': defaults.name, 'app.kubernetes.io/version': defaults.version, 'app.kubernetes.io/component': 'exporter', 'app.kubernetes.io/part-of': 'kube-prometheus', @@ -22,6 +23,14 @@ local defaults = { for labelName in std.objectFields(defaults.commonLabels) if !std.setMember(labelName, ['app.kubernetes.io/version']) }, + mixin: { + ruleLabels: {}, + _config: { + nodeExporterSelector: 'job="' + defaults.name + '"', + fsSpaceFillingUpCriticalThreshold: 15, + diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', + }, + }, }; @@ -30,22 +39,42 @@ function(params) { config:: defaults + params, // Safety check assert std.isObject(ne.config.resources), + assert std.isObject(ne.config.mixin._config), + + mixin:: (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') { + _config+:: ne.config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: ne.config.commonLabels + ne.config.mixin.ruleLabels, + name: ne.config.name + '-rules', + namespace: ne.config.namespace, + }, + spec: { + local r = if std.objectHasAll(ne.mixin, 'prometheusRules') then ne.mixin.prometheusRules else {}, + local a = if std.objectHasAll(ne.mixin, 'prometheusAlerts') then ne.mixin.prometheusAlerts else {}, + groups: a + r, + }, + }, clusterRoleBinding: { apiVersion: 'rbac.authorization.k8s.io/v1', kind: 'ClusterRoleBinding', metadata: { - name: 'node-exporter', + name: ne.config.name, labels: ne.config.commonLabels, }, 
roleRef: { apiGroup: 'rbac.authorization.k8s.io', kind: 'ClusterRole', - name: 'node-exporter', + name: ne.config.name, }, subjects: [{ kind: 'ServiceAccount', - name: 'node-exporter', + name: ne.config.name, namespace: ne.config.namespace, }], }, @@ -54,7 +83,7 @@ function(params) { apiVersion: 'rbac.authorization.k8s.io/v1', kind: 'ClusterRole', metadata: { - name: 'node-exporter', + name: ne.config.name, labels: ne.config.commonLabels, }, rules: [ @@ -75,7 +104,7 @@ function(params) { apiVersion: 'v1', kind: 'ServiceAccount', metadata: { - name: 'node-exporter', + name: ne.config.name, namespace: ne.config.namespace, labels: ne.config.commonLabels, }, @@ -85,7 +114,7 @@ function(params) { apiVersion: 'v1', kind: 'Service', metadata: { - name: 'node-exporter', + name: ne.config.name, namespace: ne.config.namespace, labels: ne.config.commonLabels, }, @@ -102,7 +131,7 @@ function(params) { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', metadata: { - name: 'node-exporter', + name: ne.config.name, namespace: ne.config.namespace, labels: ne.config.commonLabels, }, @@ -134,7 +163,7 @@ function(params) { daemonset: local nodeExporter = { - name: 'node-exporter', + name: ne.config.name, image: ne.config.image, args: [ '--web.listen-address=' + std.join(':', [ne.config.listenAddress, std.toString(ne.config.port)]), @@ -177,7 +206,7 @@ function(params) { apiVersion: 'apps/v1', kind: 'DaemonSet', metadata: { - name: 'node-exporter', + name: ne.config.name, namespace: ne.config.namespace, labels: ne.config.commonLabels, }, @@ -199,7 +228,7 @@ function(params) { { name: 'sys', hostPath: { path: '/sys' } }, { name: 'root', hostPath: { path: '/' } }, ], - serviceAccountName: 'node-exporter', + serviceAccountName: ne.config.name, securityContext: { runAsUser: 65534, runAsNonRoot: true, @@ -210,4 +239,6 @@ function(params) { }, }, }, + + } diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index 26940c61..b1497416 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -23,6 +23,15 @@ local defaults = { for labelName in std.objectFields(defaults.commonLabels) if !std.setMember(labelName, ['app.kubernetes.io/version']) }, + mixin: { + ruleLabels: { + role: 'alert-rules', + prometheus: defaults.name, + }, + _config: { + prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"', + }, + }, }; function(params) @@ -31,6 +40,26 @@ function(params) assert std.isObject(config.resources); prometheusOperator(config) { + local po = self, + mixin:: (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') { + _config+:: config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: config.commonLabels + config.mixin.ruleLabels, + name: config.name + '-rules', + namespace: config.namespace, + }, + spec: { + local r = if std.objectHasAll(po.mixin, 'prometheusRules') then po.mixin.prometheusRules else {}, + local a = if std.objectHasAll(po.mixin, 'prometheusAlerts') then po.mixin.prometheusAlerts else {}, + groups: a + r, + }, + }, + service+: { spec+: { ports: [ diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index e1b66923..1673c9e7 100644 --- 
a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -13,9 +13,6 @@ local defaults = { alertmanagerName: error 'must provide alertmanagerName', namespaces: ['default', 'kube-system', defaults.namespace], replicas: 2, - rules: { - groups: [], - }, commonLabels:: { 'app.kubernetes.io/name': 'prometheus', 'app.kubernetes.io/version': defaults.version, @@ -27,6 +24,19 @@ local defaults = { for labelName in std.objectFields(defaults.commonLabels) if !std.setMember(labelName, ['app.kubernetes.io/version']) } + { prometheus: defaults.name }, + ruleSelector: { + matchLabels: defaults.mixin.ruleLabels, + }, + mixin: { + ruleLabels: { + role: 'alert-rules', + prometheus: defaults.name, + }, + _config: { + prometheusSelector: 'job="prometheus-' + defaults.name + '",namespace="' + defaults.namespace + '"', + prometheusName: '{{$labels.namespace}}/{{$labels.pod}}', + }, + }, }; @@ -35,6 +45,26 @@ function(params) { config:: defaults + params, // Safety check assert std.isObject(p.config.resources), + assert std.isObject(p.config.mixin._config), + + mixin:: (import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') { + _config+:: p.config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: p.config.commonLabels + p.config.mixin.ruleLabels, + name: p.config.name + '-rules', + namespace: p.config.namespace, + }, + spec: { + local r = if std.objectHasAll(p.mixin, 'prometheusRules') then p.mixin.prometheusRules else {}, + local a = if std.objectHasAll(p.mixin, 'prometheusAlerts') then p.mixin.prometheusAlerts else {}, + groups: a + r, + }, + }, serviceAccount: { apiVersion: 'v1', @@ -63,22 +93,6 @@ function(params) { }, }, - rules: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'PrometheusRule', - metadata: { - labels: { - prometheus: p.config.name, - role: 'alert-rules', - } + p.config.commonLabels, - name: 'prometheus-' + p.config.name + '-rules', - namespace: p.config.namespace, - }, - spec: { - groups: p.config.rules.groups, - }, - }, - roleBindingSpecificNamespaces: local newSpecificRoleBinding(namespace) = { apiVersion: 'rbac.authorization.k8s.io/v1', @@ -230,12 +244,7 @@ function(params) { podMonitorNamespaceSelector: {}, probeNamespaceSelector: {}, nodeSelector: { 'kubernetes.io/os': 'linux' }, - ruleSelector: { - matchLabels: { - role: 'alert-rules', - prometheus: p.config.name, - }, - }, + ruleSelector: p.config.ruleSelector, resources: p.config.resources, alerting: { alertmanagers: [{ From e556dbfd88a963c73159246b950b75193d72d800 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Thu, 14 Jan 2021 16:58:14 +0100 Subject: [PATCH 2/7] jsonnet: add separated kubernetes mixin and custom kube prometheus alerts --- example.jsonnet | 11 +++-- .../kube-prometheus/kube-prometheus.libsonnet | 34 ++++++------- .../{ => mixin}/alerts/alerts.libsonnet | 0 .../{ => mixin}/alerts/general.libsonnet | 0 .../{ => mixin}/alerts/node.libsonnet | 0 .../{ => mixin}/alerts/tests.yaml | 0 .../kube-prometheus/mixin/custom.libsonnet | 42 ++++++++++++++++ .../mixin/kubernetes.libsonnet | 49 +++++++++++++++++++ .../{ => mixin}/rules/general.libsonnet | 0 .../{ => mixin}/rules/node-rules.libsonnet | 0 .../{ => mixin}/rules/rules.libsonnet | 0 .../mixins/monitoring-mixins.libsonnet | 35 ------------- 12 files changed, 112 insertions(+), 59 deletions(-) rename jsonnet/kube-prometheus/{ => mixin}/alerts/alerts.libsonnet (100%) rename 
jsonnet/kube-prometheus/{ => mixin}/alerts/general.libsonnet (100%) rename jsonnet/kube-prometheus/{ => mixin}/alerts/node.libsonnet (100%) rename jsonnet/kube-prometheus/{ => mixin}/alerts/tests.yaml (100%) create mode 100644 jsonnet/kube-prometheus/mixin/custom.libsonnet create mode 100644 jsonnet/kube-prometheus/mixin/kubernetes.libsonnet rename jsonnet/kube-prometheus/{ => mixin}/rules/general.libsonnet (100%) rename jsonnet/kube-prometheus/{ => mixin}/rules/node-rules.libsonnet (100%) rename jsonnet/kube-prometheus/{ => mixin}/rules/rules.libsonnet (100%) delete mode 100644 jsonnet/kube-prometheus/mixins/monitoring-mixins.libsonnet diff --git a/example.jsonnet b/example.jsonnet index a459460d..94a40b70 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -14,17 +14,20 @@ local kp = }, }; -{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['setup/0namespace-namespace']: kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + -// serviceMonitor is separated so that it can be created after the CRDs are ready +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) } diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 6688f023..6d8710a8 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -1,13 +1,14 @@ local alertmanager = import './alertmanager/alertmanager.libsonnet'; local blackboxExporter = import './blackbox-exporter/blackbox-exporter.libsonnet'; +local customMixin = import './mixin/custom.libsonnet'; local kubeStateMetrics = import './kube-state-metrics/kube-state-metrics.libsonnet'; +local kubernetesMixin = import './mixin/kubernetes.libsonnet'; local nodeExporter = import './node-exporter/node-exporter.libsonnet'; local prometheusAdapter = import './prometheus-adapter/prometheus-adapter.libsonnet'; local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet'; local prometheus = import 
'./prometheus/prometheus.libsonnet'; local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet'; -local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; (import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + { @@ -69,25 +70,18 @@ local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; ruleLabels: $._config.ruleLabels, }, }), - mixins+:: monitoringMixins({ + kubernetesMixin: kubernetesMixin({ namespace: $._config.namespace, + mixin+: { + ruleLabels: $._config.ruleLabels, + }, }), - - // FIXME(paulfantom) Remove this variable by moving each mixin to its own component - // Example: node_exporter mixin could be added in ./node-exporter/node-exporter.libsonnet - allRules:: - //$.mixins.nodeExporter.prometheusRules + - $.mixins.kubernetes.prometheusRules + - $.mixins.base.prometheusRules + - //$.mixins.kubeStateMetrics.prometheusAlerts + - //$.mixins.nodeExporter.prometheusAlerts + - //$.mixins.alertmanager.prometheusAlerts + - //$.mixins.prometheusOperator.prometheusAlerts + - $.mixins.kubernetes.prometheusAlerts + - //$.mixins.prometheus.prometheusAlerts + - $.mixins.base.prometheusAlerts, - - kubePrometheus+:: { + kubePrometheus: customMixin({ + namespace: $._config.namespace, + mixin+: { + ruleLabels: $._config.ruleLabels, + }, + }) + { namespace: { apiVersion: 'v1', kind: 'Namespace', @@ -147,9 +141,9 @@ local monitoringMixins = import './mixins/monitoring-mixins.libsonnet'; }, // FIXME(paulfantom): Same as with rules and alerts. // This should be gathering all dashboards from components without having to enumerate all dashboards. - dashboards: + dashboards: {}, //$.mixins.nodeExporter.grafanaDashboards + - $.mixins.kubernetes.grafanaDashboards, + //$.mixins.kubernetes.grafanaDashboards, //$.mixins.prometheus.grafanaDashboards, }, }, diff --git a/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/jsonnet/kube-prometheus/mixin/alerts/alerts.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/alerts/alerts.libsonnet rename to jsonnet/kube-prometheus/mixin/alerts/alerts.libsonnet diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/mixin/alerts/general.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/alerts/general.libsonnet rename to jsonnet/kube-prometheus/mixin/alerts/general.libsonnet diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/mixin/alerts/node.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/alerts/node.libsonnet rename to jsonnet/kube-prometheus/mixin/alerts/node.libsonnet diff --git a/jsonnet/kube-prometheus/alerts/tests.yaml b/jsonnet/kube-prometheus/mixin/alerts/tests.yaml similarity index 100% rename from jsonnet/kube-prometheus/alerts/tests.yaml rename to jsonnet/kube-prometheus/mixin/alerts/tests.yaml diff --git a/jsonnet/kube-prometheus/mixin/custom.libsonnet b/jsonnet/kube-prometheus/mixin/custom.libsonnet new file mode 100644 index 00000000..d1c0b086 --- /dev/null +++ b/jsonnet/kube-prometheus/mixin/custom.libsonnet @@ -0,0 +1,42 @@ +local defaults = { + name: 'kube-prometheus', + namespace: error 'must provide namespace', + commonLabels:: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + mixin: { + ruleLabels: {}, + _config: { + nodeExporterSelector: 'job="node-exporter"', + hostNetworkInterfaceSelector: 'device!~"veth.+"', + }, + }, +}; + +function(params) { + local m 
= self, + config:: defaults + params, + + local alertsandrules = (import './alerts/alerts.libsonnet') + (import './rules/rules.libsonnet'), + + mixin:: alertsandrules { + _config+:: m.config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: m.config.commonLabels + m.config.mixin.ruleLabels, + name: m.config.name + '-rules', + namespace: m.config.namespace, + }, + spec: { + local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules else {}, + local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts else {}, + groups: a + r, + }, + }, +} diff --git a/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet b/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet new file mode 100644 index 00000000..b3bc563e --- /dev/null +++ b/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet @@ -0,0 +1,49 @@ +local defaults = { + name: 'kubernetes', + namespace: error 'must provide namespace', + commonLabels:: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + mixin: { + ruleLabels: {}, + _config: { + cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', + kubeletSelector: 'job="kubelet", metrics_path="/metrics"', + kubeStateMetricsSelector: 'job="kube-state-metrics"', + nodeExporterSelector: 'job="node-exporter"', + kubeSchedulerSelector: 'job="kube-scheduler"', + kubeControllerManagerSelector: 'job="kube-controller-manager"', + kubeApiserverSelector: 'job="apiserver"', + podLabel: 'pod', + runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s', + diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', + hostNetworkInterfaceSelector: 'device!~"veth.+"', + }, + }, +}; + +function(params) { + local m = self, + config:: defaults + params, + + mixin:: (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') { + _config+:: m.config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: m.config.commonLabels + m.config.mixin.ruleLabels, + name: m.config.name + '-rules', + namespace: m.config.namespace, + }, + spec: { + local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules else {}, + local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts else {}, + groups: a + r, + }, + }, +} diff --git a/jsonnet/kube-prometheus/rules/general.libsonnet b/jsonnet/kube-prometheus/mixin/rules/general.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/rules/general.libsonnet rename to jsonnet/kube-prometheus/mixin/rules/general.libsonnet diff --git a/jsonnet/kube-prometheus/rules/node-rules.libsonnet b/jsonnet/kube-prometheus/mixin/rules/node-rules.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/rules/node-rules.libsonnet rename to jsonnet/kube-prometheus/mixin/rules/node-rules.libsonnet diff --git a/jsonnet/kube-prometheus/rules/rules.libsonnet b/jsonnet/kube-prometheus/mixin/rules/rules.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/rules/rules.libsonnet rename to jsonnet/kube-prometheus/mixin/rules/rules.libsonnet diff --git a/jsonnet/kube-prometheus/mixins/monitoring-mixins.libsonnet b/jsonnet/kube-prometheus/mixins/monitoring-mixins.libsonnet deleted file mode 100644 index 95b88db6..00000000 --- 
a/jsonnet/kube-prometheus/mixins/monitoring-mixins.libsonnet +++ /dev/null @@ -1,35 +0,0 @@ -local defaults = { - namespace: error 'must provide namespace', -}; - -function(params) { - local m = self, - config:: defaults + params, - base+: - (import '../alerts/general.libsonnet') + - (import '../alerts/node.libsonnet') + - (import '../rules/node-rules.libsonnet') + - (import '../rules/general.libsonnet') { - _config+:: { - nodeExporterSelector: 'job="node-exporter"', - hostNetworkInterfaceSelector: 'device!~"veth.+"', - }, - }, - - kubernetes: - (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') { - _config+:: { - cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', - kubeletSelector: 'job="kubelet", metrics_path="/metrics"', - kubeStateMetricsSelector: 'job="kube-state-metrics"', - nodeExporterSelector: 'job="node-exporter"', - kubeSchedulerSelector: 'job="kube-scheduler"', - kubeControllerManagerSelector: 'job="kube-controller-manager"', - kubeApiserverSelector: 'job="apiserver"', - podLabel: 'pod', - runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s', - diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', - hostNetworkInterfaceSelector: 'device!~"veth.+"', - }, - }, -} From 86d4571aeab3f49048c524028dc519943da1fa91 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Thu, 14 Jan 2021 18:38:43 +0100 Subject: [PATCH 3/7] jsonnet: remove grafana global state --- .../kube-prometheus/grafana/grafana.libsonnet | 85 +++++++++++++++++++ .../kube-prometheus/kube-prometheus.libsonnet | 59 ++----------- 2 files changed, 94 insertions(+), 50 deletions(-) create mode 100644 jsonnet/kube-prometheus/grafana/grafana.libsonnet diff --git a/jsonnet/kube-prometheus/grafana/grafana.libsonnet b/jsonnet/kube-prometheus/grafana/grafana.libsonnet new file mode 100644 index 00000000..d20c0b70 --- /dev/null +++ b/jsonnet/kube-prometheus/grafana/grafana.libsonnet @@ -0,0 +1,85 @@ +local defaults = { + local defaults = self, + name: 'grafana', + namespace: error 'must provide namespace', + version: error 'must provide version', + // image: error 'must provide image', + imageRepos: 'grafana/grafana', + resources: { + requests: { cpu: '100m', memory: '100Mi' }, + limits: { cpu: '200m', memory: '200Mi' }, + }, + commonLabels:: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'grafana', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + prometheusName: error 'must provide prometheus name', + dashboards: {}, +}; + +function(params) { + local g = self, + config:: defaults + params, + //local g.config = defaults + params, + // Safety check + assert std.isObject(g.config.resources), + + local glib = (import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + { + _config+:: { + namespace: g.config.namespace, + versions+:: { + grafana: g.config.version, + }, + imageRepos+:: { + grafana: g.config.imageRepos, + }, + prometheus+:: { + name: g.config.prometheusName, + }, + grafana+:: { + labels: g.config.commonLabels, + dashboards: g.config.dashboards, + resources: g.config.resources, + }, + }, + }, + + service: glib.grafana.service, + serviceAccount: glib.grafana.serviceAccount, + deployment: glib.grafana.deployment, + 
dashboardDatasources: glib.grafana.dashboardDatasources, + dashboardSources: glib.grafana.dashboardSources, + + dashboardDefinitions: if std.length(g.config.dashboards) > 0 then { + apiVersion: 'v1', + kind: 'ConfigMapList', + items: g.dashboardDefinitions, + }, + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'grafana', + namespace: g.config.namespace, + labels: g.config.commonLabels, + }, + spec: { + selector: { + matchLabels: { + app: 'grafana', + }, + }, + endpoints: [{ + port: 'http', + interval: '15s', + }], + }, + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 6d8710a8..709cbfe9 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -1,6 +1,7 @@ local alertmanager = import './alertmanager/alertmanager.libsonnet'; local blackboxExporter = import './blackbox-exporter/blackbox-exporter.libsonnet'; local customMixin = import './mixin/custom.libsonnet'; +local grafana = import './grafana/grafana.libsonnet'; local kubeStateMetrics = import './kube-state-metrics/kube-state-metrics.libsonnet'; local kubernetesMixin = import './mixin/kubernetes.libsonnet'; local nodeExporter = import './node-exporter/node-exporter.libsonnet'; @@ -9,8 +10,6 @@ local prometheusOperator = import './prometheus-operator/prometheus-operator.lib local prometheus = import './prometheus/prometheus.libsonnet'; local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet'; - -(import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + { alertmanager: alertmanager({ name: $._config.alertmanagerName, @@ -26,6 +25,13 @@ local prometheusOperator = import './prometheus-operator/prometheus-operator.lib version: '0.18.0', image: 'quay.io/prometheus/blackbox-exporter:v0.18.0', }), + grafana: grafana({ + namespace: $._config.namespace, + version: '7.3.5', + image: 'grafana/grafana:v7.3.7', + dashboards: {}, + prometheusName: $._config.prometheusName, + }), kubeStateMetrics: kubeStateMetrics({ namespace: $._config.namespace, version: '1.9.7', @@ -56,7 +62,7 @@ local prometheusOperator = import './prometheus-operator/prometheus-operator.lib namespace: $._config.namespace, version: '0.8.2', image: 'directxman12/k8s-prometheus-adapter:v0.8.2', - prometheusURL: 'http://prometheus-' + $._config.prometheus.name + '.' + $._config.namespace + '.svc.cluster.local:9090/', + prometheusURL: 'http://prometheus-' + $._config.prometheusName + '.' 
+ $._config.namespace + '.svc.cluster.local:9090/', }), prometheusOperator: prometheusOperator({ namespace: $._config.namespace, @@ -90,36 +96,6 @@ local prometheusOperator = import './prometheus-operator/prometheus-operator.lib }, }, }, - - grafana+:: { - local dashboardDefinitions = super.dashboardDefinitions, - - dashboardDefinitions: { - apiVersion: 'v1', - kind: 'ConfigMapList', - items: dashboardDefinitions, - }, - serviceMonitor: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'grafana', - namespace: $._config.namespace, - labels: $._config.grafana.labels, - }, - spec: { - selector: { - matchLabels: { - app: 'grafana', - }, - }, - endpoints: [{ - port: 'http', - interval: '15s', - }], - }, - }, - }, } + { _config+:: { namespace: 'default', @@ -129,22 +105,5 @@ local prometheusOperator = import './prometheus-operator/prometheus-operator.lib role: 'alert-rules', prometheus: $._config.prometheusName, }, - - versions+:: { grafana: '7.3.5' }, - - grafana+:: { - labels: { - 'app.kubernetes.io/name': 'grafana', - 'app.kubernetes.io/version': $._config.versions.grafana, - 'app.kubernetes.io/component': 'grafana', - 'app.kubernetes.io/part-of': 'kube-prometheus', - }, - // FIXME(paulfantom): Same as with rules and alerts. - // This should be gathering all dashboards from components without having to enumerate all dashboards. - dashboards: {}, - //$.mixins.nodeExporter.grafanaDashboards + - //$.mixins.kubernetes.grafanaDashboards, - //$.mixins.prometheus.grafanaDashboards, - }, }, } From 5624c5a9a8ac90f985a7624b1d98f93bd0eb1d73 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Fri, 15 Jan 2021 09:26:57 +0100 Subject: [PATCH 4/7] jsonnet: refactor the rest of mixins and grafana inclusion Signed-off-by: paulfantom --- example.jsonnet | 2 +- .../kube-prometheus/grafana/grafana.libsonnet | 2 +- .../kube-prometheus/kube-prometheus.libsonnet | 9 ++++++-- .../kube-prometheus/mixin/custom.libsonnet | 6 ++--- .../mixin/kubernetes.libsonnet | 22 +++++++++---------- 5 files changed, 23 insertions(+), 18 deletions(-) diff --git a/example.jsonnet b/example.jsonnet index 94a40b70..8cc37f9c 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -14,7 +14,7 @@ local kp = }, }; -{ ['setup/0namespace-namespace']: kp.kubePrometheus.namespace } + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) diff --git a/jsonnet/kube-prometheus/grafana/grafana.libsonnet b/jsonnet/kube-prometheus/grafana/grafana.libsonnet index d20c0b70..4dfe516c 100644 --- a/jsonnet/kube-prometheus/grafana/grafana.libsonnet +++ b/jsonnet/kube-prometheus/grafana/grafana.libsonnet @@ -60,7 +60,7 @@ function(params) { dashboardDefinitions: if std.length(g.config.dashboards) > 0 then { apiVersion: 'v1', kind: 'ConfigMapList', - items: g.dashboardDefinitions, + items: glib.grafana.dashboardDefinitions, }, serviceMonitor: { apiVersion: 'monitoring.coreos.com/v1', diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 709cbfe9..fbf5963d 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -11,6 +11,7 @@ local prometheus = import './prometheus/prometheus.libsonnet'; local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet'; { + 
local all = self, alertmanager: alertmanager({ name: $._config.alertmanagerName, namespace: $._config.namespace, @@ -25,12 +26,16 @@ local prometheusOperator = import './prometheus-operator/prometheus-operator.lib version: '0.18.0', image: 'quay.io/prometheus/blackbox-exporter:v0.18.0', }), + // TODO(paulfantom) This should be done by iterating over all objects and looking for object.mixin.grafanaDashboards + local allDashboards = $.nodeExporter.mixin.grafanaDashboards + + $.prometheus.mixin.grafanaDashboards + + $.kubernetesMixin.mixin.grafanaDashboards, grafana: grafana({ namespace: $._config.namespace, version: '7.3.5', image: 'grafana/grafana:v7.3.7', - dashboards: {}, prometheusName: $._config.prometheusName, + dashboards: allDashboards, }), kubeStateMetrics: kubeStateMetrics({ namespace: $._config.namespace, @@ -106,4 +111,4 @@ local prometheusOperator = import './prometheus-operator/prometheus-operator.lib prometheus: $._config.prometheusName, }, }, -} +} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/mixin/custom.libsonnet b/jsonnet/kube-prometheus/mixin/custom.libsonnet index d1c0b086..278ddc76 100644 --- a/jsonnet/kube-prometheus/mixin/custom.libsonnet +++ b/jsonnet/kube-prometheus/mixin/custom.libsonnet @@ -9,9 +9,9 @@ local defaults = { mixin: { ruleLabels: {}, _config: { - nodeExporterSelector: 'job="node-exporter"', - hostNetworkInterfaceSelector: 'device!~"veth.+"', - }, + nodeExporterSelector: 'job="node-exporter"', + hostNetworkInterfaceSelector: 'device!~"veth.+"', + }, }, }; diff --git a/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet b/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet index b3bc563e..f399a529 100644 --- a/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet +++ b/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet @@ -9,17 +9,17 @@ local defaults = { mixin: { ruleLabels: {}, _config: { - cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', - kubeletSelector: 'job="kubelet", metrics_path="/metrics"', - kubeStateMetricsSelector: 'job="kube-state-metrics"', - nodeExporterSelector: 'job="node-exporter"', - kubeSchedulerSelector: 'job="kube-scheduler"', - kubeControllerManagerSelector: 'job="kube-controller-manager"', - kubeApiserverSelector: 'job="apiserver"', - podLabel: 'pod', - runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s', - diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', - hostNetworkInterfaceSelector: 'device!~"veth.+"', + cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', + kubeletSelector: 'job="kubelet", metrics_path="/metrics"', + kubeStateMetricsSelector: 'job="kube-state-metrics"', + nodeExporterSelector: 'job="node-exporter"', + kubeSchedulerSelector: 'job="kube-scheduler"', + kubeControllerManagerSelector: 'job="kube-controller-manager"', + kubeApiserverSelector: 'job="apiserver"', + podLabel: 'pod', + runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s', + diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', + hostNetworkInterfaceSelector: 'device!~"veth.+"', }, }, }; From f1bd7af6576d728c037249be02e6a3d869273258 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Fri, 15 Jan 2021 11:07:32 +0100 Subject: [PATCH 5/7] jsonnet: helmize :) Signed-off-by: paulfantom --- example.jsonnet | 6 +- examples/kustomize.jsonnet | 17 +- .../kube-prometheus/kube-prometheus.libsonnet | 170 +++++++++--------- 3 files changed, 97 insertions(+), 96 deletions(-) diff --git 
a/example.jsonnet b/example.jsonnet index 8cc37f9c..c3c496b9 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -9,8 +9,10 @@ local kp = // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') + // (import 'kube-prometheus/kube-prometheus-external-metrics.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index 7b1cf6a2..875d3501 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -1,27 +1,32 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; local manifests = // Uncomment line below to enable vertical auto scaling of kube-state-metrics //{ ['ksm-autoscaler-' + name]: kp.ksmAutoscaler[name] for name in std.objectFields(kp.ksmAutoscaler) } + - { ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + + { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + - // serviceMonitor is separated so that it can be created after the CRDs are ready + // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + + { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + + { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + - { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + + { ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) }; local kustomizationResourceFile(name) = './manifests/' + name + '.yaml'; local kustomization = { diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index fbf5963d..d5346317 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -1,114 +1,108 @@ local alertmanager = import './alertmanager/alertmanager.libsonnet'; local blackboxExporter = import './blackbox-exporter/blackbox-exporter.libsonnet'; -local customMixin = import './mixin/custom.libsonnet'; local grafana = import './grafana/grafana.libsonnet'; local kubeStateMetrics = import './kube-state-metrics/kube-state-metrics.libsonnet'; +local customMixin = import 
'./mixin/custom.libsonnet'; local kubernetesMixin = import './mixin/kubernetes.libsonnet'; local nodeExporter = import './node-exporter/node-exporter.libsonnet'; local prometheusAdapter = import './prometheus-adapter/prometheus-adapter.libsonnet'; local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet'; local prometheus = import './prometheus/prometheus.libsonnet'; -local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet'; { - local all = self, - alertmanager: alertmanager({ - name: $._config.alertmanagerName, - namespace: $._config.namespace, - version: '0.21.0', - image: 'quay.io/prometheus/alertmanager:v0.21.0', - mixin+: { - ruleLabels: $._config.ruleLabels, + // using `values` as this is similar to helm + values:: { + common: { + namespace: 'default', + ruleLabels: { + role: 'alert-rules', + prometheus: $.values.prometheus.name, + }, }, - }), - blackboxExporter: blackboxExporter({ - namespace: $._config.namespace, - version: '0.18.0', - image: 'quay.io/prometheus/blackbox-exporter:v0.18.0', - }), - // TODO(paulfantom) This should be done by iterating over all objects and looking for object.mixin.grafanaDashboards - local allDashboards = $.nodeExporter.mixin.grafanaDashboards + - $.prometheus.mixin.grafanaDashboards + - $.kubernetesMixin.mixin.grafanaDashboards, - grafana: grafana({ - namespace: $._config.namespace, - version: '7.3.5', - image: 'grafana/grafana:v7.3.7', - prometheusName: $._config.prometheusName, - dashboards: allDashboards, - }), - kubeStateMetrics: kubeStateMetrics({ - namespace: $._config.namespace, - version: '1.9.7', - image: 'quay.io/coreos/kube-state-metrics:v1.9.7', - mixin+: { - ruleLabels: $._config.ruleLabels, + alertmanager: { + name: 'main', + namespace: $.values.common.namespace, + version: '0.21.0', + image: 'quay.io/prometheus/alertmanager:v0.21.0', + mixin+: { + ruleLabels: $.values.common.ruleLabels, + }, }, - }), - nodeExporter: nodeExporter({ - namespace: $._config.namespace, - version: '1.0.1', - image: 'quay.io/prometheus/node-exporter:v1.0.1', - mixin+: { - ruleLabels: $._config.ruleLabels, + blackboxExporter: { + namespace: $.values.common.namespace, + version: '0.18.0', + image: 'quay.io/prometheus/blackbox-exporter:v0.18.0', }, - }), - prometheus: prometheus({ - namespace: $._config.namespace, - version: '2.24.0', - image: 'quay.io/prometheus/prometheus:v2.24.0', - name: $._config.prometheusName, - alertmanagerName: $._config.alertmanagerName, - mixin+: { - ruleLabels: $._config.ruleLabels, + grafana: { + namespace: $.values.common.namespace, + version: '7.3.5', + image: 'grafana/grafana:v7.3.7', + prometheusName: $.values.prometheus.name, + // TODO(paulfantom) This should be done by iterating over all objects and looking for object.mixin.grafanaDashboards + dashboards: $.nodeExporter.mixin.grafanaDashboards + $.prometheus.mixin.grafanaDashboards + $.kubernetesMixin.mixin.grafanaDashboards, }, - }), - prometheusAdapter: prometheusAdapter({ - namespace: $._config.namespace, - version: '0.8.2', - image: 'directxman12/k8s-prometheus-adapter:v0.8.2', - prometheusURL: 'http://prometheus-' + $._config.prometheusName + '.' 
+ $._config.namespace + '.svc.cluster.local:9090/', - }), - prometheusOperator: prometheusOperator({ - namespace: $._config.namespace, - version: '0.45.0', - image: 'quay.io/prometheus-operator/prometheus-operator:v0.45.0', - configReloaderImage: 'quay.io/prometheus-operator/prometheus-config-reloader:v0.45.0', - commonLabels+: { - 'app.kubernetes.io/part-of': 'kube-prometheus', + kubeStateMetrics: { + namespace: $.values.common.namespace, + version: '1.9.7', + image: 'quay.io/coreos/kube-state-metrics:v1.9.7', + mixin+: { ruleLabels: $.values.common.ruleLabels }, }, - mixin+: { - ruleLabels: $._config.ruleLabels, + nodeExporter: { + namespace: $.values.common.namespace, + version: '1.0.1', + image: 'quay.io/prometheus/node-exporter:v1.0.1', + mixin+: { ruleLabels: $.values.common.ruleLabels }, }, - }), - kubernetesMixin: kubernetesMixin({ - namespace: $._config.namespace, - mixin+: { - ruleLabels: $._config.ruleLabels, + prometheus: { + namespace: $.values.common.namespace, + version: '2.24.0', + image: 'quay.io/prometheus/prometheus:v2.24.0', + name: 'k8s', + alertmanagerName: $.values.alertmanager.name, + mixin+: { ruleLabels: $.values.common.ruleLabels }, }, - }), - kubePrometheus: customMixin({ - namespace: $._config.namespace, - mixin+: { - ruleLabels: $._config.ruleLabels, + prometheusAdapter: { + namespace: $.values.common.namespace, + version: '0.8.2', + image: 'directxman12/k8s-prometheus-adapter:v0.8.2', + prometheusURL: 'http://prometheus-' + $.values.prometheus.name + '.' + $.values.common.namespace + '.svc.cluster.local:9090/', }, - }) + { + prometheusOperator: { + namespace: $.values.common.namespace, + version: '0.45.0', + image: 'quay.io/prometheus-operator/prometheus-operator:v0.45.0', + configReloaderImage: 'quay.io/prometheus-operator/prometheus-config-reloader:v0.45.0', + commonLabels+: { + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + }, + kubernetesMixin: { + namespace: $.values.common.namespace, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + }, + kubePrometheus: { + namespace: $.values.common.namespace, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + }, + }, + + alertmanager: alertmanager($.values.alertmanager), + blackboxExporter: blackboxExporter($.values.blackboxExporter), + grafana: grafana($.values.grafana), + kubeStateMetrics: kubeStateMetrics($.values.kubeStateMetrics), + nodeExporter: nodeExporter($.values.nodeExporter), + prometheus: prometheus($.values.prometheus), + prometheusAdapter: prometheusAdapter($.values.prometheusAdapter), + prometheusOperator: prometheusOperator($.values.prometheusOperator), + kubernetesMixin: kubernetesMixin($.values.kubernetesMixin), + kubePrometheus: customMixin($.values.kubePrometheus) + { namespace: { apiVersion: 'v1', kind: 'Namespace', metadata: { - name: $._config.namespace, + name: $.values.kubePrometheus.namespace, }, }, }, -} + { - _config+:: { - namespace: 'default', - prometheusName: 'k8s', - alertmanagerName: 'main', - ruleLabels: { - role: 'alert-rules', - prometheus: $._config.prometheusName, - }, - }, -} \ No newline at end of file +} From d4c48539a09a63bd34a45425f19e5e5a64d136c8 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Fri, 15 Jan 2021 11:49:43 +0100 Subject: [PATCH 6/7] jsonnet: fix too much nesting of data in PrometheusRules Signed-off-by: paulfantom --- jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet | 4 ++-- .../kube-state-metrics/kube-state-metrics.libsonnet | 4 ++-- 
jsonnet/kube-prometheus/mixin/custom.libsonnet | 4 ++-- jsonnet/kube-prometheus/mixin/kubernetes.libsonnet | 4 ++-- jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet | 4 ++-- .../prometheus-operator/prometheus-operator.libsonnet | 4 ++-- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 452a2cc2..5e992176 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -86,8 +86,8 @@ function(params) { namespace: am.config.namespace, }, spec: { - local r = if std.objectHasAll(am.mixin, 'prometheusRules') then am.mixin.prometheusRules else {}, - local a = if std.objectHasAll(am.mixin, 'prometheusAlerts') then am.mixin.prometheusAlerts else {}, + local r = if std.objectHasAll(am.mixin, 'prometheusRules') then am.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(am.mixin, 'prometheusAlerts') then am.mixin.prometheusAlerts.groups else [], groups: a + r, }, }, diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index c3236589..fdfe3bd5 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -59,8 +59,8 @@ function(params) (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube- namespace: ksm.config.namespace, }, spec: { - local r = if std.objectHasAll(ksm.mixin, 'prometheusRules') then ksm.mixin.prometheusRules else {}, - local a = if std.objectHasAll(ksm.mixin, 'prometheusAlerts') then ksm.mixin.prometheusAlerts else {}, + local r = if std.objectHasAll(ksm.mixin, 'prometheusRules') then ksm.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(ksm.mixin, 'prometheusAlerts') then ksm.mixin.prometheusAlerts.groups else [], groups: a + r, }, }, diff --git a/jsonnet/kube-prometheus/mixin/custom.libsonnet b/jsonnet/kube-prometheus/mixin/custom.libsonnet index 278ddc76..13c36332 100644 --- a/jsonnet/kube-prometheus/mixin/custom.libsonnet +++ b/jsonnet/kube-prometheus/mixin/custom.libsonnet @@ -34,8 +34,8 @@ function(params) { namespace: m.config.namespace, }, spec: { - local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules else {}, - local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts else {}, + local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else [], groups: a + r, }, }, diff --git a/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet b/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet index f399a529..f9e5791f 100644 --- a/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet +++ b/jsonnet/kube-prometheus/mixin/kubernetes.libsonnet @@ -41,8 +41,8 @@ function(params) { namespace: m.config.namespace, }, spec: { - local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules else {}, - local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts else {}, + local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else {}, + local a = if 
std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else {}, groups: a + r, }, }, diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 1c4321b1..c5b82498 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -54,8 +54,8 @@ function(params) { namespace: ne.config.namespace, }, spec: { - local r = if std.objectHasAll(ne.mixin, 'prometheusRules') then ne.mixin.prometheusRules else {}, - local a = if std.objectHasAll(ne.mixin, 'prometheusAlerts') then ne.mixin.prometheusAlerts else {}, + local r = if std.objectHasAll(ne.mixin, 'prometheusRules') then ne.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(ne.mixin, 'prometheusAlerts') then ne.mixin.prometheusAlerts.groups else [], groups: a + r, }, }, diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index b1497416..8114f91c 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -54,8 +54,8 @@ function(params) namespace: config.namespace, }, spec: { - local r = if std.objectHasAll(po.mixin, 'prometheusRules') then po.mixin.prometheusRules else {}, - local a = if std.objectHasAll(po.mixin, 'prometheusAlerts') then po.mixin.prometheusAlerts else {}, + local r = if std.objectHasAll(po.mixin, 'prometheusRules') then po.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(po.mixin, 'prometheusAlerts') then po.mixin.prometheusAlerts.groups else [], groups: a + r, }, }, diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 1673c9e7..76a251ff 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -60,8 +60,8 @@ function(params) { namespace: p.config.namespace, }, spec: { - local r = if std.objectHasAll(p.mixin, 'prometheusRules') then p.mixin.prometheusRules else {}, - local a = if std.objectHasAll(p.mixin, 'prometheusAlerts') then p.mixin.prometheusAlerts else {}, + local r = if std.objectHasAll(p.mixin, 'prometheusRules') then p.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(p.mixin, 'prometheusAlerts') then p.mixin.prometheusAlerts.groups else [], groups: a + r, }, }, From 092b22d62bd9b0a8cc2fb14c793c42d4e3964de8 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Fri, 15 Jan 2021 11:58:04 +0100 Subject: [PATCH 7/7] regenerate --- README.md | 17 +- ...prometheus-rules-and-grafana-dashboards.md | 17 +- kustomization.yaml | 8 +- manifests/alertmanager-prometheusRule.yaml | 116 + manifests/grafana-deployment.yaml | 2 +- manifests/kube-prometheus-prometheusRule.yaml | 63 + .../kube-state-metrics-prometheusRule.yaml | 40 + ...es.yaml => kubernetes-prometheusRule.yaml} | 2071 ++++++----------- manifests/node-exporter-prometheusRule.yaml | 266 +++ .../prometheus-operator-prometheusRule.yaml | 79 + manifests/prometheus-prometheusRule.yaml | 213 ++ 11 files changed, 1479 insertions(+), 1413 deletions(-) create mode 100644 manifests/alertmanager-prometheusRule.yaml create mode 100644 manifests/kube-prometheus-prometheusRule.yaml create mode 100644 manifests/kube-state-metrics-prometheusRule.yaml rename 
manifests/{prometheus-rules.yaml => kubernetes-prometheusRule.yaml} (62%) create mode 100644 manifests/node-exporter-prometheusRule.yaml create mode 100644 manifests/prometheus-operator-prometheusRule.yaml create mode 100644 manifests/prometheus-prometheusRule.yaml diff --git a/README.md b/README.md index 8e6d6694..b0aab969 100644 --- a/README.md +++ b/README.md @@ -217,25 +217,30 @@ local kp = // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') + // (import 'kube-prometheus/kube-prometheus-external-metrics.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; -{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + -// serviceMonitor is separated so that it can be created after the CRDs are ready +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) } ``` And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json structure of `{filename: manifest-content}`): diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index f9decdcd..b6c9f978 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -21,25 +21,30 @@ local kp = // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') + // (import 'kube-prometheus/kube-prometheus-external-metrics.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; -{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), 
std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + -// serviceMonitor is separated so that it can be created after the CRDs are ready +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) } ``` ## Prometheus rules diff --git a/kustomization.yaml b/kustomization.yaml index 7066018a..2ebd021b 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -2,6 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - ./manifests/alertmanager-alertmanager.yaml +- ./manifests/alertmanager-prometheusRule.yaml - ./manifests/alertmanager-secret.yaml - ./manifests/alertmanager-service.yaml - ./manifests/alertmanager-serviceAccount.yaml @@ -20,15 +21,19 @@ resources: - ./manifests/grafana-service.yaml - ./manifests/grafana-serviceAccount.yaml - ./manifests/grafana-serviceMonitor.yaml +- ./manifests/kube-prometheus-prometheusRule.yaml - ./manifests/kube-state-metrics-clusterRole.yaml - ./manifests/kube-state-metrics-clusterRoleBinding.yaml - ./manifests/kube-state-metrics-deployment.yaml +- ./manifests/kube-state-metrics-prometheusRule.yaml - ./manifests/kube-state-metrics-service.yaml - ./manifests/kube-state-metrics-serviceAccount.yaml - ./manifests/kube-state-metrics-serviceMonitor.yaml +- ./manifests/kubernetes-prometheusRule.yaml - ./manifests/node-exporter-clusterRole.yaml - ./manifests/node-exporter-clusterRoleBinding.yaml - ./manifests/node-exporter-daemonset.yaml +- ./manifests/node-exporter-prometheusRule.yaml - ./manifests/node-exporter-service.yaml - ./manifests/node-exporter-serviceAccount.yaml - ./manifests/node-exporter-serviceMonitor.yaml @@ -46,13 +51,14 @@ resources: - ./manifests/prometheus-adapter-serviceMonitor.yaml - ./manifests/prometheus-clusterRole.yaml - ./manifests/prometheus-clusterRoleBinding.yaml +- ./manifests/prometheus-operator-prometheusRule.yaml - ./manifests/prometheus-operator-serviceMonitor.yaml - ./manifests/prometheus-prometheus.yaml +- ./manifests/prometheus-prometheusRule.yaml - ./manifests/prometheus-roleBindingConfig.yaml - ./manifests/prometheus-roleBindingSpecificNamespaces.yaml - ./manifests/prometheus-roleConfig.yaml - ./manifests/prometheus-roleSpecificNamespaces.yaml -- ./manifests/prometheus-rules.yaml - 
./manifests/prometheus-service.yaml - ./manifests/prometheus-serviceAccount.yaml - ./manifests/prometheus-serviceMonitor.yaml diff --git a/manifests/alertmanager-prometheusRule.yaml b/manifests/alertmanager-prometheusRule.yaml new file mode 100644 index 00000000..ea78ad11 --- /dev/null +++ b/manifests/alertmanager-prometheusRule.yaml @@ -0,0 +1,116 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.21.0 + prometheus: k8s + role: alert-rules + name: main-rules + namespace: monitoring +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}. + summary: Reloading an Alertmanager configuration has failed. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster. + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])) + for: 10m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. + summary: An Alertmanager instance failed to send notifications. + expr: | + ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + summary: All Alertmanager instances in a cluster failed to send notifications. + expr: | + min by (namespace,service) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. + summary: Alertmanager instances within the same cluster have different configurations. 
+ expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. + expr: | + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 85d3f68d..d6bb77da 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -13,7 +13,7 @@ spec: template: metadata: annotations: - checksum/grafana-dashboards: a9e19e1ab605dc374f30edda771e6917 + checksum/grafana-dashboards: b02ae450c84445cbaca8c685eefaec6c checksum/grafana-datasources: 48faab41f579fc8efde6034391496f6a labels: app: grafana diff --git a/manifests/kube-prometheus-prometheusRule.yaml b/manifests/kube-prometheus-prometheusRule.yaml new file mode 100644 index 00000000..26e7da58 --- /dev/null +++ b/manifests/kube-prometheus-prometheusRule.yaml @@ -0,0 +1,63 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + role: alert-rules + name: kube-prometheus-rules + namespace: monitoring +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' + expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + message: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. 
+ expr: vector(1) + labels: + severity: none + - name: node-network + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kube-prometheus-general.rules + rules: + - expr: count without(instance, pod, node) (up == 1) + record: count:up1 + - expr: count without(instance, pod, node) (up == 0) + record: count:up0 diff --git a/manifests/kube-state-metrics-prometheusRule.yaml b/manifests/kube-state-metrics-prometheusRule.yaml new file mode 100644 index 00000000..28c9ec05 --- /dev/null +++ b/manifests/kube-state-metrics-prometheusRule.yaml @@ -0,0 +1,40 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.9.7 + prometheus: k8s + role: alert-rules + name: kube-state-metrics-rules + namespace: monitoring +spec: + groups: + - name: kube-state-metrics + rules: + - alert: KubeStateMetricsListErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + summary: kube-state-metrics is experiencing errors in list operations. + expr: | + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + summary: kube-state-metrics is experiencing errors in watch operations. 
+ expr: | + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical diff --git a/manifests/prometheus-rules.yaml b/manifests/kubernetes-prometheusRule.yaml similarity index 62% rename from manifests/prometheus-rules.yaml rename to manifests/kubernetes-prometheusRule.yaml index fd56b0aa..d683cff6 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/kubernetes-prometheusRule.yaml @@ -2,73 +2,688 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: prometheus + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 2.24.0 prometheus: k8s role: alert-rules - name: prometheus-k8s-rules + name: kubernetes-rules namespace: monitoring spec: groups: - - name: node-exporter.rules + - name: kubernetes-apps rules: - - expr: | - count without (cpu) ( - count without (mode) ( - node_cpu_seconds_total{job="node-exporter"} + - alert: KubePodCrashLooping + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping + summary: Pod is crash looping. + expr: | + rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: | + sum by (namespace, pod) ( + max by(namespace, pod) ( + kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} + ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( + 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) ) - ) - record: instance:node_num_cpu:sum - - expr: | - 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) - ) - record: instance:node_cpu_utilisation:rate1m - - expr: | + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. 
+ expr: | ( - node_load1{job="node-exporter"} + kube_deployment_spec_replicas{job="kube-state-metrics"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: | + ( + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. 
+ expr: | + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) + ) and ( + changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeContainerWaiting + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: | + sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobcompletion + summary: Job did not complete in time + expr: | + kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + for: 12h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobfailed + summary: Job failed to complete. + expr: | + kube_job_failed{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpareplicasmismatch + summary: HPA has not matched descired number of replicas. 
+ expr: | + (kube_hpa_status_desired_replicas{job="kube-state-metrics"} + != + kube_hpa_status_current_replicas{job="kube-state-metrics"}) + and + (kube_hpa_status_current_replicas{job="kube-state-metrics"} + > + kube_hpa_spec_min_replicas{job="kube-state-metrics"}) + and + (kube_hpa_status_current_replicas{job="kube-state-metrics"} + < + kube_hpa_spec_max_replicas{job="kube-state-metrics"}) + and + changes(kube_hpa_status_current_replicas[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpamaxedout + summary: HPA is running at max replicas + expr: | + kube_hpa_status_current_replicas{job="kube-state-metrics"} + == + kube_hpa_spec_max_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) + / + sum(kube_node_status_allocatable_cpu_cores) + > + (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores) + for: 5m + labels: + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) + / + sum(kube_node_status_allocatable_memory_bytes) + > + (count(kube_node_status_allocatable_memory_bytes)-1) + / + count(kube_node_status_allocatable_memory_bytes) + for: 5m + labels: + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) + / + sum(kube_node_status_allocatable_cpu_cores) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) + / + sum(kube_node_status_allocatable_memory_bytes{job="kube-state-metrics"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaalmostfull + summary: Namespace quota is going to be full. 
+ expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotafullyused + summary: Namespace quota is fully used. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaexceeded + summary: Namespace quota has exceeded the limits. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. + expr: | + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) + > ( 25 / 100 ) + for: 15m + labels: + severity: info + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + < 0.03 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. 
+ runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeVersionMismatch + annotations: + description: There are {{ $value }} different semantic versions of Kubernetes components running. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch + summary: Different semantic versions of Kubernetes components running. + expr: | + count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors + summary: Kubernetes API server client is experiencing errors. + expr: | + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + > 0.01 + for: 15m + labels: + severity: warning + - name: kube-apiserver-slos + rules: + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m + labels: + long: 1h + severity: critical + short: 5m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + long: 6h + severity: critical + short: 30m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h + labels: + long: 1d + severity: warning + short: 2h + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h + labels: + long: 3d + severity: warning + short: 6h + - name: kubernetes-system-apiserver + rules: + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. 
+ runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical + - alert: AggregatedAPIErrors + annotations: + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors + summary: An aggregated API has reported errors. + expr: | + sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 + labels: + severity: warning + - alert: AggregatedAPIDown + annotations: + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown + summary: An aggregated API is down. + expr: | + (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + for: 5m + labels: + severity: warning + - alert: KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady + annotations: + description: '{{ $labels.node }} has been unready for more than 15 minutes.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready + summary: Node is not ready. + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + - alert: KubeNodeUnreachable + annotations: + description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable + summary: Node is unreachable. 
+ expr: | + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods + summary: Kubelet is running at capacity. + expr: | + count by(node) ( + (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + ) / - instance:node_num_cpu:sum{job="node-exporter"} - ) - record: instance:node_load1_per_cpu:ratio - - expr: | - 1 - ( - node_memory_MemAvailable_bytes{job="node-exporter"} - / - node_memory_MemTotal_bytes{job="node-exporter"} - ) - record: instance:node_memory_utilisation:ratio - - expr: | - rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) - record: instance:node_vmstat_pgmajfault:rate1m - - expr: | - rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_seconds:rate1m - - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_receive_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_receive_drop_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_drop_excluding_lo:rate1m + max by(node) ( + kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 + ) > 0.95 + for: 15m + labels: + severity: warning + - alert: KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping + summary: Node readiness status is flapping. + expr: | + sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 + for: 15m + labels: + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. 
+ expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh + summary: Kubelet Pod startup latency is too high. + expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletClientCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors + summary: Kubelet has failed to renew its client certificate. + expr: | + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletServerCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors + summary: Kubelet has failed to renew its server certificate. 
+ expr: | + increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletDown + annotations: + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-scheduler + rules: + - alert: KubeSchedulerDown + annotations: + description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeschedulerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-controller-manager + rules: + - alert: KubeControllerManagerDown + annotations: + description: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontrollermanagerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical - name: kube-apiserver.rules rules: - expr: | @@ -750,1345 +1365,3 @@ spec: labels: quantile: "0.5" record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) - record: instance:node_cpu:rate:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - record: cluster:node_cpu:ratio - - name: kube-prometheus-general.rules - rules: - - expr: count without(instance, pod, node) (up == 1) - record: count:up1 - - expr: count without(instance, pod, node) (up == 0) - record: count:up0 - - name: kube-state-metrics - rules: - - alert: KubeStateMetricsListErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors - summary: kube-state-metrics is experiencing errors in list operations. - expr: | - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsWatchErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. 
This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors - summary: kube-state-metrics is experiencing errors in watch operations. - expr: | - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - name: node-exporter - rules: - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 4 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace - summary: Filesystem has less than 5% space left. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace - summary: Filesystem has less than 3% space left. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. 
- runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles - summary: Filesystem has less than 5% inodes left. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles - summary: Filesystem has less than 3% inodes left. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs - summary: Network interface is reporting many receive errors. - expr: | - rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs - summary: Network interface is reporting many transmit errors. 
- expr: | - rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused - summary: Number of conntrack are getting close to the limit. - expr: | - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning - - alert: NodeTextFileCollectorScrapeError - annotations: - description: Node Exporter text file collector failed to scrape. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror - summary: Node Exporter text file collector failed to scrape. - expr: | - node_textfile_scrape_error{job="node-exporter"} == 1 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected - summary: Clock skew detected. - expr: | - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising - summary: Clock not synchronising. - expr: | - min_over_time(node_timex_sync_status[5m]) == 0 - and - node_timex_maxerror_seconds >= 16 - for: 10m - labels: - severity: warning - - alert: NodeRAIDDegraded - annotations: - description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded - summary: RAID Array is degraded - expr: | - node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 - for: 15m - labels: - severity: critical - - alert: NodeRAIDDiskFailure - annotations: - description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure - summary: Failed device in RAID array - expr: | - node_md_disks{state="fail"} > 0 - labels: - severity: warning - - name: alertmanager.rules - rules: - - alert: AlertmanagerFailedReload - annotations: - description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedreload - summary: Reloading an Alertmanager configuration has failed. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 - for: 10m - labels: - severity: critical - - alert: AlertmanagerMembersInconsistent - annotations: - description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagermembersinconsistent - summary: A member of an Alertmanager cluster has not found all other cluster members. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]) - < on (namespace,service) group_left - count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])) - for: 10m - labels: - severity: critical - - alert: AlertmanagerFailedToSendAlerts - annotations: - description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedtosendalerts - summary: An Alertmanager instance failed to send notifications. - expr: | - ( - rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) - / - rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: warning - - alert: AlertmanagerClusterFailedToSendAlerts - annotations: - description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts - summary: All Alertmanager instances in a cluster failed to send notifications. - expr: | - min by (namespace,service) ( - rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) - / - rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: critical - - alert: AlertmanagerConfigInconsistent - annotations: - description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerconfiginconsistent - summary: Alertmanager instances within the same cluster have different configurations. - expr: | - count by (namespace,service) ( - count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) - ) - != 1 - for: 20m - labels: - severity: critical - - alert: AlertmanagerClusterDown - annotations: - description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterdown - summary: Half or more of the Alertmanager instances within the same cluster are down. 
- expr: | - ( - count by (namespace,service) ( - avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5 - ) - / - count by (namespace,service) ( - up{job="alertmanager-main",namespace="monitoring"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical - - alert: AlertmanagerClusterCrashlooping - annotations: - description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclustercrashlooping - summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. - expr: | - ( - count by (namespace,service) ( - changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4 - ) - / - count by (namespace,service) ( - up{job="alertmanager-main",namespace="monitoring"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical - - name: prometheus-operator - rules: - - alert: PrometheusOperatorListErrors - annotations: - description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorlisterrors - summary: Errors while performing list operations in controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorWatchErrors - annotations: - description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorwatcherrors - summary: Errors while performing watch operations in controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorSyncFailed - annotations: - description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorsyncfailed - summary: Last controller reconciliation failed - expr: | - min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorReconcileErrors - annotations: - description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorreconcileerrors - summary: Errors while reconciling controller. 
- expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNodeLookupErrors - annotations: - description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornodelookuperrors - summary: Errors while reconciling Prometheus. - expr: | - rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNotReady - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornotready - summary: Prometheus operator not ready - expr: | - min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) - for: 5m - labels: - severity: warning - - alert: PrometheusOperatorRejectedResources - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorrejectedresources - summary: Resources rejected by Prometheus operator - expr: | - min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 - for: 5m - labels: - severity: warning - - name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping - summary: Pod is crash looping. - expr: | - rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 - for: 15m - labels: - severity: warning - - alert: KubePodNotReady - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready - summary: Pod has been in a non-ready state for more than 15 minutes. - expr: | - sum by (namespace, pod) ( - max by(namespace, pod) ( - kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( - 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) - ) - ) > 0 - for: 15m - labels: - severity: warning - - alert: KubeDeploymentGenerationMismatch - annotations: - description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. 
- runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch - summary: Deployment generation mismatch due to possible roll-back - expr: | - kube_deployment_status_observed_generation{job="kube-state-metrics"} - != - kube_deployment_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeDeploymentReplicasMismatch - annotations: - description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: | - ( - kube_deployment_spec_replicas{job="kube-state-metrics"} - != - kube_deployment_status_replicas_available{job="kube-state-metrics"} - ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetReplicasMismatch - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: | - ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics"} - != - kube_statefulset_status_replicas{job="kube-state-metrics"} - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetGenerationMismatch - annotations: - description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch - summary: StatefulSet generation mismatch due to possible roll-back - expr: | - kube_statefulset_status_observed_generation{job="kube-state-metrics"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout - summary: StatefulSet update has not been rolled out. - expr: | - ( - max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics"} - ) - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeDaemonSetRolloutStuck - annotations: - description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck - summary: DaemonSet rollout is stuck. 
- expr: | - ( - ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} - != - 0 - ) or ( - kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) - ) and ( - changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeContainerWaiting - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontainerwaiting - summary: Pod container waiting longer than 1 hour - expr: | - sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 - for: 1h - labels: - severity: warning - - alert: KubeDaemonSetNotScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetnotscheduled - summary: DaemonSet pods are not scheduled. - expr: | - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 - for: 10m - labels: - severity: warning - - alert: KubeDaemonSetMisScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetmisscheduled - summary: DaemonSet pods are misscheduled. - expr: | - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeJobCompletion - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobcompletion - summary: Job did not complete in time - expr: | - kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 - for: 12h - labels: - severity: warning - - alert: KubeJobFailed - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobfailed - summary: Job failed to complete. - expr: | - kube_job_failed{job="kube-state-metrics"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaReplicasMismatch - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpareplicasmismatch - summary: HPA has not matched descired number of replicas. 
- expr: | - (kube_hpa_status_desired_replicas{job="kube-state-metrics"} - != - kube_hpa_status_current_replicas{job="kube-state-metrics"}) - and - (kube_hpa_status_current_replicas{job="kube-state-metrics"} - > - kube_hpa_spec_min_replicas{job="kube-state-metrics"}) - and - (kube_hpa_status_current_replicas{job="kube-state-metrics"} - < - kube_hpa_spec_max_replicas{job="kube-state-metrics"}) - and - changes(kube_hpa_status_current_replicas[15m]) == 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaMaxedOut - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpamaxedout - summary: HPA is running at max replicas - expr: | - kube_hpa_status_current_replicas{job="kube-state-metrics"} - == - kube_hpa_spec_max_replicas{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: | - sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) - / - sum(kube_node_status_allocatable_cpu_cores) - > - (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores) - for: 5m - labels: - severity: warning - - alert: KubeMemoryOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. - expr: | - sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) - / - sum(kube_node_status_allocatable_memory_bytes) - > - (count(kube_node_status_allocatable_memory_bytes)-1) - / - count(kube_node_status_allocatable_memory_bytes) - for: 5m - labels: - severity: warning - - alert: KubeCPUQuotaOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuquotaovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) - / - sum(kube_node_status_allocatable_cpu_cores) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeMemoryQuotaOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryquotaovercommit - summary: Cluster has overcommitted memory resource requests. - expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) - / - sum(kube_node_status_allocatable_memory_bytes{job="kube-state-metrics"}) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeQuotaAlmostFull - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaalmostfull - summary: Namespace quota is going to be full. 
- expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 0.9 < 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaFullyUsed - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotafullyused - summary: Namespace quota is fully used. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - == 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaExceeded - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaexceeded - summary: Namespace quota has exceeded the limits. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 1 - for: 15m - labels: - severity: warning - - alert: CPUThrottlingHigh - annotations: - description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/cputhrottlinghigh - summary: Processes experience elevated CPU throttling. - expr: | - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) - > ( 25 / 100 ) - for: 15m - labels: - severity: info - - name: kubernetes-storage - rules: - - alert: KubePersistentVolumeFillingUp - annotations: - description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: | - kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} - < 0.03 - for: 1m - labels: - severity: critical - - alert: KubePersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: | - ( - kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} - ) < 0.15 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 - for: 1h - labels: - severity: warning - - alert: KubePersistentVolumeErrors - annotations: - description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. 
- runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors - summary: PersistentVolume is having issues with provisioning. - expr: | - kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m - labels: - severity: critical - - name: kubernetes-system - rules: - - alert: KubeVersionMismatch - annotations: - description: There are {{ $value }} different semantic versions of Kubernetes components running. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch - summary: Different semantic versions of Kubernetes components running. - expr: | - count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 - for: 15m - labels: - severity: warning - - alert: KubeClientErrors - annotations: - description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors - summary: Kubernetes API server client is experiencing errors. - expr: | - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) - / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) - > 0.01 - for: 15m - labels: - severity: warning - - name: kube-apiserver-slos - rules: - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) - and - sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m - labels: - long: 1h - severity: critical - short: 5m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) - and - sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) - for: 15m - labels: - long: 6h - severity: critical - short: 30m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) - and - sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h - labels: - long: 1d - severity: warning - short: 2h - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) - and - sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h - labels: - long: 3d - severity: warning - short: 6h - - name: kubernetes-system-apiserver - rules: - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. 
- runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 - labels: - severity: warning - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 - labels: - severity: critical - - alert: AggregatedAPIErrors - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors - summary: An aggregated API has reported errors. - expr: | - sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 - labels: - severity: warning - - alert: AggregatedAPIDown - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown - summary: An aggregated API is down. - expr: | - (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 - for: 5m - labels: - severity: warning - - alert: KubeAPIDown - annotations: - description: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="apiserver"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-kubelet - rules: - - alert: KubeNodeNotReady - annotations: - description: '{{ $labels.node }} has been unready for more than 15 minutes.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready - summary: Node is not ready. - expr: | - kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - for: 15m - labels: - severity: warning - - alert: KubeNodeUnreachable - annotations: - description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable - summary: Node is unreachable. 
- expr: | - (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m - labels: - severity: warning - - alert: KubeletTooManyPods - annotations: - description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods - summary: Kubelet is running at capacity. - expr: | - count by(node) ( - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) - ) - / - max by(node) ( - kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 - ) > 0.95 - for: 15m - labels: - severity: warning - - alert: KubeNodeReadinessFlapping - annotations: - description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping - summary: Node readiness status is flapping. - expr: | - sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 - for: 15m - labels: - severity: warning - - alert: KubeletPlegDurationHigh - annotations: - description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh - summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. - expr: | - node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m - labels: - severity: warning - - alert: KubeletPodStartUpLatencyHigh - annotations: - description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh - summary: Kubelet Pod startup latency is too high. - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 - for: 15m - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: | - kubelet_certificate_manager_client_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. 
- expr: | - kubelet_certificate_manager_client_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletClientCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors - summary: Kubelet has failed to renew its client certificate. - expr: | - increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: KubeletServerCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors - summary: Kubelet has failed to renew its server certificate. - expr: | - increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: KubeletDown - annotations: - description: Kubelet has disappeared from Prometheus target discovery. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kubelet", metrics_path="/metrics"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-scheduler - rules: - - alert: KubeSchedulerDown - annotations: - description: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeschedulerdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kube-scheduler"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-controller-manager - rules: - - alert: KubeControllerManagerDown - annotations: - description: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontrollermanagerdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kube-controller-manager"} == 1) - for: 15m - labels: - severity: critical - - name: prometheus - rules: - - alert: PrometheusBadConfig - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. - summary: Failed Prometheus configuration reload. 
- expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 - for: 10m - labels: - severity: critical - - alert: PrometheusNotificationQueueRunningFull - annotations: - description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. - summary: Prometheus alert notification queue predicted to run full in less than 30m. - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) - > - min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers - annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. - expr: | - ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - * 100 - > 1 - for: 15m - labels: - severity: warning - - alert: PrometheusNotConnectedToAlertmanagers - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. - summary: Prometheus is not connected to any Alertmanagers. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 - for: 10m - labels: - severity: warning - - alert: PrometheusTSDBReloadsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. - summary: Prometheus has issues reloading blocks from disk. - expr: | - increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusTSDBCompactionsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. - summary: Prometheus has issues compacting blocks. - expr: | - increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusNotIngestingSamples - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. - summary: Prometheus is not ingesting samples. 
- expr: | - ( - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 - and - ( - sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0 - or - sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0 - ) - ) - for: 10m - labels: - severity: warning - - alert: PrometheusDuplicateTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. - summary: Prometheus is dropping samples with duplicate timestamps. - expr: | - rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOutOfOrderTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. - summary: Prometheus drops samples with out-of-order timestamps. - expr: | - rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusRemoteStorageFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} - summary: Prometheus fails to send samples to remote storage. - expr: | - ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - + - rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - ) - * 100 - > 1 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteBehind - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. - summary: Prometheus remote write is behind. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - - ignoring(remote_name, url) group_right - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - > 120 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteDesiredShards - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. - summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- ( - max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) - > - max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusRuleFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. - summary: Prometheus is failing rule evaluations. - expr: | - increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m - labels: - severity: critical - - alert: PrometheusMissingRuleEvaluations - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. - summary: Prometheus is missing rule evaluations due to slow rule group evaluation. - expr: | - increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusTargetLimitHit - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. - summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. - expr: | - increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: | - min without (alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical - - name: general.rules - rules: - - alert: TargetDown - annotations: - message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' - expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 - for: 10m - labels: - severity: warning - - alert: Watchdog - annotations: - message: | - This is an alert meant to ensure that the entire alerting pipeline is functional. - This alert is always firing, therefore it should always be firing in Alertmanager - and always fire against a receiver. There are integrations with various notification - mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSnitch" integration in PagerDuty. 
- expr: vector(1) - labels: - severity: none - - name: node-network - rules: - - alert: NodeNetworkInterfaceFlapping - annotations: - message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" - expr: | - changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m - labels: - severity: warning diff --git a/manifests/node-exporter-prometheusRule.yaml b/manifests/node-exporter-prometheusRule.yaml new file mode 100644 index 00000000..eee95a1a --- /dev/null +++ b/manifests/node-exporter-prometheusRule.yaml @@ -0,0 +1,266 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.0.1 + prometheus: k8s + role: alert-rules + name: node-exporter-rules + namespace: monitoring +spec: + groups: + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + summary: Filesystem has less than 5% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + summary: Filesystem has less than 3% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. 
+        summary: Filesystem is predicted to run out of inodes within the next 24 hours.
+      expr: |
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
+        and
+          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: warning
+    - alert: NodeFilesystemFilesFillingUp
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
+        summary: Filesystem is predicted to run out of inodes within the next 4 hours.
+      expr: |
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
+        and
+          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: critical
+    - alert: NodeFilesystemAlmostOutOfFiles
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
+        summary: Filesystem has less than 5% inodes left.
+      expr: |
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: warning
+    - alert: NodeFilesystemAlmostOutOfFiles
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
+        summary: Filesystem has less than 3% inodes left.
+      expr: |
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: critical
+    - alert: NodeNetworkReceiveErrs
+      annotations:
+        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
+        summary: Network interface is reporting many receive errors.
+      expr: |
+        rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
+      for: 1h
+      labels:
+        severity: warning
+    - alert: NodeNetworkTransmitErrs
+      annotations:
+        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
+        summary: Network interface is reporting many transmit errors.
+      expr: |
+        rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
+      for: 1h
+      labels:
+        severity: warning
+    - alert: NodeHighNumberConntrackEntriesUsed
+      annotations:
+        description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
+        summary: Number of conntrack entries are getting close to the limit.
+      expr: |
+        (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+      labels:
+        severity: warning
+    - alert: NodeTextFileCollectorScrapeError
+      annotations:
+        description: Node Exporter text file collector failed to scrape.
+        summary: Node Exporter text file collector failed to scrape.
+      expr: |
+        node_textfile_scrape_error{job="node-exporter"} == 1
+      labels:
+        severity: warning
+    - alert: NodeClockSkewDetected
+      annotations:
+        message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
+        summary: Clock skew detected.
+      expr: |
+        (
+          node_timex_offset_seconds > 0.05
+        and
+          deriv(node_timex_offset_seconds[5m]) >= 0
+        )
+        or
+        (
+          node_timex_offset_seconds < -0.05
+        and
+          deriv(node_timex_offset_seconds[5m]) <= 0
+        )
+      for: 10m
+      labels:
+        severity: warning
+    - alert: NodeClockNotSynchronising
+      annotations:
+        message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
+        summary: Clock not synchronising.
+      expr: |
+        min_over_time(node_timex_sync_status[5m]) == 0
+        and
+        node_timex_maxerror_seconds >= 16
+      for: 10m
+      labels:
+        severity: warning
+    - alert: NodeRAIDDegraded
+      annotations:
+        description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disk failures. Number of spare drives is insufficient to fix issue automatically.
+        summary: RAID Array is degraded
+      expr: |
+        node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+      for: 15m
+      labels:
+        severity: critical
+    - alert: NodeRAIDDiskFailure
+      annotations:
+        description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
+        summary: Failed device in RAID array
+      expr: |
+        node_md_disks{state="fail"} > 0
+      labels:
+        severity: warning
+  - name: node-exporter.rules
+    rules:
+    - expr: |
+        count without (cpu) (
+          count without (mode) (
+            node_cpu_seconds_total{job="node-exporter"}
+          )
+        )
+      record: instance:node_num_cpu:sum
+    - expr: |
+        1 - avg without (cpu, mode) (
+          rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
+        )
+      record: instance:node_cpu_utilisation:rate1m
+    - expr: |
+        (
+          node_load1{job="node-exporter"}
+        /
+          instance:node_num_cpu:sum{job="node-exporter"}
+        )
+      record: instance:node_load1_per_cpu:ratio
+    - expr: |
+        1 - (
+          node_memory_MemAvailable_bytes{job="node-exporter"}
+        /
+          node_memory_MemTotal_bytes{job="node-exporter"}
+        )
+      record: instance:node_memory_utilisation:ratio
+    - expr: |
+        rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
+      record: instance:node_vmstat_pgmajfault:rate1m
+    - expr: |
+        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
+      record: instance_device:node_disk_io_time_seconds:rate1m
+    - expr: |
+        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
+      record: instance_device:node_disk_io_time_weighted_seconds:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_receive_bytes_excluding_lo:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_transmit_bytes_excluding_lo:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_receive_drop_excluding_lo:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_transmit_drop_excluding_lo:rate1m
diff --git a/manifests/prometheus-operator-prometheusRule.yaml b/manifests/prometheus-operator-prometheusRule.yaml
new file mode 100644
index 00000000..c1f85086
--- /dev/null
+++ b/manifests/prometheus-operator-prometheusRule.yaml
@@ -0,0 +1,79 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    app.kubernetes.io/component: controller
+    app.kubernetes.io/name: prometheus-operator
+    app.kubernetes.io/part-of: kube-prometheus
+    app.kubernetes.io/version: 0.45.0
+    prometheus: k8s
+    role: alert-rules
+  name: prometheus-operator-rules
+  namespace: monitoring
spec:
+  groups:
+  - name: prometheus-operator
+    rules:
+    - alert: PrometheusOperatorListErrors
+      annotations:
+        description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
+        summary: Errors while performing list operations in controller.
+      expr: |
+        (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorWatchErrors
+      annotations:
+        description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
+        summary: Errors while performing watch operations in controller.
+      expr: |
+        (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorSyncFailed
+      annotations:
+        description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
+        summary: Last controller reconciliation failed
+      expr: |
+        min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorReconcileErrors
+      annotations:
+        description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
+        summary: Errors while reconciling controller.
+      expr: |
+        (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorNodeLookupErrors
+      annotations:
+        description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
+        summary: Errors while reconciling Prometheus.
+      expr: |
+        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorNotReady
+      annotations:
+        description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
+        summary: Prometheus operator not ready
+      expr: |
+        min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
+      for: 5m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorRejectedResources
+      annotations:
+        description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
+        summary: Resources rejected by Prometheus operator
+      expr: |
+        min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
+      for: 5m
+      labels:
+        severity: warning
diff --git a/manifests/prometheus-prometheusRule.yaml b/manifests/prometheus-prometheusRule.yaml
new file mode 100644
index 00000000..aa4f0ce9
--- /dev/null
+++ b/manifests/prometheus-prometheusRule.yaml
@@ -0,0 +1,213 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    app.kubernetes.io/component: prometheus
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/part-of: kube-prometheus
+    app.kubernetes.io/version: 2.24.0
+    prometheus: k8s
+    role: alert-rules
+  name: k8s-rules
+  namespace: monitoring
+spec:
+  groups:
+  - name: prometheus
+    rules:
+    - alert: PrometheusBadConfig
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
+        summary: Failed Prometheus configuration reload.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
+      for: 10m
+      labels:
+        severity: critical
+    - alert: PrometheusNotificationQueueRunningFull
+      annotations:
+        description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
+        summary: Prometheus alert notification queue predicted to run full in less than 30m.
+      expr: |
+        # Without min_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        (
+          predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30)
+        >
+          min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
+      annotations:
+        description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
+        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
+      expr: |
+        (
+          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        /
+          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+        * 100
+        > 1
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusNotConnectedToAlertmanagers
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
+        summary: Prometheus is not connected to any Alertmanagers.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBReloadsFailing
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
+        summary: Prometheus has issues reloading blocks from disk.
+      expr: |
+        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
+      for: 4h
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBCompactionsFailing
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
+        summary: Prometheus has issues compacting blocks.
+      expr: |
+        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
+      for: 4h
+      labels:
+        severity: warning
+    - alert: PrometheusNotIngestingSamples
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
+        summary: Prometheus is not ingesting samples.
+      expr: |
+        (
+          rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
+        and
+          (
+            sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0
+          or
+            sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0
+          )
+        )
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusDuplicateTimestamps
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
+        summary: Prometheus is dropping samples with duplicate timestamps.
+      expr: |
+        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOutOfOrderTimestamps
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
+        summary: Prometheus drops samples with out-of-order timestamps.
+      expr: |
+        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusRemoteStorageFailures
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
+        summary: Prometheus fails to send samples to remote storage.
+      expr: |
+        (
+          rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        /
+          (
+            rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+          +
+            rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+          )
+        )
+        * 100
+        > 1
+      for: 15m
+      labels:
+        severity: critical
+    - alert: PrometheusRemoteWriteBehind
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
+        summary: Prometheus remote write is behind.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        (
+          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
+        - ignoring(remote_name, url) group_right
+          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+        > 120
+      for: 15m
+      labels:
+        severity: critical
+    - alert: PrometheusRemoteWriteDesiredShards
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
+        summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        (
+          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
+        >
+          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusRuleFailures
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
+        summary: Prometheus is failing rule evaluations.
+      expr: |
+        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 15m
+      labels:
+        severity: critical
+    - alert: PrometheusMissingRuleEvaluations
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
+        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
+      expr: |
+        increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusTargetLimitHit
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
+        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
+      expr: |
+        increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
+      annotations:
+        description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
+        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
+      expr: |
+        min without (alertmanager) (
+          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        /
+          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+        * 100
+        > 3
+      for: 15m
+      labels:
+        severity: critical
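
Note for reviewers: the PrometheusRule manifests above are generated output, not hand-written YAML. A minimal standalone sketch of how such a manifest can be rendered from an upstream mixin is below; it is not part of this patch, and the helper file name, the vendored node-mixin import path, and the `nodeExporterSelector` key are assumptions based on the upstream node_exporter mixin.

```jsonnet
// render-node-exporter-rules.jsonnet -- hypothetical helper, not included in this series.
// Builds a PrometheusRule object from the upstream node_exporter mixin, assuming the
// mixin is vendored with jsonnet-bundler under the import path used here.
local nodeMixin = (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') {
  _config+:: {
    // Assumed mixin config key; matches the job label used in the generated rules above.
    nodeExporterSelector: 'job="node-exporter"',
  },
};

{
  apiVersion: 'monitoring.coreos.com/v1',
  kind: 'PrometheusRule',
  metadata: {
    name: 'node-exporter-rules',
    namespace: 'monitoring',
    labels: { prometheus: 'k8s', role: 'alert-rules' },
  },
  spec: {
    // Mixins conventionally expose prometheusAlerts and prometheusRules,
    // each carrying the `groups` array that a PrometheusRule spec expects.
    groups: nodeMixin.prometheusAlerts.groups + nodeMixin.prometheusRules.groups,
  },
}
```

Rendered with something along the lines of `jsonnet -J vendor render-node-exporter-rules.jsonnet | gojsontoyaml`, this should produce YAML in the same shape as `manifests/node-exporter-prometheusRule.yaml` above.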