Merge pull request #873 from paulfantom/separate-mixins
README.md (17 lines changed)
@@ -217,25 +217,30 @@ local kp =
   // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') +
   // (import 'kube-prometheus/kube-prometheus-external-metrics.libsonnet') +
   {
-    _config+:: {
-      namespace: 'monitoring',
+    values+:: {
+      common+: {
+        namespace: 'monitoring',
+      },
     },
   };

-{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
+{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } +
 {
   ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name]
-  for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator))
+  for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator))
 } +
-// serviceMonitor is separated so that it can be created after the CRDs are ready
+// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready
 { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } +
+{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } +
+{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } +
 { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
 { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +
 { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
 { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
 { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
 { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
-{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
+{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +
+{ ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) }
 ```

 And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json structure of `{filename: manifest-content}`):
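The manifest map relies on Jsonnet field comprehensions, and the resulting top-level `{filename: manifest-content}` object is exactly what `jsonnet -m` (multi-file output mode) consumes. A minimal standalone sketch of the pattern, with a hypothetical `objects` value:

```
// Sketch only: the prefix-and-filter pattern used by the example above.
// Evaluate with: jsonnet -m out/ sketch.jsonnet  (each field becomes a file)
local objects = {
  deployment: { kind: 'Deployment' },
  serviceMonitor: { kind: 'ServiceMonitor' },
  prometheusRule: { kind: 'PrometheusRule' },
};

{
  // Prefix every field name, holding back CRD-dependent objects so they
  // can be applied after the CRDs are ready.
  ['setup/prometheus-operator-' + name]: objects[name]
  for name in std.filter(
    function(name) name != 'serviceMonitor' && name != 'prometheusRule',
    std.objectFields(objects)
  )
}
```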
@@ -21,25 +21,30 @@ local kp =
   // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') +
   // (import 'kube-prometheus/kube-prometheus-external-metrics.libsonnet') +
   {
-    _config+:: {
-      namespace: 'monitoring',
+    values+:: {
+      common+: {
+        namespace: 'monitoring',
+      },
     },
   };

-{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
+{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } +
 {
   ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name]
-  for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator))
+  for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator))
 } +
-// serviceMonitor is separated so that it can be created after the CRDs are ready
+// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready
 { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } +
+{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } +
+{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } +
 { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
 { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +
 { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
 { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
 { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
 { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
-{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
+{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +
+{ ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) }
 ```

## Prometheus rules
@@ -9,22 +9,27 @@ local kp =
   // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') +
   // (import 'kube-prometheus/kube-prometheus-external-metrics.libsonnet') +
   {
-    _config+:: {
-      namespace: 'monitoring',
+    values+:: {
+      common+: {
+        namespace: 'monitoring',
+      },
     },
   };

-{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
+{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } +
 {
   ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name]
-  for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator))
+  for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator))
 } +
-// serviceMonitor is separated so that it can be created after the CRDs are ready
+// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready
 { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } +
+{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } +
+{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } +
 { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
 { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +
 { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
 { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
 { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
 { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
-{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
+{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +
+{ ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) }

@@ -1,27 +1,32 @@
 local kp =
   (import 'kube-prometheus/kube-prometheus.libsonnet') + {
-    _config+:: {
-      namespace: 'monitoring',
+    values+:: {
+      common+: {
+        namespace: 'monitoring',
+      },
     },
   };

 local manifests =
   // Uncomment line below to enable vertical auto scaling of kube-state-metrics
   //{ ['ksm-autoscaler-' + name]: kp.ksmAutoscaler[name] for name in std.objectFields(kp.ksmAutoscaler) } +
-  { ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
+  { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } +
   {
     ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name]
-    for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator))
+    for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator))
   } +
-  // serviceMonitor is separated so that it can be created after the CRDs are ready
+  // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready
   { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } +
+  { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } +
+  { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } +
   { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
   { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +
   { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
   { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
   { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
   { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
-  { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) };
+  { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +
+  { ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) };

 local kustomizationResourceFile(name) = './manifests/' + name + '.yaml';
 local kustomization = {

@@ -55,6 +55,14 @@ local defaults = {
     ],
   },
   replicas: 3,
+  mixin: {
+    ruleLabels: {},
+    _config: {
+      alertmanagerName: '{{ $labels.namespace }}/{{ $labels.pod}}',
+      alertmanagerClusterLabels: 'namespace,service',
+      alertmanagerSelector: 'job="alertmanager-' + defaults.name + '",namespace="' + defaults.namespace + '"',
+    },
+  },
 };

@@ -63,6 +71,26 @@ function(params) {
   config:: defaults + params,
   // Safety check
   assert std.isObject(am.config.resources),
+  assert std.isObject(am.config.mixin._config),
+
+  mixin:: (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet') {
+    _config+:: am.config.mixin._config,
+  },
+
+  prometheusRule: {
+    apiVersion: 'monitoring.coreos.com/v1',
+    kind: 'PrometheusRule',
+    metadata: {
+      labels: am.config.commonLabels + am.config.mixin.ruleLabels,
+      name: am.config.name + '-rules',
+      namespace: am.config.namespace,
+    },
+    spec: {
+      local r = if std.objectHasAll(am.mixin, 'prometheusRules') then am.mixin.prometheusRules.groups else [],
+      local a = if std.objectHasAll(am.mixin, 'prometheusAlerts') then am.mixin.prometheusAlerts.groups else [],
+      groups: a + r,
+    },
+  },
+
   secret: {
     apiVersion: 'v1',

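Every component in this PR repeats the same shape: pull the upstream mixin in under a hidden `mixin::` field, parameterize it via `config.mixin._config`, and materialize it as a `PrometheusRule`. A minimal sketch of the pattern in isolation (the `myMixin` value is hypothetical; the `std.objectHasAll` guards tolerate mixins that ship only rules or only alerts):

```
// Sketch of the per-component mixin -> PrometheusRule pattern.
local myMixin = {
  prometheusAlerts+:: {
    groups+: [{ name: 'example', rules: [{ alert: 'Example', expr: 'vector(1)' }] }],
  },
};

{
  local this = self,
  mixin:: myMixin,

  prometheusRule: {
    apiVersion: 'monitoring.coreos.com/v1',
    kind: 'PrometheusRule',
    metadata: { name: 'example-rules', namespace: 'monitoring' },
    spec: {
      // Guard against mixins that define only one of the two objects.
      local r = if std.objectHasAll(this.mixin, 'prometheusRules') then this.mixin.prometheusRules.groups else [],
      local a = if std.objectHasAll(this.mixin, 'prometheusAlerts') then this.mixin.prometheusAlerts.groups else [],
      groups: a + r,
    },
  },
}
```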
jsonnet/kube-prometheus/grafana/grafana.libsonnet (new file, 85 lines)
@@ -0,0 +1,85 @@
+local defaults = {
+  local defaults = self,
+  name: 'grafana',
+  namespace: error 'must provide namespace',
+  version: error 'must provide version',
+  //image: error 'must provide image',
+  imageRepos: 'grafana/grafana',
+  resources: {
+    requests: { cpu: '100m', memory: '100Mi' },
+    limits: { cpu: '200m', memory: '200Mi' },
+  },
+  commonLabels:: {
+    'app.kubernetes.io/name': defaults.name,
+    'app.kubernetes.io/version': defaults.version,
+    'app.kubernetes.io/component': 'grafana',
+    'app.kubernetes.io/part-of': 'kube-prometheus',
+  },
+  selectorLabels:: {
+    [labelName]: defaults.commonLabels[labelName]
+    for labelName in std.objectFields(defaults.commonLabels)
+    if !std.setMember(labelName, ['app.kubernetes.io/version'])
+  },
+  prometheusName: error 'must provide prometheus name',
+  dashboards: {},
+};
+
+function(params) {
+  local g = self,
+  config:: defaults + params,
+  //local g.config = defaults + params,
+  // Safety check
+  assert std.isObject(g.config.resources),
+
+  local glib = (import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + {
+    _config+:: {
+      namespace: g.config.namespace,
+      versions+:: {
+        grafana: g.config.version,
+      },
+      imageRepos+:: {
+        grafana: g.config.imageRepos,
+      },
+      prometheus+:: {
+        name: g.config.prometheusName,
+      },
+      grafana+:: {
+        labels: g.config.commonLabels,
+        dashboards: g.config.dashboards,
+        resources: g.config.resources,
+      },
+    },
+  },
+
+  service: glib.grafana.service,
+  serviceAccount: glib.grafana.serviceAccount,
+  deployment: glib.grafana.deployment,
+  dashboardDatasources: glib.grafana.dashboardDatasources,
+  dashboardSources: glib.grafana.dashboardSources,
+
+  dashboardDefinitions: if std.length(g.config.dashboards) > 0 then {
+    apiVersion: 'v1',
+    kind: 'ConfigMapList',
+    items: glib.grafana.dashboardDefinitions,
+  },
+  serviceMonitor: {
+    apiVersion: 'monitoring.coreos.com/v1',
+    kind: 'ServiceMonitor',
+    metadata: {
+      name: 'grafana',
+      namespace: g.config.namespace,
+      labels: g.config.commonLabels,
+    },
+    spec: {
+      selector: {
+        matchLabels: {
+          app: 'grafana',
+        },
+      },
+      endpoints: [{
+        port: 'http',
+        interval: '15s',
+      }],
+    },
+  },
+}

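The new component is a plain function over a params object. A hedged usage sketch (the import path and the parameter values here are illustrative, not repo defaults):

```
// Sketch: instantiating the new grafana component directly.
local grafana = import 'kube-prometheus/grafana/grafana.libsonnet';

local g = grafana({
  namespace: 'monitoring',  // required: the default raises an error otherwise
  version: '7.3.5',         // required
  prometheusName: 'k8s',    // required: wired into the datasource config
  dashboards: {},           // optional: {} leaves dashboardDefinitions null
});

{ ['grafana-' + name]: g[name] for name in std.objectFields(g) }
```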
@@ -1,139 +1,108 @@
 local alertmanager = import './alertmanager/alertmanager.libsonnet';
 local blackboxExporter = import './blackbox-exporter/blackbox-exporter.libsonnet';
+local grafana = import './grafana/grafana.libsonnet';
 local kubeStateMetrics = import './kube-state-metrics/kube-state-metrics.libsonnet';
+local customMixin = import './mixin/custom.libsonnet';
+local kubernetesMixin = import './mixin/kubernetes.libsonnet';
 local nodeExporter = import './node-exporter/node-exporter.libsonnet';
 local prometheusAdapter = import './prometheus-adapter/prometheus-adapter.libsonnet';
 local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet';
 local prometheus = import './prometheus/prometheus.libsonnet';

-local monitoringMixins = import './mixins/monitoring-mixins.libsonnet';
-
-(import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') +
 {
-  alertmanager: alertmanager({
-    name: $._config.alertmanagerName,
-    namespace: $._config.namespace,
-    version: '0.21.0',
-    image: 'quay.io/prometheus/alertmanager:v0.21.0',
-  }),
-  blackboxExporter: blackboxExporter({
-    namespace: $._config.namespace,
-    version: '0.18.0',
-    image: 'quay.io/prometheus/blackbox-exporter:v0.18.0',
-  }),
-  kubeStateMetrics: kubeStateMetrics({
-    namespace: $._config.namespace,
-    version: '1.9.7',
-    image: 'quay.io/coreos/kube-state-metrics:v1.9.7',
-  }),
-  nodeExporter: nodeExporter({
-    namespace: $._config.namespace,
-    version: '1.0.1',
-    image: 'quay.io/prometheus/node-exporter:v1.0.1',
-  }),
-  prometheus: prometheus({
-    namespace: $._config.namespace,
-    version: '2.24.0',
-    image: 'quay.io/prometheus/prometheus:v2.24.0',
-    name: $._config.prometheusName,
-    alertmanagerName: $._config.alertmanagerName,
-    rules: $.allRules,
-  }),
-  prometheusAdapter: prometheusAdapter({
-    namespace: $._config.namespace,
-    version: '0.8.2',
-    image: 'directxman12/k8s-prometheus-adapter:v0.8.2',
-    prometheusURL: 'http://prometheus-' + $._config.prometheus.name + '.' + $._config.namespace + '.svc.cluster.local:9090/',
-  }),
-  prometheusOperator: prometheusOperator({
-    namespace: $._config.namespace,
-    version: '0.45.0',
-    image: 'quay.io/prometheus-operator/prometheus-operator:v0.45.0',
-    configReloaderImage: 'quay.io/prometheus-operator/prometheus-config-reloader:v0.45.0',
-    commonLabels+: {
-      'app.kubernetes.io/part-of': 'kube-prometheus',
-    },
-  }),
-  mixins+:: monitoringMixins({
-    namespace: $._config.namespace,
-    alertmanagerName: $._config.alertmanagerName,
-    prometheusName: $._config.prometheusName,
-  }),
+  // using `values` as this is similar to helm
+  values:: {
+    common: {
+      namespace: 'default',
+      ruleLabels: {
+        role: 'alert-rules',
+        prometheus: $.values.prometheus.name,
+      },
+    },
+    alertmanager: {
+      name: 'main',
+      namespace: $.values.common.namespace,
+      version: '0.21.0',
+      image: 'quay.io/prometheus/alertmanager:v0.21.0',
+      mixin+: {
+        ruleLabels: $.values.common.ruleLabels,
+      },
+    },
+    blackboxExporter: {
+      namespace: $.values.common.namespace,
+      version: '0.18.0',
+      image: 'quay.io/prometheus/blackbox-exporter:v0.18.0',
+    },
+    grafana: {
+      namespace: $.values.common.namespace,
+      version: '7.3.5',
+      image: 'grafana/grafana:v7.3.7',
+      prometheusName: $.values.prometheus.name,
+      // TODO(paulfantom) This should be done by iterating over all objects and looking for object.mixin.grafanaDashboards
+      dashboards: $.nodeExporter.mixin.grafanaDashboards + $.prometheus.mixin.grafanaDashboards + $.kubernetesMixin.mixin.grafanaDashboards,
+    },
+    kubeStateMetrics: {
+      namespace: $.values.common.namespace,
+      version: '1.9.7',
+      image: 'quay.io/coreos/kube-state-metrics:v1.9.7',
+      mixin+: { ruleLabels: $.values.common.ruleLabels },
+    },
+    nodeExporter: {
+      namespace: $.values.common.namespace,
+      version: '1.0.1',
+      image: 'quay.io/prometheus/node-exporter:v1.0.1',
+      mixin+: { ruleLabels: $.values.common.ruleLabels },
+    },
+    prometheus: {
+      namespace: $.values.common.namespace,
+      version: '2.24.0',
+      image: 'quay.io/prometheus/prometheus:v2.24.0',
+      name: 'k8s',
+      alertmanagerName: $.values.alertmanager.name,
+      mixin+: { ruleLabels: $.values.common.ruleLabels },
+    },
+    prometheusAdapter: {
+      namespace: $.values.common.namespace,
+      version: '0.8.2',
+      image: 'directxman12/k8s-prometheus-adapter:v0.8.2',
+      prometheusURL: 'http://prometheus-' + $.values.prometheus.name + '.' + $.values.common.namespace + '.svc.cluster.local:9090/',
+    },
+    prometheusOperator: {
+      namespace: $.values.common.namespace,
+      version: '0.45.0',
+      image: 'quay.io/prometheus-operator/prometheus-operator:v0.45.0',
+      configReloaderImage: 'quay.io/prometheus-operator/prometheus-config-reloader:v0.45.0',
+      commonLabels+: {
+        'app.kubernetes.io/part-of': 'kube-prometheus',
+      },
+      mixin+: { ruleLabels: $.values.common.ruleLabels },
+    },
+    kubernetesMixin: {
+      namespace: $.values.common.namespace,
+      mixin+: { ruleLabels: $.values.common.ruleLabels },
+    },
+    kubePrometheus: {
+      namespace: $.values.common.namespace,
+      mixin+: { ruleLabels: $.values.common.ruleLabels },
+    },
+  },

-  // FIXME(paulfantom) Remove this variable by moving each mixin to its own component
-  // Example: node_exporter mixin could be added in ./node-exporter/node-exporter.libsonnet
-  allRules::
-    $.mixins.nodeExporter.prometheusRules +
-    $.mixins.kubernetes.prometheusRules +
-    $.mixins.base.prometheusRules +
-    $.mixins.kubeStateMetrics.prometheusAlerts +
-    $.mixins.nodeExporter.prometheusAlerts +
-    $.mixins.alertmanager.prometheusAlerts +
-    $.mixins.prometheusOperator.prometheusAlerts +
-    $.mixins.kubernetes.prometheusAlerts +
-    $.mixins.prometheus.prometheusAlerts +
-    $.mixins.base.prometheusAlerts,
-
-  kubePrometheus+:: {
+  alertmanager: alertmanager($.values.alertmanager),
+  blackboxExporter: blackboxExporter($.values.blackboxExporter),
+  grafana: grafana($.values.grafana),
+  kubeStateMetrics: kubeStateMetrics($.values.kubeStateMetrics),
+  nodeExporter: nodeExporter($.values.nodeExporter),
+  prometheus: prometheus($.values.prometheus),
+  prometheusAdapter: prometheusAdapter($.values.prometheusAdapter),
+  prometheusOperator: prometheusOperator($.values.prometheusOperator),
+  kubernetesMixin: kubernetesMixin($.values.kubernetesMixin),
+  kubePrometheus: customMixin($.values.kubePrometheus) + {
     namespace: {
       apiVersion: 'v1',
       kind: 'Namespace',
       metadata: {
-        name: $._config.namespace,
+        name: $.values.kubePrometheus.namespace,
       },
     },
   },
-
-  grafana+:: {
-    local dashboardDefinitions = super.dashboardDefinitions,
-
-    dashboardDefinitions: {
-      apiVersion: 'v1',
-      kind: 'ConfigMapList',
-      items: dashboardDefinitions,
-    },
-    serviceMonitor: {
-      apiVersion: 'monitoring.coreos.com/v1',
-      kind: 'ServiceMonitor',
-      metadata: {
-        name: 'grafana',
-        namespace: $._config.namespace,
-        labels: $._config.grafana.labels,
-      },
-      spec: {
-        selector: {
-          matchLabels: {
-            app: 'grafana',
-          },
-        },
-        endpoints: [{
-          port: 'http',
-          interval: '15s',
-        }],
-      },
-    },
-  },
-} + {
-  _config+:: {
-    namespace: 'default',
-    prometheusName: 'k8s',
-    alertmanagerName: 'main',
-
-    versions+:: { grafana: '7.3.5' },
-
-    grafana+:: {
-      labels: {
-        'app.kubernetes.io/name': 'grafana',
-        'app.kubernetes.io/version': $._config.versions.grafana,
-        'app.kubernetes.io/component': 'grafana',
-        'app.kubernetes.io/part-of': 'kube-prometheus',
-      },
-      // FIXME(paulfantom): Same as with rules and alerts.
-      // This should be gathering all dashboards from components without having to enumerate all dashboards.
-      dashboards:
-        $.mixins.nodeExporter.grafanaDashboards +
-        $.mixins.kubernetes.grafanaDashboards +
-        $.mixins.prometheus.grafanaDashboards,
-    },
-  },
 }

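The `values::` tree is now the single user-facing configuration surface; overriding it cascades into every component. A hedged sketch of a downstream override (field names come from the diff above; the concrete values are illustrative):

```
// Sketch: downstream override through the new helm-like values tree.
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  values+:: {
    common+: {
      namespace: 'observability',         // replaces the old _config.namespace
      ruleLabels+: { team: 'platform' },  // propagated into every PrometheusRule
    },
    prometheus+: { replicas: 1 },         // per-component entries merge into component defaults
  },
};

kp.prometheus.prometheusRule.metadata.labels  // carries the extra 'team' label
```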
@@ -2,6 +2,7 @@ local krp = (import '../kube-rbac-proxy/container.libsonnet');

 local defaults = {
+  local defaults = self,
   name: 'kube-state-metrics',
   namespace: error 'must provide namespace',
   version: error 'must provide version',
   image: error 'must provide version',

@@ -13,7 +14,7 @@ local defaults = {
   scrapeInterval: '30s',
   scrapeTimeout: '30s',
   commonLabels:: {
-    'app.kubernetes.io/name': 'kube-state-metrics',
+    'app.kubernetes.io/name': defaults.name,
     'app.kubernetes.io/version': defaults.version,
     'app.kubernetes.io/component': 'exporter',
     'app.kubernetes.io/part-of': 'kube-prometheus',

@@ -23,6 +24,12 @@ local defaults = {
     for labelName in std.objectFields(defaults.commonLabels)
     if !std.setMember(labelName, ['app.kubernetes.io/version'])
   },
+  mixin: {
+    ruleLabels: {},
+    _config: {
+      kubeStateMetricsSelector: 'job="' + defaults.name + '"',
+    },
+  },
 };

 function(params) (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet') {
@@ -30,14 +37,34 @@ function(params) (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-
   config:: defaults + params,
   // Safety check
   assert std.isObject(ksm.config.resources),
+  assert std.isObject(ksm.config.mixin._config),

-  name:: 'kube-state-metrics',
+  name:: ksm.config.name,
   namespace:: ksm.config.namespace,
   version:: ksm.config.version,
   image:: ksm.config.image,
   commonLabels:: ksm.config.commonLabels,
   podLabels:: ksm.config.selectorLabels,

+  mixin:: (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') {
+    _config+:: ksm.config.mixin._config,
+  },
+
+  prometheusRule: {
+    apiVersion: 'monitoring.coreos.com/v1',
+    kind: 'PrometheusRule',
+    metadata: {
+      labels: ksm.config.commonLabels + ksm.config.mixin.ruleLabels,
+      name: ksm.config.name + '-rules',
+      namespace: ksm.config.namespace,
+    },
+    spec: {
+      local r = if std.objectHasAll(ksm.mixin, 'prometheusRules') then ksm.mixin.prometheusRules.groups else [],
+      local a = if std.objectHasAll(ksm.mixin, 'prometheusAlerts') then ksm.mixin.prometheusAlerts.groups else [],
+      groups: a + r,
+    },
+  },
+
   service+: {
     spec+: {
       ports: [

jsonnet/kube-prometheus/mixin/custom.libsonnet (new file, 42 lines)
@@ -0,0 +1,42 @@
+local defaults = {
+  name: 'kube-prometheus',
+  namespace: error 'must provide namespace',
+  commonLabels:: {
+    'app.kubernetes.io/name': 'kube-prometheus',
+    'app.kubernetes.io/component': 'exporter',
+    'app.kubernetes.io/part-of': 'kube-prometheus',
+  },
+  mixin: {
+    ruleLabels: {},
+    _config: {
+      nodeExporterSelector: 'job="node-exporter"',
+      hostNetworkInterfaceSelector: 'device!~"veth.+"',
+    },
+  },
+};
+
+function(params) {
+  local m = self,
+  config:: defaults + params,
+
+  local alertsandrules = (import './alerts/alerts.libsonnet') + (import './rules/rules.libsonnet'),
+
+  mixin:: alertsandrules {
+    _config+:: m.config.mixin._config,
+  },
+
+  prometheusRule: {
+    apiVersion: 'monitoring.coreos.com/v1',
+    kind: 'PrometheusRule',
+    metadata: {
+      labels: m.config.commonLabels + m.config.mixin.ruleLabels,
+      name: m.config.name + '-rules',
+      namespace: m.config.namespace,
+    },
+    spec: {
+      local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else [],
+      local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else [],
+      groups: a + r,
+    },
+  },
+}

jsonnet/kube-prometheus/mixin/kubernetes.libsonnet (new file, 49 lines)
@@ -0,0 +1,49 @@
+local defaults = {
+  name: 'kubernetes',
+  namespace: error 'must provide namespace',
+  commonLabels:: {
+    'app.kubernetes.io/name': 'kube-prometheus',
+    'app.kubernetes.io/component': 'exporter',
+    'app.kubernetes.io/part-of': 'kube-prometheus',
+  },
+  mixin: {
+    ruleLabels: {},
+    _config: {
+      cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
+      kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
+      kubeStateMetricsSelector: 'job="kube-state-metrics"',
+      nodeExporterSelector: 'job="node-exporter"',
+      kubeSchedulerSelector: 'job="kube-scheduler"',
+      kubeControllerManagerSelector: 'job="kube-controller-manager"',
+      kubeApiserverSelector: 'job="apiserver"',
+      podLabel: 'pod',
+      runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+      diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
+      hostNetworkInterfaceSelector: 'device!~"veth.+"',
+    },
+  },
+};
+
+function(params) {
+  local m = self,
+  config:: defaults + params,
+
+  mixin:: (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') {
+    _config+:: m.config.mixin._config,
+  },
+
+  prometheusRule: {
+    apiVersion: 'monitoring.coreos.com/v1',
+    kind: 'PrometheusRule',
+    metadata: {
+      labels: m.config.commonLabels + m.config.mixin.ruleLabels,
+      name: m.config.name + '-rules',
+      namespace: m.config.namespace,
+    },
+    spec: {
+      // Fallbacks must be arrays ([], not {}): groups is a list, and mixing
+      // an object with an array in `a + r` would fail to evaluate.
+      local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else [],
+      local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else [],
+      groups: a + r,
+    },
+  },
+}

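Unlike the exporters, the kubernetes mixin component ships no workload: it exists solely to turn the upstream kubernetes-mixin into a `PrometheusRule`. A minimal hedged usage sketch (import path illustrative):

```
// Sketch: the kubernetes mixin component produces only rule/alert manifests.
local kubernetesMixin = import 'kube-prometheus/mixin/kubernetes.libsonnet';

local km = kubernetesMixin({
  namespace: 'monitoring',
  mixin+: { ruleLabels: { role: 'alert-rules' } },  // matched by Prometheus' ruleSelector
});

km.prometheusRule  // -> the kubernetes-rules PrometheusRule object
```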
@@ -1,77 +0,0 @@
-local defaults = {
-  namespace: error 'must provide namespace',
-  prometheusName: error 'must provide Prometheus resource name',
-  alertmanagerName: error 'must provide Alertmanager resource name',
-};
-
-function(params) {
-  local m = self,
-  config:: defaults + params,
-  base+:
-    (import '../alerts/general.libsonnet') +
-    (import '../alerts/node.libsonnet') +
-    (import '../rules/node-rules.libsonnet') +
-    (import '../rules/general.libsonnet') {
-      _config+:: {
-        nodeExporterSelector: 'job="node-exporter"',
-        hostNetworkInterfaceSelector: 'device!~"veth.+"',
-      },
-    },
-
-  kubernetes:
-    (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') {
-      _config+:: {
-        cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
-        kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
-        kubeStateMetricsSelector: 'job="kube-state-metrics"',
-        nodeExporterSelector: 'job="node-exporter"',
-        kubeSchedulerSelector: 'job="kube-scheduler"',
-        kubeControllerManagerSelector: 'job="kube-controller-manager"',
-        kubeApiserverSelector: 'job="apiserver"',
-        podLabel: 'pod',
-        runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
-        diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
-        hostNetworkInterfaceSelector: 'device!~"veth.+"',
-      },
-    },
-
-  kubeStateMetrics:
-    (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') {
-      _config+:: {
-        kubeStateMetricsSelector: 'job="kube-state-metrics"',
-      },
-    },
-
-  prometheusOperator:
-    (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') {
-      _config+:: {
-        prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + m.config.namespace + '"',
-      },
-    },
-
-  prometheus:
-    (import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') {
-      _config+:: {
-        prometheusSelector: 'job="prometheus-' + m.config.prometheusName + '",namespace="' + m.config.namespace + '"',
-        prometheusName: '{{$labels.namespace}}/{{$labels.pod}}',
-      },
-    },
-
-  alertmanager:
-    (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet') {
-      _config+:: {
-        alertmanagerName: '{{ $labels.namespace }}/{{ $labels.pod}}',
-        alertmanagerClusterLabels: 'namespace,service',
-        alertmanagerSelector: 'job="alertmanager-' + m.config.alertmanagerName + '",namespace="' + m.config.namespace + '"',
-      },
-    },
-
-  nodeExporter:
-    (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') {
-      _config+:: {
-        nodeExporterSelector: 'job="node-exporter"',
-        fsSpaceFillingUpCriticalThreshold: 15,
-        diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
-      },
-    },
-}

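The deleted file's per-mixin `_config` blocks do not disappear: each selector resurfaces as a default in the owning component (for example, `fsSpaceFillingUpCriticalThreshold` moves into node-exporter's defaults). A hedged sketch of what that migration looks like for a downstream override:

```
// Sketch: migrating a mixin override from the deleted monitoring-mixins file.
// Before (roughly, against ./mixins/monitoring-mixins.libsonnet):
//   mixins+:: { nodeExporter+: { _config+:: { fsSpaceFillingUpCriticalThreshold: 10 } } }
// After, through the component's values entry:
{
  values+:: {
    nodeExporter+: {
      mixin+: { _config+: { fsSpaceFillingUpCriticalThreshold: 10 } },
    },
  },
}
```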
@@ -2,6 +2,7 @@ local krp = (import '../kube-rbac-proxy/container.libsonnet');

 local defaults = {
+  local defaults = self,
   name: 'node-exporter',
   namespace: error 'must provide namespace',
   version: error 'must provide version',
   image: error 'must provide version',

@@ -12,7 +13,7 @@ local defaults = {
   listenAddress: '127.0.0.1',
   port: 9100,
   commonLabels:: {
-    'app.kubernetes.io/name': 'node-exporter',
+    'app.kubernetes.io/name': defaults.name,
     'app.kubernetes.io/version': defaults.version,
     'app.kubernetes.io/component': 'exporter',
     'app.kubernetes.io/part-of': 'kube-prometheus',

@@ -22,6 +23,14 @@ local defaults = {
     for labelName in std.objectFields(defaults.commonLabels)
     if !std.setMember(labelName, ['app.kubernetes.io/version'])
   },
+  mixin: {
+    ruleLabels: {},
+    _config: {
+      nodeExporterSelector: 'job="' + defaults.name + '"',
+      fsSpaceFillingUpCriticalThreshold: 15,
+      diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
+    },
+  },
 };

@@ -30,22 +39,42 @@ function(params) {
   config:: defaults + params,
   // Safety check
   assert std.isObject(ne.config.resources),
+  assert std.isObject(ne.config.mixin._config),

+  mixin:: (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') {
+    _config+:: ne.config.mixin._config,
+  },
+
+  prometheusRule: {
+    apiVersion: 'monitoring.coreos.com/v1',
+    kind: 'PrometheusRule',
+    metadata: {
+      labels: ne.config.commonLabels + ne.config.mixin.ruleLabels,
+      name: ne.config.name + '-rules',
+      namespace: ne.config.namespace,
+    },
+    spec: {
+      local r = if std.objectHasAll(ne.mixin, 'prometheusRules') then ne.mixin.prometheusRules.groups else [],
+      local a = if std.objectHasAll(ne.mixin, 'prometheusAlerts') then ne.mixin.prometheusAlerts.groups else [],
+      groups: a + r,
+    },
+  },
+
   clusterRoleBinding: {
     apiVersion: 'rbac.authorization.k8s.io/v1',
     kind: 'ClusterRoleBinding',
     metadata: {
-      name: 'node-exporter',
+      name: ne.config.name,
       labels: ne.config.commonLabels,
     },
     roleRef: {
       apiGroup: 'rbac.authorization.k8s.io',
       kind: 'ClusterRole',
-      name: 'node-exporter',
+      name: ne.config.name,
     },
     subjects: [{
       kind: 'ServiceAccount',
-      name: 'node-exporter',
+      name: ne.config.name,
       namespace: ne.config.namespace,
     }],
   },

@@ -54,7 +83,7 @@ function(params) {
     apiVersion: 'rbac.authorization.k8s.io/v1',
     kind: 'ClusterRole',
     metadata: {
-      name: 'node-exporter',
+      name: ne.config.name,
       labels: ne.config.commonLabels,
     },
     rules: [

@@ -75,7 +104,7 @@ function(params) {
     apiVersion: 'v1',
     kind: 'ServiceAccount',
     metadata: {
-      name: 'node-exporter',
+      name: ne.config.name,
      namespace: ne.config.namespace,
      labels: ne.config.commonLabels,
    },

@@ -85,7 +114,7 @@ function(params) {
     apiVersion: 'v1',
     kind: 'Service',
     metadata: {
-      name: 'node-exporter',
+      name: ne.config.name,
       namespace: ne.config.namespace,
       labels: ne.config.commonLabels,
     },

@@ -102,7 +131,7 @@ function(params) {
     apiVersion: 'monitoring.coreos.com/v1',
     kind: 'ServiceMonitor',
     metadata: {
-      name: 'node-exporter',
+      name: ne.config.name,
       namespace: ne.config.namespace,
       labels: ne.config.commonLabels,
     },

@@ -134,7 +163,7 @@ function(params) {

   daemonset:
     local nodeExporter = {
-      name: 'node-exporter',
+      name: ne.config.name,
       image: ne.config.image,
       args: [
         '--web.listen-address=' + std.join(':', [ne.config.listenAddress, std.toString(ne.config.port)]),

@@ -177,7 +206,7 @@ function(params) {
     apiVersion: 'apps/v1',
     kind: 'DaemonSet',
     metadata: {
-      name: 'node-exporter',
+      name: ne.config.name,
       namespace: ne.config.namespace,
       labels: ne.config.commonLabels,
     },

@@ -199,7 +228,7 @@ function(params) {
       { name: 'sys', hostPath: { path: '/sys' } },
       { name: 'root', hostPath: { path: '/' } },
     ],
-    serviceAccountName: 'node-exporter',
+    serviceAccountName: ne.config.name,
     securityContext: {
       runAsUser: 65534,
       runAsNonRoot: true,

@@ -210,4 +239,6 @@ function(params) {
       },
     },
   },
+
+
 }

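Because every hardcoded `'node-exporter'` string now reads `ne.config.name`, renaming the exporter is a one-parameter change. A hedged sketch (import path and values illustrative):

```
// Sketch: the name parameter now flows into every node-exporter manifest.
local nodeExporter = import 'kube-prometheus/node-exporter/node-exporter.libsonnet';

local ne = nodeExporter({
  name: 'node-exporter-custom',  // propagates to RBAC, Service, DaemonSet, ServiceMonitor, rules
  namespace: 'monitoring',
  version: '1.0.1',
  image: 'quay.io/prometheus/node-exporter:v1.0.1',
});

ne.daemonset.metadata.name  // -> 'node-exporter-custom'
```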
@@ -23,6 +23,15 @@ local defaults = {
     for labelName in std.objectFields(defaults.commonLabels)
     if !std.setMember(labelName, ['app.kubernetes.io/version'])
   },
+  mixin: {
+    ruleLabels: {
+      role: 'alert-rules',
+      prometheus: defaults.name,
+    },
+    _config: {
+      prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"',
+    },
+  },
 };

 function(params)
@@ -31,6 +40,26 @@ function(params)
   assert std.isObject(config.resources);

   prometheusOperator(config) {
+    local po = self,
+    mixin:: (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') {
+      _config+:: config.mixin._config,
+    },
+
+    prometheusRule: {
+      apiVersion: 'monitoring.coreos.com/v1',
+      kind: 'PrometheusRule',
+      metadata: {
+        labels: config.commonLabels + config.mixin.ruleLabels,
+        name: config.name + '-rules',
+        namespace: config.namespace,
+      },
+      spec: {
+        local r = if std.objectHasAll(po.mixin, 'prometheusRules') then po.mixin.prometheusRules.groups else [],
+        local a = if std.objectHasAll(po.mixin, 'prometheusAlerts') then po.mixin.prometheusAlerts.groups else [],
+        groups: a + r,
+      },
+    },
+
     service+: {
       spec+: {
         ports: [

@@ -13,9 +13,6 @@ local defaults = {
   alertmanagerName: error 'must provide alertmanagerName',
   namespaces: ['default', 'kube-system', defaults.namespace],
   replicas: 2,
-  rules: {
-    groups: [],
-  },
   commonLabels:: {
     'app.kubernetes.io/name': 'prometheus',
     'app.kubernetes.io/version': defaults.version,

@@ -27,6 +24,19 @@ local defaults = {
     for labelName in std.objectFields(defaults.commonLabels)
     if !std.setMember(labelName, ['app.kubernetes.io/version'])
   } + { prometheus: defaults.name },
+  ruleSelector: {
+    matchLabels: defaults.mixin.ruleLabels,
+  },
+  mixin: {
+    ruleLabels: {
+      role: 'alert-rules',
+      prometheus: defaults.name,
+    },
+    _config: {
+      prometheusSelector: 'job="prometheus-' + defaults.name + '",namespace="' + defaults.namespace + '"',
+      prometheusName: '{{$labels.namespace}}/{{$labels.pod}}',
+    },
+  },
 };

@@ -35,6 +45,26 @@ function(params) {
   config:: defaults + params,
   // Safety check
   assert std.isObject(p.config.resources),
+  assert std.isObject(p.config.mixin._config),

+  mixin:: (import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') {
+    _config+:: p.config.mixin._config,
+  },
+
+  prometheusRule: {
+    apiVersion: 'monitoring.coreos.com/v1',
+    kind: 'PrometheusRule',
+    metadata: {
+      labels: p.config.commonLabels + p.config.mixin.ruleLabels,
+      name: p.config.name + '-rules',
+      namespace: p.config.namespace,
+    },
+    spec: {
+      local r = if std.objectHasAll(p.mixin, 'prometheusRules') then p.mixin.prometheusRules.groups else [],
+      local a = if std.objectHasAll(p.mixin, 'prometheusAlerts') then p.mixin.prometheusAlerts.groups else [],
+      groups: a + r,
+    },
+  },
+
   serviceAccount: {
     apiVersion: 'v1',

@@ -63,22 +93,6 @@ function(params) {
     },
   },

-  rules: {
-    apiVersion: 'monitoring.coreos.com/v1',
-    kind: 'PrometheusRule',
-    metadata: {
-      labels: {
-        prometheus: p.config.name,
-        role: 'alert-rules',
-      } + p.config.commonLabels,
-      name: 'prometheus-' + p.config.name + '-rules',
-      namespace: p.config.namespace,
-    },
-    spec: {
-      groups: p.config.rules.groups,
-    },
-  },
-
   roleBindingSpecificNamespaces:
     local newSpecificRoleBinding(namespace) = {
       apiVersion: 'rbac.authorization.k8s.io/v1',

@@ -230,12 +244,7 @@ function(params) {
       podMonitorNamespaceSelector: {},
       probeNamespaceSelector: {},
       nodeSelector: { 'kubernetes.io/os': 'linux' },
-      ruleSelector: {
-        matchLabels: {
-          role: 'alert-rules',
-          prometheus: p.config.name,
-        },
-      },
+      ruleSelector: p.config.ruleSelector,
       resources: p.config.resources,
       alerting: {
         alertmanagers: [{

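With `ruleSelector` now derived from `mixin.ruleLabels`, each component's `PrometheusRule` is matched by labels rather than being inlined into a single `rules` object. A hedged sketch of the coupling:

```
// Sketch: ruleLabels on the rule objects must agree with Prometheus' ruleSelector.
local ruleLabels = { role: 'alert-rules', prometheus: 'k8s' };

{
  // Stamped onto every <component>-rules PrometheusRule via mixin.ruleLabels ...
  exampleRuleLabels: ruleLabels,
  // ... and matched by the Prometheus CR, replacing the old hardcoded selector.
  ruleSelector: { matchLabels: ruleLabels },
}
```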
@@ -2,6 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
 - ./manifests/alertmanager-alertmanager.yaml
+- ./manifests/alertmanager-prometheusRule.yaml
 - ./manifests/alertmanager-secret.yaml
 - ./manifests/alertmanager-service.yaml
 - ./manifests/alertmanager-serviceAccount.yaml

@@ -20,15 +21,19 @@ resources:
 - ./manifests/grafana-service.yaml
 - ./manifests/grafana-serviceAccount.yaml
 - ./manifests/grafana-serviceMonitor.yaml
+- ./manifests/kube-prometheus-prometheusRule.yaml
 - ./manifests/kube-state-metrics-clusterRole.yaml
 - ./manifests/kube-state-metrics-clusterRoleBinding.yaml
 - ./manifests/kube-state-metrics-deployment.yaml
+- ./manifests/kube-state-metrics-prometheusRule.yaml
 - ./manifests/kube-state-metrics-service.yaml
 - ./manifests/kube-state-metrics-serviceAccount.yaml
 - ./manifests/kube-state-metrics-serviceMonitor.yaml
+- ./manifests/kubernetes-prometheusRule.yaml
 - ./manifests/node-exporter-clusterRole.yaml
 - ./manifests/node-exporter-clusterRoleBinding.yaml
 - ./manifests/node-exporter-daemonset.yaml
+- ./manifests/node-exporter-prometheusRule.yaml
 - ./manifests/node-exporter-service.yaml
 - ./manifests/node-exporter-serviceAccount.yaml
 - ./manifests/node-exporter-serviceMonitor.yaml

@@ -46,13 +51,14 @@ resources:
 - ./manifests/prometheus-adapter-serviceMonitor.yaml
 - ./manifests/prometheus-clusterRole.yaml
 - ./manifests/prometheus-clusterRoleBinding.yaml
+- ./manifests/prometheus-operator-prometheusRule.yaml
 - ./manifests/prometheus-operator-serviceMonitor.yaml
 - ./manifests/prometheus-prometheus.yaml
+- ./manifests/prometheus-prometheusRule.yaml
 - ./manifests/prometheus-roleBindingConfig.yaml
 - ./manifests/prometheus-roleBindingSpecificNamespaces.yaml
 - ./manifests/prometheus-roleConfig.yaml
 - ./manifests/prometheus-roleSpecificNamespaces.yaml
-- ./manifests/prometheus-rules.yaml
 - ./manifests/prometheus-service.yaml
 - ./manifests/prometheus-serviceAccount.yaml
 - ./manifests/prometheus-serviceMonitor.yaml

manifests/alertmanager-prometheusRule.yaml (new file, 116 lines)
@@ -0,0 +1,116 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    app.kubernetes.io/component: alert-router
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/part-of: kube-prometheus
+    app.kubernetes.io/version: 0.21.0
+    prometheus: k8s
+    role: alert-rules
+  name: main-rules
+  namespace: monitoring
+spec:
+  groups:
+  - name: alertmanager.rules
+    rules:
+    - alert: AlertmanagerFailedReload
+      annotations:
+        description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
+        summary: Reloading an Alertmanager configuration has failed.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0
+      for: 10m
+      labels:
+        severity: critical
+    - alert: AlertmanagerMembersInconsistent
+      annotations:
+        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
+        summary: A member of an Alertmanager cluster has not found all other cluster members.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])
+          < on (namespace,service) group_left
+        count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]))
+      for: 10m
+      labels:
+        severity: critical
+    - alert: AlertmanagerFailedToSendAlerts
+      annotations:
+        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
+        summary: An Alertmanager instance failed to send notifications.
+      expr: |
+        (
+          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
+        /
+          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
+        )
+        > 0.01
+      for: 5m
+      labels:
+        severity: warning
+    - alert: AlertmanagerClusterFailedToSendAlerts
+      annotations:
+        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
+        summary: All Alertmanager instances in a cluster failed to send notifications.
+      expr: |
+        min by (namespace,service) (
+          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
+        /
+          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
+        )
+        > 0.01
+      for: 5m
+      labels:
+        severity: critical
+    - alert: AlertmanagerConfigInconsistent
+      annotations:
+        description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
+        summary: Alertmanager instances within the same cluster have different configurations.
+      expr: |
+        count by (namespace,service) (
+          count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})
+        )
+        != 1
+      for: 20m
+      labels:
+        severity: critical
+    - alert: AlertmanagerClusterDown
+      annotations:
+        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
+        summary: Half or more of the Alertmanager instances within the same cluster are down.
+      expr: |
+        (
+          count by (namespace,service) (
+            avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5
+          )
+        /
+          count by (namespace,service) (
+            up{job="alertmanager-main",namespace="monitoring"}
+          )
+        )
+        >= 0.5
+      for: 5m
+      labels:
+        severity: critical
+    - alert: AlertmanagerClusterCrashlooping
+      annotations:
+        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
+        summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
+      expr: |
+        (
+          count by (namespace,service) (
+            changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4
+          )
+        /
+          count by (namespace,service) (
+            up{job="alertmanager-main",namespace="monitoring"}
+          )
+        )
+        >= 0.5
+      for: 5m
+      labels:
+        severity: critical

@@ -13,7 +13,7 @@ spec:
   template:
     metadata:
       annotations:
-        checksum/grafana-dashboards: a9e19e1ab605dc374f30edda771e6917
+        checksum/grafana-dashboards: b02ae450c84445cbaca8c685eefaec6c
         checksum/grafana-datasources: 48faab41f579fc8efde6034391496f6a
       labels:
         app: grafana

manifests/kube-prometheus-prometheusRule.yaml (new file, 63 lines)
@@ -0,0 +1,63 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    app.kubernetes.io/component: exporter
+    app.kubernetes.io/name: kube-prometheus
+    app.kubernetes.io/part-of: kube-prometheus
+    prometheus: k8s
+    role: alert-rules
+  name: kube-prometheus-rules
+  namespace: monitoring
+spec:
+  groups:
+  - name: general.rules
+    rules:
+    - alert: TargetDown
+      annotations:
+        message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
+      expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
+      for: 10m
+      labels:
+        severity: warning
+    - alert: Watchdog
+      annotations:
+        message: |
+          This is an alert meant to ensure that the entire alerting pipeline is functional.
+          This alert is always firing, therefore it should always be firing in Alertmanager
+          and always fire against a receiver. There are integrations with various notification
+          mechanisms that send a notification when this alert is not firing. For example the
+          "DeadMansSnitch" integration in PagerDuty.
+      expr: vector(1)
+      labels:
+        severity: none
+  - name: node-network
+    rules:
+    - alert: NodeNetworkInterfaceFlapping
+      annotations:
+        message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
+      expr: |
+        changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
+      for: 2m
+      labels:
+        severity: warning
+  - name: kube-prometheus-node-recording.rules
+    rules:
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
+      record: instance:node_cpu:rate:sum
+    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
+      record: instance:node_network_receive_bytes:rate:sum
+    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
+      record: instance:node_network_transmit_bytes:rate:sum
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
+      record: instance:node_cpu:ratio
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
+      record: cluster:node_cpu:sum_rate5m
+    - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
+      record: cluster:node_cpu:ratio
+  - name: kube-prometheus-general.rules
+    rules:
+    - expr: count without(instance, pod, node) (up == 1)
+      record: count:up1
+    - expr: count without(instance, pod, node) (up == 0)
+      record: count:up0

manifests/kube-state-metrics-prometheusRule.yaml (new file, 40 lines)
@@ -0,0 +1,40 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    app.kubernetes.io/component: exporter
+    app.kubernetes.io/name: kube-state-metrics
+    app.kubernetes.io/part-of: kube-prometheus
+    app.kubernetes.io/version: 1.9.7
+    prometheus: k8s
+    role: alert-rules
+  name: kube-state-metrics-rules
+  namespace: monitoring
+spec:
+  groups:
+  - name: kube-state-metrics
+    rules:
+    - alert: KubeStateMetricsListErrors
+      annotations:
+        description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
+        summary: kube-state-metrics is experiencing errors in list operations.
+      expr: |
+        (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
+          /
+        sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
+        > 0.01
+      for: 15m
+      labels:
+        severity: critical
+    - alert: KubeStateMetricsWatchErrors
+      annotations:
+        description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
+        summary: kube-state-metrics is experiencing errors in watch operations.
+      expr: |
+        (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
+          /
+        sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
+        > 0.01
+      for: 15m
+      labels:
+        severity: critical

(File diff suppressed because it is too large.)
manifests/node-exporter-prometheusRule.yaml (new file, 266 lines)
@@ -0,0 +1,266 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: node-exporter
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 1.0.1
    prometheus: k8s
    role: alert-rules
  name: node-exporter-rules
  namespace: monitoring
spec:
  groups:
  - name: node-exporter
    rules:
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
        summary: Filesystem is predicted to run out of space within the next 24 hours.
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
        summary: Filesystem is predicted to run out of space within the next 4 hours.
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
        summary: Filesystem has less than 5% space left.
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
        summary: Filesystem has less than 3% space left.
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemFilesFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
        summary: Filesystem is predicted to run out of inodes within the next 24 hours.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemFilesFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
        summary: Filesystem is predicted to run out of inodes within the next 4 hours.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
        summary: Filesystem has less than 5% inodes left.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
        summary: Filesystem has less than 3% inodes left.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeNetworkReceiveErrs
      annotations:
        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
        summary: Network interface is reporting many receive errors.
      expr: |
        rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
      for: 1h
      labels:
        severity: warning
    - alert: NodeNetworkTransmitErrs
      annotations:
        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
        summary: Network interface is reporting many transmit errors.
      expr: |
        rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
      for: 1h
      labels:
        severity: warning
    - alert: NodeHighNumberConntrackEntriesUsed
      annotations:
        description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
        summary: Number of conntrack entries is getting close to the limit.
      expr: |
        (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
      labels:
        severity: warning
    - alert: NodeTextFileCollectorScrapeError
      annotations:
        description: Node Exporter text file collector failed to scrape.
        summary: Node Exporter text file collector failed to scrape.
      expr: |
        node_textfile_scrape_error{job="node-exporter"} == 1
      labels:
        severity: warning
    - alert: NodeClockSkewDetected
      annotations:
        message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
        summary: Clock skew detected.
      expr: |
        (
          node_timex_offset_seconds > 0.05
        and
          deriv(node_timex_offset_seconds[5m]) >= 0
        )
        or
        (
          node_timex_offset_seconds < -0.05
        and
          deriv(node_timex_offset_seconds[5m]) <= 0
        )
      for: 10m
      labels:
        severity: warning
    - alert: NodeClockNotSynchronising
      annotations:
        message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
        summary: Clock not synchronising.
      expr: |
        min_over_time(node_timex_sync_status[5m]) == 0
        and
        node_timex_maxerror_seconds >= 16
      for: 10m
      labels:
        severity: warning
    - alert: NodeRAIDDegraded
      annotations:
        description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
        summary: RAID array is degraded.
      expr: |
        node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
      for: 15m
      labels:
        severity: critical
    - alert: NodeRAIDDiskFailure
      annotations:
        description: At least one device in the RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
        summary: Failed device in RAID array.
      expr: |
        node_md_disks{state="fail"} > 0
      labels:
        severity: warning
  - name: node-exporter.rules
    rules:
    - expr: |
        count without (cpu) (
          count without (mode) (
            node_cpu_seconds_total{job="node-exporter"}
          )
        )
      record: instance:node_num_cpu:sum
    - expr: |
        1 - avg without (cpu, mode) (
          rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
        )
      record: instance:node_cpu_utilisation:rate1m
    - expr: |
        (
          node_load1{job="node-exporter"}
        /
          instance:node_num_cpu:sum{job="node-exporter"}
        )
      record: instance:node_load1_per_cpu:ratio
    - expr: |
        1 - (
          node_memory_MemAvailable_bytes{job="node-exporter"}
        /
          node_memory_MemTotal_bytes{job="node-exporter"}
        )
      record: instance:node_memory_utilisation:ratio
    - expr: |
        rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
      record: instance:node_vmstat_pgmajfault:rate1m
    - expr: |
        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
      record: instance_device:node_disk_io_time_seconds:rate1m
    - expr: |
        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
      record: instance_device:node_disk_io_time_weighted_seconds:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_receive_bytes_excluding_lo:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_transmit_bytes_excluding_lo:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_receive_drop_excluding_lo:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_transmit_drop_excluding_lo:rate1m
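The filesystem alerts above template `$value` and several series labels into their annotations, and those can be checked in the same promtool format as the earlier sketch. A hypothetical test for the 5% warning rule, again assuming the `spec.groups` section has been extracted into a plain rule file; at 4% free space the warning rule fires while the 3% critical rule and the predict_linear rules stay silent:

```
# node-filesystem-test.yaml (hypothetical name)
rule_files:
  - node-exporter-rules.yaml  # assumed extract of the spec.groups above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # 4% free on a read-write ext4 filesystem, constant over two hours.
      - series: 'node_filesystem_avail_bytes{job="node-exporter",fstype="ext4",device="/dev/sda1",instance="node1:9100"}'
        values: '4+0x120'
      - series: 'node_filesystem_size_bytes{job="node-exporter",fstype="ext4",device="/dev/sda1",instance="node1:9100"}'
        values: '100+0x120'
      - series: 'node_filesystem_readonly{job="node-exporter",fstype="ext4",device="/dev/sda1",instance="node1:9100"}'
        values: '0+0x120'
    alert_rule_test:
      # 90m is past the 1h `for` clause.
      - eval_time: 90m
        alertname: NodeFilesystemAlmostOutOfSpace
        exp_alerts:
          - exp_labels:
              severity: warning
              job: node-exporter
              fstype: ext4
              device: /dev/sda1
              instance: node1:9100
            exp_annotations:
              # printf "%.2f" renders the 4% value as "4.00".
              description: Filesystem on /dev/sda1 at node1:9100 has only 4.00% available space left.
              summary: Filesystem has less than 5% space left.
```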
79 manifests/prometheus-operator-prometheusRule.yaml Normal file
@@ -0,0 +1,79 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/name: prometheus-operator
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.45.0
    prometheus: k8s
    role: alert-rules
  name: prometheus-operator-rules
  namespace: monitoring
spec:
  groups:
  - name: prometheus-operator
    rules:
    - alert: PrometheusOperatorListErrors
      annotations:
        description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
        summary: Errors while performing list operations in controller.
      expr: |
        (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusOperatorWatchErrors
      annotations:
        description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
        summary: Errors while performing watch operations in controller.
      expr: |
        (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusOperatorSyncFailed
      annotations:
        description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
        summary: Last controller reconciliation failed.
      expr: |
        min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorReconcileErrors
      annotations:
        description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
        summary: Errors while reconciling controller.
      expr: |
        (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorNodeLookupErrors
      annotations:
        description: Errors while reconciling Prometheus in {{ $labels.namespace }} namespace.
        summary: Errors while reconciling Prometheus.
      expr: |
        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorNotReady
      annotations:
        description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
        summary: Prometheus operator not ready.
      expr: |
        min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
      for: 5m
      labels:
        severity: warning
    - alert: PrometheusOperatorRejectedResources
      annotations:
        description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
        summary: Resources rejected by Prometheus operator.
      expr: |
        min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
      for: 5m
      labels:
        severity: warning
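Every alert in these manifests carries a `severity` label of `warning` or `critical`, which is the usual key for Alertmanager routing. A minimal, hypothetical Alertmanager configuration fragment that pages on critical alerts and files the rest (the receiver names are illustrative, not part of these manifests):

```
# alertmanager.yaml fragment, routing on the severity label used above
route:
  receiver: default
  routes:
    - match:
        severity: critical
      receiver: on-call-pager
    - match:
        severity: warning
      receiver: team-tickets
receivers:
  - name: default
  - name: on-call-pager
  - name: team-tickets
```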
213 manifests/prometheus-prometheusRule.yaml Normal file
@@ -0,0 +1,213 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 2.24.0
    prometheus: k8s
    role: alert-rules
  name: k8s-rules
  namespace: monitoring
spec:
  groups:
  - name: prometheus
    rules:
    - alert: PrometheusBadConfig
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
        summary: Failed Prometheus configuration reload.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
      for: 10m
      labels:
        severity: critical
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
        summary: Prometheus alert notification queue predicted to run full in less than 30m.
      expr: |
        # Without min_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30)
        >
          min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
      annotations:
        description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
      expr: |
        (
          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
        summary: Prometheus is not connected to any Alertmanagers.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
        summary: Prometheus has issues reloading blocks from disk.
      expr: |
        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
        summary: Prometheus has issues compacting blocks.
      expr: |
        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
        summary: Prometheus is not ingesting samples.
      expr: |
        (
          rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
        and
          (
            sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0
          or
            sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0
          )
        )
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusDuplicateTimestamps
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamps.
        summary: Prometheus is dropping samples with duplicate timestamps.
      expr: |
        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOutOfOrderTimestamps
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
        summary: Prometheus drops samples with out-of-order timestamps.
      expr: |
        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusRemoteStorageFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name }}:{{ $labels.url }}.
        summary: Prometheus fails to send samples to remote storage.
      expr: |
        (
          rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        /
          (
            rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
          +
            rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
          )
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusRemoteWriteBehind
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name }}:{{ $labels.url }}.
        summary: Prometheus remote write is behind.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
        - ignoring(remote_name, url) group_right
          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
        > 120
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusRemoteWriteDesiredShards
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name }}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
        summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
        >
          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusRuleFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
        summary: Prometheus is failing rule evaluations.
      expr: |
        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusMissingRuleEvaluations
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
      expr: |
        increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusTargetLimitHit
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
      expr: |
        increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
      annotations:
        description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
      expr: |
        min without (alertmanager) (
          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
        * 100
        > 3
      for: 15m
      labels:
        severity: critical
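These Prometheus self-monitoring rules can be exercised in the same promtool format as the sketches above. A hypothetical test for PrometheusNotConnectedToAlertmanagers, again assuming the `spec.groups` section has been extracted into a plain rule file; the pod label is invented for illustration:

```
# prometheus-rules-test.yaml (hypothetical name)
rule_files:
  - prometheus-rules.yaml  # assumed extract of the spec.groups above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # Zero Alertmanagers discovered for the whole window.
      - series: 'prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring",pod="prometheus-k8s-0"}'
        values: '0+0x20'
    alert_rule_test:
      # 15m is past the 10m `for` clause.
      - eval_time: 15m
        alertname: PrometheusNotConnectedToAlertmanagers
        exp_alerts:
          - exp_labels:
              severity: warning
              job: prometheus-k8s
              namespace: monitoring
              pod: prometheus-k8s-0
            exp_annotations:
              description: Prometheus monitoring/prometheus-k8s-0 is not connected to any Alertmanagers.
              summary: Prometheus is not connected to any Alertmanagers.
```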