kube-prometheus: Migrate kube-prometheus alerts to jsonnet
jsonnet/kube-prometheus/alerts/alertmanager.libsonnet (normal file, +53)
@@ -0,0 +1,53 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager.rules',
        rules: [
          {
            alert: 'AlertmanagerConfigInconsistent',
            annotations: {
              description: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.',
              summary: 'Configuration out of sync',
            },
            expr: |||
              count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'AlertmanagerDownOrMissing',
            annotations: {
              description: 'An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.',
              summary: 'Alertmanager down or missing',
            },
            expr: |||
              label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{%(alertmanagerSelector)s}) BY (job) != 1
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'AlertmanagerFailedReload',
            annotations: {
              description: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.",
              summary: "Alertmanager's configuration reload failed",
            },
            expr: |||
              alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
        ],
      },
    ],
  },
}
jsonnet/kube-prometheus/alerts/alerts.libsonnet (normal file, +4)
@@ -0,0 +1,4 @@
(import 'alertmanager.libsonnet') +
(import 'general.libsonnet') +
(import 'node.libsonnet') +
(import 'prometheus.libsonnet')
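The aggregator above simply sums the four per-component alert objects into one hidden prometheusAlerts field. As a rough usage sketch (the entry-point path, output file name, and flags below are illustrative assumptions, not part of this commit), the merged object can be rendered to a Prometheus rule file:

// render-alerts.jsonnet -- hypothetical usage sketch
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: { namespace: 'monitoring' },
};

// prometheusAlerts is a hidden (::) field, so it is not part of the default
// output, but it can still be referenced and serialized explicitly:
{ 'alerts.yaml': std.manifestYamlDoc(kp.prometheusAlerts) }

Evaluated with something like "jsonnet -J vendor -m . -S render-alerts.jsonnet", this writes the familiar groups: rule-file YAML.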
jsonnet/kube-prometheus/alerts/general.libsonnet (normal file, +34)
@@ -0,0 +1,34 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'general.rules',
        rules: [
          {
            alert: 'TargetDown',
            annotations: {
              description: '{{ $value }}% of {{ $labels.job }} targets are down.',
              summary: 'Targets are down',
            },
            expr: '100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10',
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'DeadMansSwitch',
            annotations: {
              description: 'This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.',
              summary: 'Alerting DeadMansSwitch',
            },
            expr: 'vector(1)',
            labels: {
              severity: 'none',
            },
          },
        ],
      },
    ],
  },
}
jsonnet/kube-prometheus/alerts/node.libsonnet (normal file, +39)
@@ -0,0 +1,39 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'kube-prometheus-node-alerting.rules',
        rules: [
          {
            alert: 'NodeDiskRunningFull',
            annotations: {
              description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})',
              summary: 'Node disk is running full within 24 hours',
            },
            expr: |||
              predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[6h], 3600 * 24) < 0
            ||| % $._config,
            'for': '30m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'NodeDiskRunningFull',
            annotations: {
              description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})',
              summary: 'Node disk is running full within 2 hours',
            },
            expr: |||
              predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[30m], 3600 * 2) < 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
        ],
      },
    ],
  },
}
jsonnet/kube-prometheus/alerts/prometheus.libsonnet (normal file, +151)
@@ -0,0 +1,151 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus.rules',
        rules: [
          {
            alert: 'PrometheusConfigReloadFailed',
            annotations: {
              description: "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}",
              summary: "Reloading Prometheus' configuration failed",
            },
            expr: |||
              prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusNotificationQueueRunningFull',
            annotations: {
              description: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}",
              summary: "Prometheus' alert notification queue is running full",
            },
            expr: |||
              predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > prometheus_notifications_queue_capacity{%(prometheusSelector)s}
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusErrorSendingAlerts',
            annotations: {
              description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}',
              summary: 'Errors while sending alerts from Prometheus',
            },
            expr: |||
              rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.01
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusErrorSendingAlerts',
            annotations: {
              description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}',
              summary: 'Errors while sending alerts from Prometheus',
            },
            expr: |||
              rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.03
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'PrometheusNotConnectedToAlertmanagers',
            annotations: {
              description: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers',
              summary: 'Prometheus is not connected to any Alertmanagers',
            },
            expr: |||
              prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusTSDBReloadsFailing',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.',
              summary: 'Prometheus has issues reloading data blocks from disk',
            },
            expr: |||
              increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusTSDBCompactionsFailing',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.',
              summary: 'Prometheus has issues compacting sample blocks',
            },
            expr: |||
              increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusTSDBWALCorruptions',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).',
              summary: 'Prometheus write-ahead log is corrupted',
            },
            expr: |||
              tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
            ||| % $._config,
            'for': '4h',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusNotIngestingSamples',
            annotations: {
              description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.",
              summary: "Prometheus isn't ingesting samples",
            },
            expr: |||
              rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusTargetScrapesDuplicate',
            annotations: {
              description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values',
              summary: 'Prometheus has many samples rejected',
            },
            expr: |||
              increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
        ],
      },
    ],
  },
}
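Every file above extends the hidden prometheusAlerts+:: field rather than defining it outright, so downstream users can mix in their own groups the same way. A minimal sketch of such an extension (the alert below is made up for illustration):

// custom-alerts.libsonnet -- hypothetical downstream extension, not part of this commit
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'my-extra.rules',
        rules: [
          {
            alert: 'ExampleAlwaysFiring',  // made-up alert, fires unconditionally
            expr: 'vector(1)',
            labels: {
              severity: 'none',
            },
          },
        ],
      },
    ],
  },
}

Importing this alongside alerts/alerts.libsonnet merges the extra group into the same rendered rule file.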
@@ -6,7 +6,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
(import 'alertmanager/alertmanager.libsonnet') +
(import 'prometheus-operator/prometheus-operator.libsonnet') +
(import 'prometheus/prometheus.libsonnet') +
(import 'kubernetes-mixin/mixin.libsonnet') + {
(import 'kubernetes-mixin/mixin.libsonnet') +
(import 'alerts/alerts.libsonnet') +
(import 'rules/rules.libsonnet') + {
  kubePrometheus+:: {
    namespace: k.core.v1.namespace.new($._config.namespace),
  },
@@ -14,11 +16,31 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
  _config+:: {
    namespace: 'default',

    kubeStateMetricsSelector: 'job="kube-state-metrics"',
    cadvisorSelector: 'job="kubelet"',
    nodeExporterSelector: 'job="node-exporter"',
    kubeletSelector: 'job="kubelet"',
    kubeStateMetricsSelector: 'job="kube-state-metrics"',
    nodeExporterSelector: 'job="node-exporter"',
    notKubeDnsSelector: 'job!="kube-dns"',
    kubeSchedulerSelector: 'job="kube-scheduler"',
    kubeControllerManagerSelector: 'job="kube-controller-manager"',
    kubeApiserverSelector: 'job="apiserver"',
    podLabel: 'pod',

    alertmanagerSelector: 'job="alertmanager-main"',
    prometheusSelector: 'job="prometheus-k8s"',
    prometheusOperatorSelector: 'job="prometheus-operator"',

    jobs: {
      Kubelet: $._config.kubeletSelector,
      KubeScheduler: $._config.kubeSchedulerSelector,
      KubeControllerManager: $._config.kubeControllerManagerSelector,
      KubeAPI: $._config.kubeApiserverSelector,
      KubeStateMetrics: $._config.kubeStateMetricsSelector,
      NodeExporter: $._config.nodeExporterSelector,
      Alertmanager: $._config.alertmanagerSelector,
      Prometheus: $._config.prometheusSelector,
      PrometheusOperator: $._config.prometheusOperatorSelector,
    },

    prometheus+:: {
      rules: $.prometheusRules + $.prometheusAlerts,
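The %(alertmanagerSelector)s-style placeholders in the alert expressions above are resolved against these _config values with plain jsonnet string formatting. A minimal sketch, using the default prometheusSelector added in this commit:

// Illustrative only: shows how '||| ... ||| % $._config' fills in a selector.
local config = { prometheusSelector: 'job="prometheus-k8s"' };
{
  // Evaluates to: 'prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0'
  expr: 'prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0' % config,
}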
jsonnet/kube-prometheus/rules/rules.libsonnet (normal file, +39)
@@ -0,0 +1,39 @@
{
  prometheusRules+:: {
    groups+: [
      {
        name: 'kube-prometheus-node-recording.rules',
        rules: [
          {
            expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)',
            record: 'instance:node_cpu:rate:sum',
          },
          {
            expr: 'sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)',
            record: 'instance:node_filesystem_usage:sum',
          },
          {
            expr: 'sum(rate(node_network_receive_bytes[3m])) BY (instance)',
            record: 'instance:node_network_receive_bytes:rate:sum',
          },
          {
            expr: 'sum(rate(node_network_transmit_bytes[3m])) BY (instance)',
            record: 'instance:node_network_transmit_bytes:rate:sum',
          },
          {
            expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)',
            record: 'instance:node_cpu:ratio',
          },
          {
            expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))',
            record: 'cluster:node_cpu:sum_rate5m',
          },
          {
            expr: 'cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))',
            record: 'cluster:node_cpu:ratio',
          },
        ],
      },
    ],
  },
}