kube-prometheus: Migrate kube-prometheus alerts to jsonnet

2018-05-28 16:54:39 +02:00
parent 309974fadb
commit 64db049d3a
13 changed files with 497 additions and 258 deletions
--- a/assets/prometheus/rules/alertmanager.rules.yaml
+++ b/assets/prometheus/rules/alertmanager.rules.yaml
@@ -1,33 +0,0 @@
-groups:
- name: alertmanager.rules
-  rules:
-  - alert: AlertmanagerConfigInconsistent
-    expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
-      GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
-      "alertmanager-$1", "alertmanager", "(.*)") != 1
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      description: The configuration of the instances of the Alertmanager cluster
-        `{{$labels.service}}` are out of sync.
-      summary: Configuration out of sync
-  - alert: AlertmanagerDownOrMissing
-    expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
-      "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      description: An unexpected number of Alertmanagers are scraped or Alertmanagers
-        disappeared from discovery.
-      summary: Alertmanager down or missing
-  - alert: AlertmanagerFailedReload
-    expr: alertmanager_config_last_reload_successful == 0
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
-        }}/{{ $labels.pod}}.
-      summary: Alertmanager's configuration reload failed
--- a/assets/prometheus/rules/general.rules.yaml
+++ b/assets/prometheus/rules/general.rules.yaml
@@ -1,39 +0,0 @@
-groups:
- name: general.rules
-  rules:
-  - alert: TargetDown
-    expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: '{{ $value }}% of {{ $labels.job }} targets are down.'
-      summary: Targets are down
-  - alert: DeadMansSwitch
-    expr: vector(1)
-    labels:
-      severity: none
-    annotations:
-      description: This is a DeadMansSwitch meant to ensure that the entire Alerting
-        pipeline is functional.
-      summary: Alerting DeadMansSwitch
-  - record: fd_utilization
-    expr: process_open_fds / process_max_fds
-  - alert: FdExhaustionClose
-    expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
-        will exhaust in file/socket descriptors within the next 4 hours'
-      summary: file descriptors soon exhausted
-  - alert: FdExhaustionClose
-    expr: predict_linear(fd_utilization[10m], 3600) > 1
-    for: 10m
-    labels:
-      severity: critical
-    annotations:
-      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
-        will exhaust in file/socket descriptors within the next hour'
-      summary: file descriptors soon exhausted
--- a/assets/prometheus/rules/node.rules.yaml
+++ b/assets/prometheus/rules/node.rules.yaml
@@ -1,47 +0,0 @@
-groups:
- name: node.rules
-  rules:
-  - record: instance:node_cpu:rate:sum
-    expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m]))
-      BY (instance)
-  - record: instance:node_filesystem_usage:sum
-    expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
-      BY (instance)
-  - record: instance:node_network_receive_bytes:rate:sum
-    expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
-  - record: instance:node_network_transmit_bytes:rate:sum
-    expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
-  - record: instance:node_cpu:ratio
-    expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance)
-      GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
-  - record: cluster:node_cpu:sum_rate5m
-    expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
-  - record: cluster:node_cpu:ratio
-    expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
-  - alert: NodeExporterDown
-    expr: absent(up{job="node-exporter"} == 1)
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: Prometheus could not scrape a node-exporter for more than 10m,
-        or node-exporters have disappeared from discovery
-      summary: Prometheus could not scrape a node-exporter
-  - alert: NodeDiskRunningFull
-    expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
-    for: 30m
-    labels:
-      severity: warning
-    annotations:
-      description: device {{$labels.device}} on node {{$labels.instance}} is running
-        full within the next 24 hours (mounted at {{$labels.mountpoint}})
-      summary: Node disk is running full within 24 hours
-  - alert: NodeDiskRunningFull
-    expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
-    for: 10m
-    labels:
-      severity: critical
-    annotations:
-      description: device {{$labels.device}} on node {{$labels.instance}} is running
-        full within the next 2 hours (mounted at {{$labels.mountpoint}})
-      summary: Node disk is running full within 2 hours
--- a/assets/prometheus/rules/prometheus.rules.yaml
+++ b/assets/prometheus/rules/prometheus.rules.yaml
@@ -1,101 +0,0 @@
-groups:
- name: prometheus.rules
-  rules:
-  - alert: PrometheusConfigReloadFailed
-    expr: prometheus_config_last_reload_successful == 0
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
-      summary: Reloading Promehteus' configuration failed
-
-  - alert: PrometheusNotificationQueueRunningFull
-    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
-        $labels.pod}}
-      summary: Prometheus' alert notification queue is running full  
-
-  - alert: PrometheusErrorSendingAlerts
-    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
-      > 0.01
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
-        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
-      summary: Errors while sending alert from Prometheus
-
-  - alert: PrometheusErrorSendingAlerts
-    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
-      > 0.03
-    for: 10m
-    labels:
-      severity: critical
-    annotations:
-      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
-        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
-      summary: Errors while sending alerts from Prometheus
-
-  - alert: PrometheusNotConnectedToAlertmanagers
-    expr: prometheus_notifications_alertmanagers_discovered < 1
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
-        to any Alertmanagers
-      summary: Prometheus is not connected to any Alertmanagers
-
-  - alert: PrometheusTSDBReloadsFailing
-    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
-    for: 12h
-    labels:
-      severity: warning
-    annotations:
-      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
-        reload failures over the last four hours.'
-      summary: Prometheus has issues reloading data blocks from disk
-
-  - alert: PrometheusTSDBCompactionsFailing
-    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
-    for: 12h
-    labels:
-      severity: warning
-    annotations:
-      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
-        compaction failures over the last four hours.'
-      summary: Prometheus has issues compacting sample blocks
-
-  - alert: PrometheusTSDBWALCorruptions
-    expr: tsdb_wal_corruptions_total > 0
-    for: 4h
-    labels:
-      severity: warning
-    annotations:
-      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
-        log (WAL).'
-      summary: Prometheus write-ahead log is corrupted
-
-  - alert: PrometheusNotIngestingSamples
-    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
-      summary: "Prometheus isn't ingesting samples"
-
-  - alert: PrometheusTargetScapesDuplicate
-    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
-      summary: Prometheus has many samples rejected