jsonnet: move files around
Signed-off-by: paulfantom <pawel@krupa.net.pl>
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
(import 'general.libsonnet') +
|
||||
(import 'node.libsonnet')
|
||||
@@ -0,0 +1,38 @@
|
||||
{
|
||||
prometheusAlerts+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'general.rules',
|
||||
rules: [
|
||||
{
|
||||
alert: 'TargetDown',
|
||||
annotations: {
|
||||
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.',
|
||||
},
|
||||
expr: '100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10',
|
||||
'for': '10m',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'Watchdog',
|
||||
annotations: {
|
||||
message: |||
|
||||
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
|||,
|
||||
},
|
||||
expr: 'vector(1)',
|
||||
labels: {
|
||||
severity: 'none',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
{
|
||||
prometheusAlerts+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'node-network',
|
||||
rules: [
|
||||
{
|
||||
alert: 'NodeNetworkInterfaceFlapping',
|
||||
annotations: {
|
||||
message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"',
|
||||
},
|
||||
expr: |||
|
||||
changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2
|
||||
||| % $._config,
|
||||
'for': '2m',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
157
jsonnet/kube-prometheus/components/mixin/alerts/tests.yaml
Normal file
157
jsonnet/kube-prometheus/components/mixin/alerts/tests.yaml
Normal file
@@ -0,0 +1,157 @@
|
||||
# TODO(metalmatze): This file is temporarily saved here for later reference
|
||||
# until we find out how to integrate the tests into our jsonnet stack.
|
||||
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0'
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
|
||||
alert_rule_test:
|
||||
- eval_time: 5m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
- eval_time: 11m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- eval_time: 17m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- eval_time: 23m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
|
||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
|
||||
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
|
||||
alert_rule_test:
|
||||
- eval_time: 5m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
- eval_time: 11m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.1
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-1
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.2
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-2
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- eval_time: 17m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.1
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-1
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.2
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-2
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- eval_time: 23m
|
||||
alertname: AlertmanagerMembersInconsistent
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.0
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-0
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.1
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-1
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
- exp_labels:
|
||||
service: 'alertmanager-main'
|
||||
severity: critical
|
||||
job: 'alertmanager-main'
|
||||
instance: 10.10.10.2
|
||||
namespace: monitoring
|
||||
pod: alertmanager-main-2
|
||||
exp_annotations:
|
||||
message: 'Alertmanager has not found all other members of the cluster.'
|
||||
42
jsonnet/kube-prometheus/components/mixin/custom.libsonnet
Normal file
42
jsonnet/kube-prometheus/components/mixin/custom.libsonnet
Normal file
@@ -0,0 +1,42 @@
|
||||
local defaults = {
|
||||
name: 'kube-prometheus',
|
||||
namespace: error 'must provide namespace',
|
||||
commonLabels:: {
|
||||
'app.kubernetes.io/name': 'kube-prometheus',
|
||||
'app.kubernetes.io/component': 'exporter',
|
||||
'app.kubernetes.io/part-of': 'kube-prometheus',
|
||||
},
|
||||
mixin: {
|
||||
ruleLabels: {},
|
||||
_config: {
|
||||
nodeExporterSelector: 'job="node-exporter"',
|
||||
hostNetworkInterfaceSelector: 'device!~"veth.+"',
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
function(params) {
|
||||
local m = self,
|
||||
config:: defaults + params,
|
||||
|
||||
local alertsandrules = (import './alerts/alerts.libsonnet') + (import './rules/rules.libsonnet'),
|
||||
|
||||
mixin:: alertsandrules {
|
||||
_config+:: m.config.mixin._config,
|
||||
},
|
||||
|
||||
prometheusRule: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'PrometheusRule',
|
||||
metadata: {
|
||||
labels: m.config.commonLabels + m.config.mixin.ruleLabels,
|
||||
name: m.config.name + '-rules',
|
||||
namespace: m.config.namespace,
|
||||
},
|
||||
spec: {
|
||||
local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else [],
|
||||
local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else [],
|
||||
groups: a + r,
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
local defaults = {
|
||||
name: 'kubernetes',
|
||||
namespace: error 'must provide namespace',
|
||||
commonLabels:: {
|
||||
'app.kubernetes.io/name': 'kube-prometheus',
|
||||
'app.kubernetes.io/component': 'exporter',
|
||||
'app.kubernetes.io/part-of': 'kube-prometheus',
|
||||
},
|
||||
mixin: {
|
||||
ruleLabels: {},
|
||||
_config: {
|
||||
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
|
||||
kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
|
||||
kubeStateMetricsSelector: 'job="kube-state-metrics"',
|
||||
nodeExporterSelector: 'job="node-exporter"',
|
||||
kubeSchedulerSelector: 'job="kube-scheduler"',
|
||||
kubeControllerManagerSelector: 'job="kube-controller-manager"',
|
||||
kubeApiserverSelector: 'job="apiserver"',
|
||||
podLabel: 'pod',
|
||||
runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
|
||||
diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
|
||||
hostNetworkInterfaceSelector: 'device!~"veth.+"',
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
function(params) {
|
||||
local m = self,
|
||||
config:: defaults + params,
|
||||
|
||||
mixin:: (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') {
|
||||
_config+:: m.config.mixin._config,
|
||||
},
|
||||
|
||||
prometheusRule: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'PrometheusRule',
|
||||
metadata: {
|
||||
labels: m.config.commonLabels + m.config.mixin.ruleLabels,
|
||||
name: m.config.name + '-rules',
|
||||
namespace: m.config.namespace,
|
||||
},
|
||||
spec: {
|
||||
local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else {},
|
||||
local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else {},
|
||||
groups: a + r,
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
{
|
||||
prometheusRules+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'kube-prometheus-general.rules',
|
||||
rules: [
|
||||
{
|
||||
expr: 'count without(instance, pod, node) (up == 1)',
|
||||
record: 'count:up1',
|
||||
},
|
||||
{
|
||||
expr: 'count without(instance, pod, node) (up == 0)',
|
||||
record: 'count:up0',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
{
|
||||
prometheusRules+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'kube-prometheus-node-recording.rules',
|
||||
rules: [
|
||||
{
|
||||
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)',
|
||||
record: 'instance:node_cpu:rate:sum',
|
||||
},
|
||||
{
|
||||
expr: 'sum(rate(node_network_receive_bytes_total[3m])) BY (instance)',
|
||||
record: 'instance:node_network_receive_bytes:rate:sum',
|
||||
},
|
||||
{
|
||||
expr: 'sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)',
|
||||
record: 'instance:node_network_transmit_bytes:rate:sum',
|
||||
},
|
||||
{
|
||||
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)',
|
||||
record: 'instance:node_cpu:ratio',
|
||||
},
|
||||
{
|
||||
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))',
|
||||
record: 'cluster:node_cpu:sum_rate5m',
|
||||
},
|
||||
{
|
||||
expr: 'cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))',
|
||||
record: 'cluster:node_cpu:ratio',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
@@ -0,0 +1,2 @@
|
||||
(import 'node-rules.libsonnet') +
|
||||
(import 'general.libsonnet')
|
||||
Reference in New Issue
Block a user