jsonnet: move files around

Signed-off-by: paulfantom <pawel@krupa.net.pl>
This commit is contained in:
paulfantom
2021-01-15 14:54:19 +01:00
parent 75f918067d
commit 1eedb90c17
48 changed files with 27 additions and 36 deletions

View File

@@ -0,0 +1,2 @@
(import 'general.libsonnet') +
(import 'node.libsonnet')

View File

@@ -0,0 +1,38 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'general.rules',
rules: [
{
alert: 'TargetDown',
annotations: {
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.',
},
expr: '100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10',
'for': '10m',
labels: {
severity: 'warning',
},
},
{
alert: 'Watchdog',
annotations: {
message: |||
This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
|||,
},
expr: 'vector(1)',
labels: {
severity: 'none',
},
},
],
},
],
},
}

View File

@@ -0,0 +1,24 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'node-network',
rules: [
{
alert: 'NodeNetworkInterfaceFlapping',
annotations: {
message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"',
},
expr: |||
changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2
||| % $._config,
'for': '2m',
labels: {
severity: 'warning',
},
},
],
},
],
},
}

View File

@@ -0,0 +1,157 @@
# TODO(metalmatze): This file is temporarily saved here for later reference
# until we find out how to integrate the tests into our jsonnet stack.
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
alert_rule_test:
- eval_time: 5m
alertname: AlertmanagerMembersInconsistent
- eval_time: 11m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 17m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 23m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- interval: 1m
input_series:
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
alert_rule_test:
- eval_time: 5m
alertname: AlertmanagerMembersInconsistent
- eval_time: 11m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.1
namespace: monitoring
pod: alertmanager-main-1
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.2
namespace: monitoring
pod: alertmanager-main-2
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 17m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.1
namespace: monitoring
pod: alertmanager-main-1
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.2
namespace: monitoring
pod: alertmanager-main-2
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 23m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.1
namespace: monitoring
pod: alertmanager-main-1
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.2
namespace: monitoring
pod: alertmanager-main-2
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'

View File

@@ -0,0 +1,42 @@
local defaults = {
name: 'kube-prometheus',
namespace: error 'must provide namespace',
commonLabels:: {
'app.kubernetes.io/name': 'kube-prometheus',
'app.kubernetes.io/component': 'exporter',
'app.kubernetes.io/part-of': 'kube-prometheus',
},
mixin: {
ruleLabels: {},
_config: {
nodeExporterSelector: 'job="node-exporter"',
hostNetworkInterfaceSelector: 'device!~"veth.+"',
},
},
};
function(params) {
local m = self,
config:: defaults + params,
local alertsandrules = (import './alerts/alerts.libsonnet') + (import './rules/rules.libsonnet'),
mixin:: alertsandrules {
_config+:: m.config.mixin._config,
},
prometheusRule: {
apiVersion: 'monitoring.coreos.com/v1',
kind: 'PrometheusRule',
metadata: {
labels: m.config.commonLabels + m.config.mixin.ruleLabels,
name: m.config.name + '-rules',
namespace: m.config.namespace,
},
spec: {
local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else [],
local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else [],
groups: a + r,
},
},
}

View File

@@ -0,0 +1,49 @@
local defaults = {
name: 'kubernetes',
namespace: error 'must provide namespace',
commonLabels:: {
'app.kubernetes.io/name': 'kube-prometheus',
'app.kubernetes.io/component': 'exporter',
'app.kubernetes.io/part-of': 'kube-prometheus',
},
mixin: {
ruleLabels: {},
_config: {
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
kubeStateMetricsSelector: 'job="kube-state-metrics"',
nodeExporterSelector: 'job="node-exporter"',
kubeSchedulerSelector: 'job="kube-scheduler"',
kubeControllerManagerSelector: 'job="kube-controller-manager"',
kubeApiserverSelector: 'job="apiserver"',
podLabel: 'pod',
runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
hostNetworkInterfaceSelector: 'device!~"veth.+"',
},
},
};
function(params) {
local m = self,
config:: defaults + params,
mixin:: (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') {
_config+:: m.config.mixin._config,
},
prometheusRule: {
apiVersion: 'monitoring.coreos.com/v1',
kind: 'PrometheusRule',
metadata: {
labels: m.config.commonLabels + m.config.mixin.ruleLabels,
name: m.config.name + '-rules',
namespace: m.config.namespace,
},
spec: {
local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else {},
local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else {},
groups: a + r,
},
},
}

View File

@@ -0,0 +1,19 @@
{
prometheusRules+:: {
groups+: [
{
name: 'kube-prometheus-general.rules',
rules: [
{
expr: 'count without(instance, pod, node) (up == 1)',
record: 'count:up1',
},
{
expr: 'count without(instance, pod, node) (up == 0)',
record: 'count:up0',
},
],
},
],
},
}

View File

@@ -0,0 +1,35 @@
{
prometheusRules+:: {
groups+: [
{
name: 'kube-prometheus-node-recording.rules',
rules: [
{
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)',
record: 'instance:node_cpu:rate:sum',
},
{
expr: 'sum(rate(node_network_receive_bytes_total[3m])) BY (instance)',
record: 'instance:node_network_receive_bytes:rate:sum',
},
{
expr: 'sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)',
record: 'instance:node_network_transmit_bytes:rate:sum',
},
{
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)',
record: 'instance:node_cpu:ratio',
},
{
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))',
record: 'cluster:node_cpu:sum_rate5m',
},
{
expr: 'cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))',
record: 'cluster:node_cpu:ratio',
},
],
},
],
},
}

View File

@@ -0,0 +1,2 @@
(import 'node-rules.libsonnet') +
(import 'general.libsonnet')