Merge remote-tracking branch 'upstream/master' into feature/prometheus-config-reloader-flags

Jessie A. Morris
2019-03-18 13:08:12 -06:00
20 changed files with 586 additions and 222 deletions


@@ -260,12 +260,12 @@ These are the available fields with their respective default values:
namespace: "default", namespace: "default",
versions+:: { versions+:: {
alertmanager: "v0.16.0", alertmanager: "v0.16.1",
nodeExporter: "v0.17.0", nodeExporter: "v0.17.0",
kubeStateMetrics: "v1.5.0", kubeStateMetrics: "v1.5.0",
kubeRbacProxy: "v0.4.1", kubeRbacProxy: "v0.4.1",
addonResizer: "2.1", addonResizer: "1.8.4",
prometheusOperator: "v0.28.0", prometheusOperator: "v0.29.0",
prometheus: "v2.5.0", prometheus: "v2.5.0",
}, },
@@ -274,7 +274,7 @@ These are the available fields with their respective default values:
alertmanager: "quay.io/prometheus/alertmanager", alertmanager: "quay.io/prometheus/alertmanager",
kubeStateMetrics: "quay.io/coreos/kube-state-metrics", kubeStateMetrics: "quay.io/coreos/kube-state-metrics",
kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy", kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy",
addonResizer: "gcr.io/google-containers/addon-resizer-amd64", addonResizer: "k8s.gcr.io/addon-resizer",
nodeExporter: "quay.io/prometheus/node-exporter", nodeExporter: "quay.io/prometheus/node-exporter",
prometheusOperator: "quay.io/coreos/prometheus-operator", prometheusOperator: "quay.io/coreos/prometheus-operator",
}, },
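The two hunks above bump defaults that live under `_config.versions` and `_config.imageRepos`. As an illustrative aside rather than part of this diff, a minimal jsonnet overlay that pins any of these fields could look like the sketch below; the file name and the chosen values are assumptions, while the `_config+::`, `versions+::`, and `imageRepos+::` fields come straight from the hunks above.

```jsonnet
// Hypothetical example.jsonnet: override the documented defaults from an overlay.
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    versions+:: {
      alertmanager: 'v0.16.1',  // pin to the version this commit introduces
    },
    imageRepos+:: {
      alertmanager: 'quay.io/prometheus/alertmanager',
    },
  },
};

// Render only the Alertmanager manifests, mirroring the comprehensions used later in this README.
{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) }
```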
@@ -298,7 +298,7 @@ These are the available fields with their respective default values:
   receiver: 'null'
   routes:
   - match:
-      alertname: DeadMansSwitch
+      alertname: Watchdog
     receiver: 'null'
 receivers:
 - name: 'null'
@@ -402,12 +402,12 @@ To produce the `docker pull/tag/push` commands that will synchronize upstream im
 ```shell
 $ jsonnet -J vendor -S --tla-str repository=internal-registry.com/organization sync-to-internal-registry.jsonnet
-docker pull gcr.io/google-containers/addon-resizer-amd64:2.1
-docker tag gcr.io/google-containers/addon-resizer-amd64:2.1 internal-registry.com/organization/addon-resizer:2.1
-docker push internal-registry.com/organization/addon-resizer:2.1
-docker pull quay.io/prometheus/alertmanager:v0.15.3
-docker tag quay.io/prometheus/alertmanager:v0.15.3 internal-registry.com/organization/alertmanager:v0.15.3
-docker push internal-registry.com/organization/alertmanager:v0.15.3
+docker pull k8s.gcr.io/addon-resizer:1.8.4
+docker tag k8s.gcr.io/addon-resizer:1.8.4 internal-registry.com/organization/addon-resizer:1.8.4
+docker push internal-registry.com/organization/addon-resizer:1.8.4
+docker pull quay.io/prometheus/alertmanager:v0.16.1
+docker tag quay.io/prometheus/alertmanager:v0.16.1 internal-registry.com/organization/alertmanager:v0.16.1
+docker push internal-registry.com/organization/alertmanager:v0.16.1
 ...
 ```
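The commands above only mirror the images. As a sketch (again, not part of the commit), the mirrored repositories could then be fed back into the manifests through the same `imageRepos+::` mechanism shown earlier; the registry prefix below simply reuses the `--tla-str` value from the example.

```jsonnet
// Hypothetical overlay: consume images from the internal registry populated above.
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    imageRepos+:: {
      addonResizer: 'internal-registry.com/organization/addon-resizer',
      alertmanager: 'internal-registry.com/organization/alertmanager',
      // ...the remaining images would be remapped the same way.
    },
  },
};

// Render the kube-state-metrics manifests, which reference the addon-resizer image.
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) }
```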
@@ -497,7 +497,7 @@ The Alertmanager configuration is located in the `_config.alertmanager.config` c
   receiver: 'null'
   routes:
   - match:
-      alertname: DeadMansSwitch
+      alertname: Watchdog
     receiver: 'null'
 receivers:
 - name: 'null'
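This hunk sits in the README section describing the `_config.alertmanager.config` field. As a rough sketch that is not part of the diff, swapping the inline default shown above for an external file might look like the following; `alertmanager-config.yaml` is an assumed file name.

```jsonnet
// Hypothetical overlay: load the Alertmanager configuration from a neighbouring file.
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    alertmanager+:: {
      config: importstr 'alertmanager-config.yaml',  // assumed path; the field expects the raw YAML as a string
    },
  },
};

{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) }
```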
@@ -581,7 +581,7 @@ Should the Prometheus `/targets` page show kubelet targets, but not able to succ
 As described in the [Prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet, token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default.
-If you are using Google's GKE product, see [docs/GKE-cadvisor-support.md].
+If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
 #### Authentication problem


@@ -49,13 +49,13 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
         name: 'example-group',
         rules: [
           {
-            alert: 'DeadMansSwitch',
+            alert: 'Watchdog',
             expr: 'vector(1)',
             labels: {
               severity: 'none',
             },
             annotations: {
-              description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.',
+              description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.',
             },
           },
         ],
@@ -139,7 +139,75 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
 { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
 { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
 ```
### Changing default rules
Along with adding new rules, the user also has the option to filter or adjust the existing rules imported by `kube-prometheus/kube-prometheus.libsonnet`. The recording rules can be found in [kube-prometheus/rules](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus/jsonnet/kube-prometheus/rules) and [kubernetes-mixin/rules](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/rules), while the alerting rules can be found in [kube-prometheus/alerts](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus/jsonnet/kube-prometheus/alerts) and [kubernetes-mixin/alerts](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/alerts).
Knowing which rules to change, the user can now use functions from the [Jsonnet standard library](https://jsonnet.org/ref/stdlib.html) to make these changes. Below are examples of both a filter and an adjustment being made to the default rules. These changes can be assigned to a local variable and then added to the `local kp` object as seen in the examples above.
#### Filter
Here the alert `KubeStatefulSetReplicasMismatch` is being filtered out of the group `kubernetes-apps`. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet).
```jsonnet
local filter = {
  prometheusAlerts+:: {
    groups: std.map(
      function(group)
        if group.name == 'kubernetes-apps' then
          group {
            rules: std.filter(function(rule)
              rule.alert != "KubeStatefulSetReplicasMismatch",
              group.rules
            )
          }
        else
          group,
      super.groups
    ),
  },
};
```
#### Adjustment
Here the expression for the alert used above is updated from its previous value. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet).
```jsonnet
local update = {
  prometheusAlerts+:: {
    groups: std.map(
      function(group)
        if group.name == 'kubernetes-apps' then
          group {
            rules: std.map(
              function(rule)
                if rule.alert == "KubeStatefulSetReplicasMismatch" then
                  rule {
                    expr: "kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",statefulset!=\"vault\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",statefulset!=\"vault\"}"
                  }
                else
                  rule,
              group.rules
            )
          }
        else
          group,
      super.groups
    ),
  },
};
```
Using the pre-rendered rules example from above, the new local variables can be added as follows:
```jsonnet
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + filter + update + {
prometheusAlerts+:: (import 'existingrule.json'),
};
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
```
 ## Dashboards
 Dashboards can either be added using jsonnet or simply a pre-rendered json dashboard.
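The context lines above only introduce the README's Dashboards section; its body is not shown in this diff. Purely as a hedged sketch of what that section describes, adding a pre-rendered dashboard might look roughly like this; the `grafanaDashboards+::` field name and the dashboard file are assumptions, not something confirmed by this commit.

```jsonnet
// Hypothetical overlay: ship an extra, pre-rendered Grafana dashboard.
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  grafanaDashboards+:: {  // assumed field name; check the README revision you are on
    'my-dashboard.json': (import 'my-dashboard.json'),
  },
};

{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
```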


@@ -12,7 +12,7 @@
   receiver: 'null'
   routes:
   - match:
-      alertname: DeadMansSwitch
+      alertname: Watchdog
     receiver: 'null'
 receivers:
 - name: 'null'


@@ -9,7 +9,7 @@ route:
   receiver: 'null'
   routes:
   - match:
-      alertname: DeadMansSwitch
+      alertname: Watchdog
     receiver: 'null'
 receivers:
 - name: 'null'


@@ -1 +1 @@
{"groups":[{"name":"example-group","rules":[{"alert":"DeadMansSwitch","annotations":{"description":"This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]} {"groups":[{"name":"example-group","rules":[{"alert":"Watchdog","annotations":{"description":"This is a Watchdog meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]}


@@ -1,9 +1,9 @@
 groups:
 - name: example-group
   rules:
-  - alert: DeadMansSwitch
+  - alert: Watchdog
     expr: vector(1)
     labels:
       severity: "none"
     annotations:
-      description: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.
+      description: This is a Watchdog meant to ensure that the entire alerting pipeline is functional.


@@ -8,13 +8,13 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
         name: 'example-group',
         rules: [
           {
-            alert: 'DeadMansSwitch',
+            alert: 'Watchdog',
             expr: 'vector(1)',
             labels: {
               severity: 'none',
             },
             annotations: {
-              description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.',
+              description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.',
             },
           },
         ],


@@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
     namespace: 'default',
     versions+:: {
-      alertmanager: 'v0.16.0',
+      alertmanager: 'v0.16.1',
     },
     imageRepos+:: {
@@ -28,7 +28,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
           {
             receiver: 'null',
             match: {
-              alertname: 'DeadMansSwitch',
+              alertname: 'Watchdog',
             },
           },
         ],


@@ -109,7 +109,7 @@
         summary: 'Prometheus write-ahead log is corrupted',
       },
       expr: |||
-        tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
+        prometheus_tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
       ||| % $._config,
       'for': '4h',
       labels: {


@@ -15,7 +15,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
     service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
     service.mixin.spec.withClusterIp('None'),
   kubeDnsPrometheusDiscoveryService:
-    service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('http-metrics-skydns', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) +
+    service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('metrics', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) +
     service.mixin.metadata.withNamespace('kube-system') +
     service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) +
     service.mixin.spec.withClusterIp('None'),


@@ -5,7 +5,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
 {
   _config+:: {
     versions+:: {
-      thanos: 'v0.2.1',
+      thanos: 'v0.3.2',
     },
     imageRepos+:: {
       thanos: 'improbable/thanos',


@@ -43,7 +43,7 @@ local configMapList = k.core.v1.configMapList;
     namespace: 'default',
     versions+:: {
-      grafana: '6.0.0-beta1',
+      grafana: '6.0.1',
     },
     tlsCipherSuites: [


@@ -18,13 +18,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
       versions+:: {
         kubeStateMetrics: 'v1.5.0',
         kubeRbacProxy: 'v0.4.1',
-        addonResizer: '2.1',
+        addonResizer: '1.8.4',
       },
       imageRepos+:: {
         kubeStateMetrics: 'quay.io/coreos/kube-state-metrics',
         kubeRbacProxy: 'quay.io/coreos/kube-rbac-proxy',
-        addonResizer: 'gcr.io/google-containers/addon-resizer-amd64',
+        addonResizer: 'k8s.gcr.io/addon-resizer',
       },
     },
@@ -175,7 +175,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
           '--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode,
           '--memory=' + $._config.kubeStateMetrics.baseMemory,
           '--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode,
-          '--acceptance-offset=5',
+          '--threshold=5',
           '--deployment=kube-state-metrics',
         ]) +
         container.withEnv([


@@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
     namespace: 'default',
     versions+:: {
-      prometheus: 'v2.5.0',
+      prometheus: 'v2.7.2',
     },
     imageRepos+:: {


@@ -8,7 +8,7 @@
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
} }
}, },
"version": "df002d09f7b7a50321786c4f19c70d371494410b" "version": "9faab58c2b1cce4def2cc35045162554b8e4a706"
}, },
{ {
"name": "ksonnet", "name": "ksonnet",
@@ -28,7 +28,7 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "ccb787a44f2ebdecbb346d57490fa7e49981b323" "version": "b8b1a40066bd40bf7612bbb1cc9208f76530f44a"
}, },
{ {
"name": "grafonnet", "name": "grafonnet",
@@ -48,7 +48,7 @@
"subdir": "grafana-builder" "subdir": "grafana-builder"
} }
}, },
"version": "5d7e5391010c768a6ddd39163c35662f379e20ca" "version": "5cc4bfab6e2453266e47d01b78cbae0b2643426e"
}, },
{ {
"name": "grafana", "name": "grafana",
@@ -78,7 +78,7 @@
"subdir": "Documentation/etcd-mixin" "subdir": "Documentation/etcd-mixin"
} }
}, },
"version": "a7e3bd06b2ef0286e1571836997287a81146c25a" "version": "e1ca3b4434945e57e8e3a451cdbde74a903cc8e1"
} }
] ]
} }


@@ -15,4 +15,4 @@ spec:
     runAsNonRoot: true
     runAsUser: 1000
   serviceAccountName: alertmanager-main
-  version: v0.16.0
+  version: v0.16.1


@@ -1,6 +1,6 @@
 apiVersion: v1
 data:
-  alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJEZWFkTWFuc1N3aXRjaCIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
+  alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
 kind: Secret
 metadata:
   name: alertmanager-main

File diff suppressed because it is too large


@@ -71,7 +71,7 @@ spec:
         - --extra-cpu=2m
         - --memory=150Mi
         - --extra-memory=30Mi
-        - --acceptance-offset=5
+        - --threshold=5
         - --deployment=kube-state-metrics
         env:
         - name: MY_POD_NAME
@@ -84,7 +84,7 @@ spec:
             fieldRef:
               apiVersion: v1
               fieldPath: metadata.namespace
-        image: gcr.io/google-containers/addon-resizer-amd64:2.1
+        image: k8s.gcr.io/addon-resizer:1.8.4
         name: addon-resizer
         resources:
           limits:


@@ -225,21 +225,21 @@ spec:
         )
       record: node:node_memory_swap_io_bytes:sum_rate
     - expr: |
-        avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
+        avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
       record: :node_disk_utilisation:avg_irate
     - expr: |
         avg by (node) (
-          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
+          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
       record: node:node_disk_utilisation:avg_irate
     - expr: |
-        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
+        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3)
       record: :node_disk_saturation:avg_irate
     - expr: |
         avg by (node) (
-          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
+          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
@@ -769,9 +769,9 @@ spec:
         message: API server is returning errors for {{ $value }}% of requests.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
-        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
           /
-        sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
+        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
       for: 10m
       labels:
         severity: critical
@@ -780,9 +780,33 @@ spec:
         message: API server is returning errors for {{ $value }}% of requests.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
-        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
           /
-        sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
+        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
      for: 10m
      labels:
        severity: warning
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests for
          {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests for
          {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
      for: 10m
      labels:
        severity: warning
@@ -951,7 +975,7 @@ spec:
           log (WAL).'
         summary: Prometheus write-ahead log is corrupted
       expr: |
-        tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
+        prometheus_tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
       for: 4h
       labels:
         severity: warning