Merge remote-tracking branch 'upstream/master' into feature/prometheus-config-reloader-flags

This commit is contained in:
Jessie A. Morris
2019-03-18 13:08:12 -06:00
20 changed files with 586 additions and 222 deletions

View File

@@ -260,12 +260,12 @@ These are the available fields with their respective default values:
namespace: "default",
versions+:: {
alertmanager: "v0.16.0",
alertmanager: "v0.16.1",
nodeExporter: "v0.17.0",
kubeStateMetrics: "v1.5.0",
kubeRbacProxy: "v0.4.1",
addonResizer: "2.1",
prometheusOperator: "v0.28.0",
addonResizer: "1.8.4",
prometheusOperator: "v0.29.0",
prometheus: "v2.5.0",
},
@@ -274,7 +274,7 @@ These are the available fields with their respective default values:
alertmanager: "quay.io/prometheus/alertmanager",
kubeStateMetrics: "quay.io/coreos/kube-state-metrics",
kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy",
addonResizer: "gcr.io/google-containers/addon-resizer-amd64",
addonResizer: "k8s.gcr.io/addon-resizer",
nodeExporter: "quay.io/prometheus/node-exporter",
prometheusOperator: "quay.io/coreos/prometheus-operator",
},
@@ -298,7 +298,7 @@ These are the available fields with their respective default values:
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
alertname: Watchdog
receiver: 'null'
receivers:
- name: 'null'
@@ -402,12 +402,12 @@ To produce the `docker pull/tag/push` commands that will synchronize upstream im
```shell
$ jsonnet -J vendor -S --tla-str repository=internal-registry.com/organization sync-to-internal-registry.jsonnet
docker pull gcr.io/google-containers/addon-resizer-amd64:2.1
docker tag gcr.io/google-containers/addon-resizer-amd64:2.1 internal-registry.com/organization/addon-resizer:2.1
docker push internal-registry.com/organization/addon-resizer:2.1
docker pull quay.io/prometheus/alertmanager:v0.15.3
docker tag quay.io/prometheus/alertmanager:v0.15.3 internal-registry.com/organization/alertmanager:v0.15.3
docker push internal-registry.com/organization/alertmanager:v0.15.3
docker pull k8s.gcr.io/addon-resizer:1.8.4
docker tag k8s.gcr.io/addon-resizer:1.8.4 internal-registry.com/organization/addon-resizer:1.8.4
docker push internal-registry.com/organization/addon-resizer:1.8.4
docker pull quay.io/prometheus/alertmanager:v0.16.1
docker tag quay.io/prometheus/alertmanager:v0.16.1 internal-registry.com/organization/alertmanager:v0.16.1
docker push internal-registry.com/organization/alertmanager:v0.16.1
...
```
@@ -497,7 +497,7 @@ The Alertmanager configuration is located in the `_config.alertmanager.config` c
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
alertname: Watchdog
receiver: 'null'
receivers:
- name: 'null'
@@ -581,7 +581,7 @@ Should the Prometheus `/targets` page show kubelet targets, but not able to succ
As described in the [Prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet, token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default.
If you are using Google's GKE product, see [docs/GKE-cadvisor-support.md].
If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
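For reference, a minimal sketch of what enabling this looks like on the kubelet itself (flag names from the upstream kubelet; how they are actually set depends on your distribution's configuration mechanism, so treat this as an assumption to verify):
```shell
# Sketch only: webhook token authentication and webhook authorization let the
# kubelet delegate these decisions to the API server, which is what allows
# Prometheus (through kube-rbac-proxy) to scrape its metrics endpoints.
kubelet \
  --authentication-token-webhook=true \
  --authorization-mode=Webhook
```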
#### Authentication problem

View File

@@ -49,13 +49,13 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
name: 'example-group',
rules: [
{
alert: 'DeadMansSwitch',
alert: 'Watchdog',
expr: 'vector(1)',
labels: {
severity: 'none',
},
annotations: {
description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.',
description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.',
},
},
],
@@ -139,7 +139,75 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
```
### Changing default rules
Along with adding additional rules, we give the user the option to filter or adjust the existing rules imported by `kube-prometheus/kube-prometheus.libsonnet`. The recording rules can be found in [kube-prometheus/rules](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus/jsonnet/kube-prometheus/rules) and [kubernetes-mixin/rules](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/rules), while the alerting rules can be found in [kube-prometheus/alerts](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus/jsonnet/kube-prometheus/alerts) and [kubernetes-mixin/alerts](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/alerts).
Knowing which rules to change, the user can now use functions from the [Jsonnet standard library](https://jsonnet.org/ref/stdlib.html) to make these changes. Below are examples of both a filter and an adjustment being made to the default rules. These changes can be assigned to a local variable and then added to the `local kp` object as seen in the examples above.
#### Filter
Here the alert `KubeStatefulSetReplicasMismatch` is being filtered out of the group `kubernetes-apps`. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet).
```jsonnet
local filter = {
prometheusAlerts+:: {
groups: std.map(
function(group)
if group.name == 'kubernetes-apps' then
group {
rules: std.filter(function(rule)
rule.alert != "KubeStatefulSetReplicasMismatch",
group.rules
)
}
else
group,
super.groups
),
},
};
```
#### Adjustment
Here the expression for the alert used above is updated from its previous value. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet).
```jsonnet
local update = {
prometheusAlerts+:: {
groups: std.map(
function(group)
if group.name == 'kubernetes-apps' then
group {
rules: std.map(
function(rule)
if rule.alert == "KubeStatefulSetReplicasMismatch" then
rule {
expr: "kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",statefulset!=\"vault\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",statefulset!=\"vault\"}"
}
else
rule,
group.rules
)
}
else
group,
super.groups
),
},
};
```
Using the earlier example of adding pre-rendered rules, the new local variables can be added as follows:
```jsonnet
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + filter + update + {
prometheusAlerts+:: (import 'existingrule.json'),
};
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
```
## Dashboards
Dashboards can be added either using jsonnet or simply as a pre-rendered JSON dashboard.
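As a minimal sketch of the pre-rendered variant (assuming kube-prometheus exposes a hidden `grafanaDashboards` field keyed by file name and that a local `my-dashboard.json` file exists; verify the field name against your version):
```jsonnet
// Sketch: merge a pre-rendered dashboard into the generated Grafana manifests.
// `grafanaDashboards` and `my-dashboard.json` are assumptions for illustration.
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  grafanaDashboards+:: {
    'my-dashboard.json': (import 'my-dashboard.json'),
  },
};

{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
```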

View File

@@ -12,7 +12,7 @@
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
alertname: Watchdog
receiver: 'null'
receivers:
- name: 'null'

View File

@@ -9,7 +9,7 @@ route:
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
alertname: Watchdog
receiver: 'null'
receivers:
- name: 'null'

View File

@@ -1 +1 @@
{"groups":[{"name":"example-group","rules":[{"alert":"DeadMansSwitch","annotations":{"description":"This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]}
{"groups":[{"name":"example-group","rules":[{"alert":"Watchdog","annotations":{"description":"This is a Watchdog meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]}

View File

@@ -1,9 +1,9 @@
groups:
- name: example-group
rules:
- alert: DeadMansSwitch
- alert: Watchdog
expr: vector(1)
labels:
severity: "none"
annotations:
description: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.
description: This is a Watchdog meant to ensure that the entire alerting pipeline is functional.

View File

@@ -8,13 +8,13 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
name: 'example-group',
rules: [
{
alert: 'DeadMansSwitch',
alert: 'Watchdog',
expr: 'vector(1)',
labels: {
severity: 'none',
},
annotations: {
description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.',
description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.',
},
},
],

View File

@@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
namespace: 'default',
versions+:: {
alertmanager: 'v0.16.0',
alertmanager: 'v0.16.1',
},
imageRepos+:: {
@@ -28,7 +28,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
{
receiver: 'null',
match: {
alertname: 'DeadMansSwitch',
alertname: 'Watchdog',
},
},
],

View File

@@ -109,7 +109,7 @@
summary: 'Prometheus write-ahead log is corrupted',
},
expr: |||
tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
prometheus_tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
||| % $._config,
'for': '4h',
labels: {

View File

@@ -15,7 +15,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),
kubeDnsPrometheusDiscoveryService:
service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('http-metrics-skydns', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) +
service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('metrics', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) +
service.mixin.spec.withClusterIp('None'),

View File

@@ -5,7 +5,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
{
_config+:: {
versions+:: {
thanos: 'v0.2.1',
thanos: 'v0.3.2',
},
imageRepos+:: {
thanos: 'improbable/thanos',

View File

@@ -43,7 +43,7 @@ local configMapList = k.core.v1.configMapList;
namespace: 'default',
versions+:: {
grafana: '6.0.0-beta1',
grafana: '6.0.1',
},
tlsCipherSuites: [

View File

@@ -18,13 +18,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
versions+:: {
kubeStateMetrics: 'v1.5.0',
kubeRbacProxy: 'v0.4.1',
addonResizer: '2.1',
addonResizer: '1.8.4',
},
imageRepos+:: {
kubeStateMetrics: 'quay.io/coreos/kube-state-metrics',
kubeRbacProxy: 'quay.io/coreos/kube-rbac-proxy',
addonResizer: 'gcr.io/google-containers/addon-resizer-amd64',
addonResizer: 'k8s.gcr.io/addon-resizer',
},
},
@@ -175,7 +175,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
'--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode,
'--memory=' + $._config.kubeStateMetrics.baseMemory,
'--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode,
'--acceptance-offset=5',
'--threshold=5',
'--deployment=kube-state-metrics',
]) +
container.withEnv([

View File

@@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
namespace: 'default',
versions+:: {
prometheus: 'v2.5.0',
prometheus: 'v2.7.2',
},
imageRepos+:: {

View File

@@ -8,7 +8,7 @@
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
}
},
"version": "df002d09f7b7a50321786c4f19c70d371494410b"
"version": "9faab58c2b1cce4def2cc35045162554b8e4a706"
},
{
"name": "ksonnet",
@@ -28,7 +28,7 @@
"subdir": ""
}
},
"version": "ccb787a44f2ebdecbb346d57490fa7e49981b323"
"version": "b8b1a40066bd40bf7612bbb1cc9208f76530f44a"
},
{
"name": "grafonnet",
@@ -48,7 +48,7 @@
"subdir": "grafana-builder"
}
},
"version": "5d7e5391010c768a6ddd39163c35662f379e20ca"
"version": "5cc4bfab6e2453266e47d01b78cbae0b2643426e"
},
{
"name": "grafana",
@@ -78,7 +78,7 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "a7e3bd06b2ef0286e1571836997287a81146c25a"
"version": "e1ca3b4434945e57e8e3a451cdbde74a903cc8e1"
}
]
}

View File

@@ -15,4 +15,4 @@ spec:
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: alertmanager-main
version: v0.16.0
version: v0.16.1

View File

@@ -1,6 +1,6 @@
apiVersion: v1
data:
alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJEZWFkTWFuc1N3aXRjaCIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
kind: Secret
metadata:
name: alertmanager-main

File diff suppressed because it is too large

View File

@@ -71,7 +71,7 @@ spec:
- --extra-cpu=2m
- --memory=150Mi
- --extra-memory=30Mi
- --acceptance-offset=5
- --threshold=5
- --deployment=kube-state-metrics
env:
- name: MY_POD_NAME
@@ -84,7 +84,7 @@ spec:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
image: gcr.io/google-containers/addon-resizer-amd64:2.1
image: k8s.gcr.io/addon-resizer:1.8.4
name: addon-resizer
resources:
limits:

View File

@@ -225,21 +225,21 @@ spec:
)
record: node:node_memory_swap_io_bytes:sum_rate
- expr: |
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
record: :node_disk_utilisation:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_utilisation:avg_irate
- expr: |
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3)
record: :node_disk_saturation:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
@@ -769,9 +769,9 @@ spec:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
for: 10m
labels:
severity: critical
@@ -780,9 +780,33 @@ spec:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
for: 10m
labels:
severity: warning
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests for
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests for
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
for: 10m
labels:
severity: warning
@@ -951,7 +975,7 @@ spec:
log (WAL).'
summary: Prometheus write-ahead log is corrupted
expr: |
tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
prometheus_tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
for: 4h
labels:
severity: warning