kube-prometheus: generate manifests without kubectl

For `--dry-run` to work, kubectl still has to talk to a Kubernetes cluster's
apiserver, which is unnecessary just to generate these manifests. Generating
the manifests with plain shell scripts also allows further customization, such
as adding labels to the generated manifests.
Frederic Branczyk
2017-03-10 14:15:10 +01:00
parent 5546016826
commit 9ed63f191f
10 changed files with 238 additions and 115 deletions
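
The difference is easiest to see on one of the replaced invocations; both commands below appear in the generate script hunk further down, and the "needs an apiserver" point is the one the commit message makes:

# before: kubectl's --dry-run still needs a reachable apiserver, even though nothing is created
kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml

# after: a plain shell script, runnable offline, and free to add labels to its output
hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml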


@@ -32,6 +32,12 @@ kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml
kctl apply -f manifests/alertmanager/alertmanager-config.yaml
kctl apply -f manifests/alertmanager/alertmanager-service.yaml
# unfortunately statefulsets cannot be changed except for their replica count
# so we need to make sure that the rule files are created before we create the
# prometheus resource so it can properly discover the rule files when creating
# the statefulset
sleep 5
# `kubectl apply` is currently not working for third party resources so we are
# using `kubectl create` here for the time being.
# (https://github.com/kubernetes/kubernetes/issues/29542)
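
A rough sketch of the step these last two comments lead up to — the TPR-backed resource has to be created rather than applied; the manifest path is an assumption, it is not shown in this hunk:

# TPR instances cannot be `kubectl apply`d yet (kubernetes/kubernetes#29542), so create them instead
kctl create -f manifests/prometheus/prometheus-k8s.yaml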


@@ -0,0 +1,11 @@
#!/bin/bash

cat <<-EOF
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-main
data:
  alertmanager.yaml: $(cat assets/alertmanager/alertmanager.yaml | base64 --wrap=0)
EOF
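
A quick sanity check of the new script's output; the redirect target is the one used by the generate script later in this commit, and the base64 round-trip is only a verification idea, not part of the change:

hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml
# decode the embedded config again to confirm it matches assets/alertmanager/alertmanager.yaml
grep 'alertmanager.yaml:' manifests/alertmanager/alertmanager-config.yaml | awk '{print $2}' | base64 -d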


@@ -0,0 +1,15 @@
#!/bin/bash

cat <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
data:
EOF

for f in assets/grafana/*
do
  echo "  $(basename $f): |+"
  cat $f | sed "s/^/    /g"
done
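
For reference, this is what a single loop iteration does; the file name below is made up for illustration, and the two- and four-space prefixes are what nest each dashboard as a `|+` block scalar under `data:`:

# illustration only: run one iteration by hand against a hypothetical file
f=assets/grafana/example-dashboard.json
echo "  $(basename $f): |+"      # two-space indent nests the key under data:
cat $f | sed "s/^/    /g"        # four-space indent turns the file into the |+ block scalar body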


@@ -1,11 +1,11 @@
#!/bin/bash
# Generate Alert Rules ConfigMap
kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml
hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml
# Generate Dashboard ConfigMap
kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-dashboards.yaml
hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml
# Generate Secret for Alertmanager config
kubectl create secret generic alertmanager-main --dry-run --from-file=assets/alertmanager/alertmanager.yaml -oyaml > manifests/alertmanager/alertmanager-config.yaml
hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml


@@ -0,0 +1,18 @@
#!/bin/bash

cat <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
EOF

for f in assets/prometheus/rules/*.rules
do
  echo "  $(basename $f): |+"
  cat $f | sed "s/^/    /g"
done
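
Unlike the kubectl-generated ConfigMap, this one carries labels, and those labels are exactly what the Prometheus resource's new ruleSelector (last hunk of this commit) matches on. A quick offline check that they come out as expected:

# should print the labels block: role: prometheus-rulefiles / prometheus: k8s
hack/scripts/generate-rules-configmap.sh | grep -A2 'labels:'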


@@ -1,7 +1,6 @@
apiVersion: v1
data:
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==
kind: Secret
metadata:
creationTimestamp: null
name: alertmanager-main
data:
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==


@@ -1,6 +1,9 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboards
data:
all-nodes-dashboard.json: |
all-nodes-dashboard.json: |+
{
"dashboard":
{
@@ -861,7 +864,7 @@ data:
],
"overwrite": true
}
deployment-dashboard.json: |-
deployment-dashboard.json: |+
{
"dashboard": {
"__inputs": [
@@ -1678,8 +1681,7 @@ data:
}
],
"overwrite": true
}
kubernetes-pods-dashboard.json: |
} kubernetes-pods-dashboard.json: |+
{
"dashboard": {
"__inputs": [
@@ -2089,7 +2091,7 @@ data:
],
"overwrite": true
}
node-dashboard.json: |
node-dashboard.json: |+
{
"dashboard":
{
@@ -2970,7 +2972,7 @@ data:
],
"overwrite": true
}
prometheus-datasource.json: |
prometheus-datasource.json: |+
{
"access": "proxy",
"basicAuth": false,
@@ -2978,7 +2980,7 @@ data:
"type": "prometheus",
"url": "http://prometheus-k8s.monitoring.svc:9090"
}
resource-requests-dashboard.json: |-
resource-requests-dashboard.json: |+
{
"__inputs": [
{
@@ -3403,7 +3405,3 @@ data:
"title": "Resource Requests",
"version": 1
}
kind: ConfigMap
metadata:
creationTimestamp: null
name: grafana-dashboards
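
Apart from dropping kubectl's `creationTimestamp: null` stanza and moving `metadata` to the top, the content change in this file is the chomping indicator on each dashboard entry: kubectl emitted `|` or `|-` depending on the source file's trailing newlines, while the script always emits `|+`, which keeps them as-is. A crude check that every entry in the regenerated ConfigMap uses the keep indicator:

# counts the entries ending in ': |+' (should equal the number of files in assets/grafana/)
grep -c ': |+$' manifests/grafana/grafana-dashboards.yaml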


@@ -1,57 +1,133 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
etcd2.rules: "### General cluster availability ###\n\n# alert if another failed
peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"}
== 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity
= \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n
\ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n
\ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to
an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n
\ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) >
0.01\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
\ summary = \"a high number of HTTP requests are failing\",\n description
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}\",\n }\n\n# alert if more than 5% of requests to an HTTP
endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))
\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
> 0.05\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
{\n summary = \"a high number of HTTP requests are failing\",\n description
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT
HighNumberOfFailedHTTPRequests\n IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\",
code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
> 0.5\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
{\n summary = \"a high number of HTTP requests are failing\",\n description
= \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses
on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile
of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99,
rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP
requests\",\n description = \"on ectd instance {{ $labels.instance }} HTTP
requests to {{ $label.method }} are slow\",\n }\n\n### File descriptor alerts
###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert
if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n
\ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors
soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance
}} will exhaust in file descriptors soon\",\n }\n\n# alert if file descriptors
are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m],
3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
{\n summary = \"file descriptors soon exhausted\",\n description = \"{{
$labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors
soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed
proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd\"}[1h])
> 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary
= \"a high number of failed proposals within the etcd cluster are happening\",\n
\ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }}
proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts
###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT
HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m]))
> 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
\ summary = \"high fsync durations\",\n description = \"ectd instance {{
$labels.instance }} fync durations are high\",\n }\n"
  etcd2.rules: |+
    ### General cluster availability ###

    # alert if another failed peer will result in an unavailable cluster
    ALERT InsufficientPeers
      IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
      FOR 3m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "Etcd cluster small",
        description = "If one more etcd peer goes down the cluster will be unavailable",
      }

    ### HTTP requests alerts ###

    # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
        / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
        / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if 50% of requests get a 4xx response
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
        / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
      }

    # alert if the 99th percentile of HTTP requests take more than 150ms
    ALERT HTTPRequestsSlow
      IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "slow HTTP requests",
        description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
      }

    ### File descriptor alerts ###

    instance:fd_utilization = process_open_fds / process_max_fds

    # alert if file descriptors are likely to exhaust within the next 4 hours
    ALERT FdExhaustionClose
      IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "file descriptors soon exhausted",
        description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
      }

    # alert if file descriptors are likely to exhaust within the next hour
    ALERT FdExhaustionClose
      IF predict_linear(instance:fd_utilization[10m], 3600) > 1
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "file descriptors soon exhausted",
        description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
      }

    ### etcd proposal alerts ###

    # alert if there are several failed proposals within an hour
    ALERT HighNumberOfFailedProposals
      IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of failed proposals within the etcd cluster are happening",
        description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
      }

    ### etcd disk io latency alerts ###

    # alert if 99th percentile of fsync durations is higher than 500ms
    ALERT HighFsyncDurations
      IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "high fsync durations",
        description = "ectd instance {{ $labels.instance }} fync durations are high",
      }
  kubernetes.rules: |+
    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
@@ -441,7 +517,3 @@ data:
        description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
      }
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-k8s-rules
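
Since the rules now also live as plain files under assets/prometheus/rules/, they can be linted before regenerating the ConfigMap; a sketch assuming a Prometheus 1.x promtool is on the PATH:

# validate the rule files themselves, independent of the generated manifest
promtool check-rules assets/prometheus/rules/etcd2.rules assets/prometheus/rules/kubernetes.rules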


@@ -10,6 +10,10 @@ spec:
  serviceMonitorSelector:
    matchExpression:
    - {key: k8s-apps, operator: Exists}
  ruleSelector:
    matchLabels:
      role: prometheus-rulefiles
      prometheus: k8s
  resources:
    requests:
      # 2Gi is default, but won't schedule if you don't have a node with >2Gi
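
Once deployed, this ruleSelector should pick up exactly the ConfigMap generated above, because the matchLabels here mirror the labels the generator script emits; a quick check against a running cluster (the monitoring namespace is taken from the datasource URL elsewhere in this commit):

# should list prometheus-k8s-rules, the ConfigMap produced by generate-rules-configmap.sh
kubectl -n monitoring get configmap -l role=prometheus-rulefiles,prometheus=k8s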