kube-prometheus: generate manifests without kubectl
kubectl's `--dry-run` still needs to talk to a Kubernetes cluster's apiserver, which is unnecessary for generating these manifests. Generating them with plain shell scripts instead removes that dependency and also allows further customization, such as adding labels to the generated manifests.
This commit is contained in:
@@ -32,6 +32,12 @@ kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml
|
||||
kctl apply -f manifests/alertmanager/alertmanager-config.yaml
|
||||
kctl apply -f manifests/alertmanager/alertmanager-service.yaml
|
||||
|
||||
# unfortunately statefulsets cannot be changed except for their replica count
|
||||
# so we need to make sure that the rule files are created before we create the
|
||||
# prometheus resource so it can properly discover the rule files when creating
|
||||
# the statefulset
|
||||
sleep 5
|
||||
|
||||
# `kubectl apply` is currently not working for third party resources so we are
|
||||
# using `kubectl create` here for the time being.
|
||||
# (https://github.com/kubernetes/kubernetes/issues/29542)
|
||||
|
11
hack/scripts/generate-alertmanager-config-secret.sh
Executable file
11
hack/scripts/generate-alertmanager-config-secret.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
#
# Print a Kubernetes Secret manifest named "alertmanager-main" to stdout.
#
# The Secret carries a single key, alertmanager.yaml, whose value is the
# base64-encoded content of assets/alertmanager/alertmanager.yaml. The path is
# relative, so the script must be run from the directory that contains assets/.
#
# Usage:
#   hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml

set -euo pipefail

config_file="assets/alertmanager/alertmanager.yaml"

# Fail loudly instead of emitting a Secret with an empty value.
if [[ ! -r "${config_file}" ]]; then
  echo "error: cannot read ${config_file}; run this script from the directory that contains assets/" >&2
  exit 1
fi

# `base64 --wrap=0` only exists in GNU coreutils. Stripping the line breaks
# with tr yields the same single-line output and also works with BSD/macOS
# base64.
encoded_config="$(base64 < "${config_file}" | tr -d '\n')"

cat <<EOF
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-main
data:
  alertmanager.yaml: ${encoded_config}
EOF
|
||||
|
15
hack/scripts/generate-dashboards-configmap.sh
Executable file
15
hack/scripts/generate-dashboards-configmap.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
#
# Print a Kubernetes ConfigMap manifest named "grafana-dashboards" to stdout.
#
# Every regular file in assets/grafana/ becomes one entry under `data:`; the
# key is the file's basename and the value is the file content as a YAML
# literal block scalar. The path is relative, so the script must be run from
# the directory that contains assets/.
#
# Usage:
#   hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml

set -euo pipefail

dashboards_dir="assets/grafana"

cat <<EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
data:
EOF

for f in "${dashboards_dir}"/*
do
  # An unmatched glob stays literal; skip it and anything else that is not a
  # regular file rather than failing on it.
  [[ -f "${f}" ]] || continue

  echo "  $(basename "${f}"): |+"

  # awk terminates every printed line with a newline, even when the input
  # file's last line has none. Without that guarantee the next key is glued
  # onto the previous file's final line ("} next-dashboard.json: |+") and the
  # next dashboard is swallowed into the previous value.
  awk '{ print "    " $0 }' "${f}"
done
|
@@ -1,11 +1,11 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Generate Alert Rules ConfigMap
|
||||
kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml
|
||||
hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml
|
||||
|
||||
# Generate Dashboard ConfigMap
|
||||
kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-dashboards.yaml
|
||||
hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml
|
||||
|
||||
# Generate Secret for Alertmanager config
|
||||
kubectl create secret generic alertmanager-main --dry-run --from-file=assets/alertmanager/alertmanager.yaml -oyaml > manifests/alertmanager/alertmanager-config.yaml
|
||||
hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml
|
||||
|
||||
|
18
hack/scripts/generate-rules-configmap.sh
Executable file
18
hack/scripts/generate-rules-configmap.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
#
# Print a Kubernetes ConfigMap manifest named "prometheus-k8s-rules" to stdout.
#
# The ConfigMap is labelled role=prometheus-rulefiles and prometheus=k8s.
# Every *.rules file in assets/prometheus/rules/ becomes one entry under
# `data:`; the key is the file's basename and the value is the file content as
# a YAML literal block scalar. The path is relative, so the script must be run
# from the directory that contains assets/.
#
# Usage:
#   hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml

set -euo pipefail

rules_dir="assets/prometheus/rules"

cat <<EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
EOF

for f in "${rules_dir}"/*.rules
do
  # An unmatched glob stays literal; skip it and anything else that is not a
  # regular file rather than failing on it.
  [[ -f "${f}" ]] || continue

  echo "  $(basename "${f}"): |+"

  # awk terminates every printed line with a newline, even when the input
  # file's last line has none, so the next key always starts on its own line.
  awk '{ print "    " $0 }' "${f}"
done
|
@@ -1,7 +1,6 @@
|
||||
apiVersion: v1
|
||||
data:
|
||||
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==
|
||||
kind: Secret
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: alertmanager-main
|
||||
data:
|
||||
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==
|
||||
|
@@ -1,6 +1,9 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboards
|
||||
data:
|
||||
all-nodes-dashboard.json: |
|
||||
all-nodes-dashboard.json: |+
|
||||
{
|
||||
"dashboard":
|
||||
{
|
||||
@@ -861,7 +864,7 @@ data:
|
||||
],
|
||||
"overwrite": true
|
||||
}
|
||||
deployment-dashboard.json: |-
|
||||
deployment-dashboard.json: |+
|
||||
{
|
||||
"dashboard": {
|
||||
"__inputs": [
|
||||
@@ -1678,8 +1681,7 @@ data:
|
||||
}
|
||||
],
|
||||
"overwrite": true
|
||||
}
|
||||
kubernetes-pods-dashboard.json: |
|
||||
} kubernetes-pods-dashboard.json: |+
|
||||
{
|
||||
"dashboard": {
|
||||
"__inputs": [
|
||||
@@ -2089,7 +2091,7 @@ data:
|
||||
],
|
||||
"overwrite": true
|
||||
}
|
||||
node-dashboard.json: |
|
||||
node-dashboard.json: |+
|
||||
{
|
||||
"dashboard":
|
||||
{
|
||||
@@ -2970,7 +2972,7 @@ data:
|
||||
],
|
||||
"overwrite": true
|
||||
}
|
||||
prometheus-datasource.json: |
|
||||
prometheus-datasource.json: |+
|
||||
{
|
||||
"access": "proxy",
|
||||
"basicAuth": false,
|
||||
@@ -2978,7 +2980,7 @@ data:
|
||||
"type": "prometheus",
|
||||
"url": "http://prometheus-k8s.monitoring.svc:9090"
|
||||
}
|
||||
resource-requests-dashboard.json: |-
|
||||
resource-requests-dashboard.json: |+
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
@@ -3403,7 +3405,3 @@ data:
|
||||
"title": "Resource Requests",
|
||||
"version": 1
|
||||
}
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: grafana-dashboards
|
||||
|
@@ -1,57 +1,133 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: prometheus-k8s-rules
|
||||
labels:
|
||||
role: prometheus-rulefiles
|
||||
prometheus: k8s
|
||||
data:
|
||||
etcd2.rules: "### General cluster availability ###\n\n# alert if another failed
|
||||
peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"}
|
||||
== 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity
|
||||
= \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n
|
||||
\ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n
|
||||
\ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to
|
||||
an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
|
||||
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n
|
||||
\ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) >
|
||||
0.01\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
|
||||
\ summary = \"a high number of HTTP requests are failing\",\n description
|
||||
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||
{{ $labels.instance }}\",\n }\n\n# alert if more than 5% of requests to an HTTP
|
||||
endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
|
||||
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))
|
||||
\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
|
||||
> 0.05\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
|
||||
{\n summary = \"a high number of HTTP requests are failing\",\n description
|
||||
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||
{{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT
|
||||
HighNumberOfFailedHTTPRequests\n IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\",
|
||||
code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
|
||||
> 0.5\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
|
||||
{\n summary = \"a high number of HTTP requests are failing\",\n description
|
||||
= \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses
|
||||
on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile
|
||||
of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99,
|
||||
rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS
|
||||
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP
|
||||
requests\",\n description = \"on ectd instance {{ $labels.instance }} HTTP
|
||||
requests to {{ $label.method }} are slow\",\n }\n\n### File descriptor alerts
|
||||
###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert
|
||||
if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n
|
||||
\ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS
|
||||
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors
|
||||
soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance
|
||||
}} will exhaust in file descriptors soon\",\n }\n\n# alert if file descriptors
|
||||
are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m],
|
||||
3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
|
||||
{\n summary = \"file descriptors soon exhausted\",\n description = \"{{
|
||||
$labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors
|
||||
soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed
|
||||
proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd\"}[1h])
|
||||
> 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary
|
||||
= \"a high number of failed proposals within the etcd cluster are happening\",\n
|
||||
\ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }}
|
||||
proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts
|
||||
###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT
|
||||
HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m]))
|
||||
> 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
|
||||
\ summary = \"high fsync durations\",\n description = \"ectd instance {{
|
||||
$labels.instance }} fync durations are high\",\n }\n"
|
||||
etcd2.rules: |+
|
||||
### General cluster availability ###
|
||||
|
||||
# alert if another failed peer will result in an unavailable cluster
|
||||
ALERT InsufficientPeers
|
||||
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Etcd cluster small",
|
||||
description = "If one more etcd peer goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
### HTTP requests alerts ###
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if 50% of requests get a 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
|
||||
ALERT HTTPRequestsSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow HTTP requests",
|
||||
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
|
||||
}
|
||||
|
||||
### File descriptor alerts ###
|
||||
|
||||
instance:fd_utilization = process_open_fds / process_max_fds
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next 4 hours
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next hour
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
### etcd proposal alerts ###
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of failed proposals within the etcd cluster are happening",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
}
|
||||
|
||||
### etcd disk io latency alerts ###
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "ectd instance {{ $labels.instance }} fync durations are high",
|
||||
}
|
||||
kubernetes.rules: |+
|
||||
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
|
||||
|
||||
@@ -441,7 +517,3 @@ data:
|
||||
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
|
||||
}
|
||||
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: prometheus-k8s-rules
|
||||
|
@@ -10,6 +10,10 @@ spec:
|
||||
serviceMonitorSelector:
|
||||
matchExpression:
|
||||
- {key: k8s-apps, operator: Exists}
|
||||
ruleSelector:
|
||||
matchLabels:
|
||||
role: prometheus-rulefiles
|
||||
prometheus: k8s
|
||||
resources:
|
||||
requests:
|
||||
# 2Gi is default, but won't schedule if you don't have a node with >2Gi
|
||||
|
Reference in New Issue
Block a user