Merge remote-tracking branch 'upstream/master'
Makefile (new file, 3 lines)
@@ -0,0 +1,3 @@
+generate:
+	@echo ">> Compiling assets and generating Kubernetes manifests"
+	@hack/scripts/generate-manifests.sh
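For orientation, a minimal usage sketch of this new target, run from the repository root; `make generate` simply shells out to the script referenced above:

```sh
make generate
# equivalently, invoke the script directly:
./hack/scripts/generate-manifests.sh
# review the regenerated output before committing
git diff manifests/
```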
@@ -8,3 +8,35 @@ ALERT NodeExporterDown
     summary = "node-exporter cannot be scraped",
     description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
   }
+ALERT K8SNodeOutOfDisk
+  IF kube_node_status_out_of_disk{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "Node ran out of disk space.",
+    description = "{{ $labels.node }} has run out of disk space.",
+  }
+
+ALERT K8SNodeMemoryPressure
+  IF kube_node_status_memory_pressure{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Node is under memory pressure.",
+    description = "{{ $labels.node }} is under memory pressure.",
+  }
+
+ALERT K8SNodeDiskPressure
+  IF kube_node_status_disk_pressure{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Node is under disk pressure.",
+    description = "{{ $labels.node }} is under disk pressure.",
+  }
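These alerts use the Prometheus 1.x rule syntax. Before re-generating the manifests, the rule file can be sanity-checked with `promtool` from a 1.x Prometheus release; the file name below is illustrative, use whichever `.rules` file under `assets/prometheus/rules/` actually carries these alerts:

```sh
promtool check-rules assets/prometheus/rules/kubernetes.rules
```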
docs/developing-alerts-and-dashboards.md (new file, 27 lines)
@@ -0,0 +1,27 @@
+# Developing Alerts and Dashboards
+
+`kube-prometheus` ships with a set of default alerting rules and dashboards. At some point one may want to extend them. This document explains the workflow for adding alerting rules and dashboards.
+
+For both the Prometheus alerting rules and the Grafana dashboards there are Kubernetes `ConfigMap`s, which are generated from content in the `assets/` directory.
+
+The files in the `assets/` directory are the source of truth for the alerts and dashboards. Make changes there, then run the `make generate` target to re-generate the Kubernetes manifests. A sketch of the full loop follows.
+
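In practice, the loop described above looks roughly like this (a minimal sketch; the rule file name is only an example, and `monitoring` is the namespace used throughout these manifests):

```sh
# 1. Edit the source of truth in assets/
$EDITOR assets/prometheus/rules/kubernetes.rules

# 2. Re-generate the Kubernetes manifests from assets/
make generate

# 3. Apply the re-generated ConfigMap
kubectl --namespace monitoring apply -f manifests/prometheus/prometheus-k8s-rules.yaml
```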
+## Alerts
+
+The generated `ConfigMap` holding the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`.
+
+It is generated by collecting all the `*.rules` files in the `assets/prometheus/rules/` directory into the `ConfigMap`.
+
+To extend the alerting rules, add a new `.rules` file to the `assets/prometheus/rules/` directory and re-generate the manifests (see the sketch after this section). To modify existing rules, edit the respective `.rules` file and re-generate the manifests.
+
+The generated manifest can then be applied to a Kubernetes cluster.
+
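A hedged end-to-end sketch of adding a brand-new rule file; the file name and the alert itself are placeholders:

```sh
# Drop a placeholder rule file into the assets tree (Prometheus 1.x syntax)
cat > assets/prometheus/rules/example.rules <<'EOF'
ALERT ExampleAlwaysFiring
  IF vector(1) == 1
  LABELS { severity = "warning" }
  ANNOTATIONS { summary = "Placeholder alert for testing the workflow." }
EOF

# Re-generate and apply the rules ConfigMap
make generate
kubectl --namespace monitoring apply -f manifests/prometheus/prometheus-k8s-rules.yaml
```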
+## Dashboards
+
+The generated `ConfigMap` holding the dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`.
+
+As Grafana's support for applying dashboards from files is limited, a sidecar (called "grafana-watcher") was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database is in sync with those definitions.
+
+To edit or create a dashboard, log in to Grafana, modify the dashboard, and save it. Then download the dashboard definition via `Share` -> `Export` -> `Save to file`. Move the file to `assets/grafana/` and re-generate the manifests (see the sketch after this section).
+
+The generated manifest can then be applied to a Kubernetes cluster.
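A corresponding sketch for dashboards; the exported file name is a placeholder:

```sh
# Move the dashboard exported from the Grafana UI into the assets tree
mv ~/Downloads/example-dashboard.json assets/grafana/

# Re-generate and apply the dashboards ConfigMap
make generate
kubectl --namespace monitoring apply -f manifests/grafana/grafana-dashboards.yaml
```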
@@ -27,6 +27,8 @@ kctl apply -f manifests/node-exporter
 kctl apply -f manifests/kube-state-metrics
 kctl apply -f manifests/grafana/grafana-credentials.yaml
 kctl apply -f manifests/grafana
-kctl apply -f manifests/prometheus/
+find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \;
+kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml
+kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
 kctl apply -f manifests/alertmanager/
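The `find` invocation applies everything under `manifests/prometheus` except the two RBAC files, which are then applied without the `--namespace` flag because they pin their own namespaces. To preview exactly which files the exclusion selects, a read-only sketch:

```sh
find manifests/prometheus -type f \
  ! -name prometheus-k8s-roles.yaml \
  ! -name prometheus-k8s-role-bindings.yaml \
  -print
```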
@@ -15,7 +15,9 @@ kctl() {
 kctl delete -f manifests/node-exporter
 kctl delete -f manifests/kube-state-metrics
 kctl delete -f manifests/grafana
-kctl delete -f manifests/prometheus
+find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \;
+kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml
+kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
 kctl delete -f manifests/alertmanager

 # Hack: wait a bit to let the controller delete the deployed Prometheus server.
@@ -11,7 +11,7 @@ spec:
     spec:
       containers:
       - name: grafana
-        image: grafana/grafana:4.1.1
+        image: grafana/grafana:4.4.1
         env:
         - name: GF_AUTH_BASIC_ENABLED
          value: "true"
@@ -41,7 +41,7 @@ spec:
             memory: 200Mi
             cpu: 200m
       - name: grafana-watcher
-        image: quay.io/coreos/grafana-watcher:v0.0.5
+        image: quay.io/coreos/grafana-watcher:v0.0.6
         args:
         - '--watch-dir=/var/grafana-dashboards'
         - '--grafana-url=http://localhost:3000'
@@ -56,9 +56,6 @@ spec:
             secretKeyRef:
               name: grafana-credentials
               key: password
-        volumeMounts:
-        - name: grafana-dashboards
-          mountPath: /var/grafana-dashboards
         resources:
           requests:
             memory: "16Mi"
@@ -1,9 +1,9 @@
 apiVersion: extensions/v1beta1
 kind: Deployment
 metadata:
-  name: prometheus-operator
   labels:
     k8s-app: prometheus-operator
+  name: prometheus-operator
 spec:
   replicas: 1
   template:
@@ -11,20 +11,20 @@ spec:
       labels:
         k8s-app: prometheus-operator
     spec:
-      serviceAccountName: prometheus-operator
       containers:
-      - name: prometheus-operator
-        image: quay.io/coreos/prometheus-operator:v0.10.1
-        args:
-        - "--kubelet-service=kube-system/kubelet"
-        - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
+      - args:
+        - --kubelet-service=kube-system/kubelet
+        - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1
+        image: quay.io/coreos/prometheus-operator:v0.11.1
+        name: prometheus-operator
         ports:
-        - name: http
-          containerPort: 8080
+        - containerPort: 8080
+          name: http
         resources:
+          limits:
+            cpu: 200m
+            memory: 100Mi
           requests:
             cpu: 100m
             memory: 50Mi
-          limits:
-            cpu: 200m
-            memory: 300Mi
+      serviceAccountName: prometheus-operator
@@ -1,12 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1beta1
-kind: ClusterRoleBinding
-metadata:
-  name: prometheus
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: prometheus
-subjects:
-- kind: ServiceAccount
-  name: prometheus-k8s
-  namespace: monitoring
@@ -1,18 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1beta1
-kind: ClusterRole
-metadata:
-  name: prometheus
-rules:
-- apiGroups: [""]
-  resources:
-  - nodes
-  - services
-  - endpoints
-  - pods
-  verbs: ["get", "list", "watch"]
-- apiGroups: [""]
-  resources:
-  - configmaps
-  verbs: ["get"]
-- nonResourceURLs: ["/metrics"]
-  verbs: ["get"]
manifests/prometheus/prometheus-k8s-role-bindings.yaml (new file, 54 lines)
@@ -0,0 +1,54 @@
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: monitoring
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: kube-system
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: default
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus-k8s
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
manifests/prometheus/prometheus-k8s-roles.yaml (new file, 51 lines)
@@ -0,0 +1,51 @@
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: monitoring
+rules:
+- apiGroups: [""]
+  resources:
+  - nodes
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources:
+  - configmaps
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: kube-system
+rules:
+- apiGroups: [""]
+  resources:
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: default
+rules:
+- apiGroups: [""]
+  resources:
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: prometheus-k8s
+rules:
+- nonResourceURLs: ["/metrics"]
+  verbs: ["get"]
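The cluster-wide `ClusterRole` deleted above is replaced by per-namespace `Role`s plus a minimal `ClusterRole` for `/metrics`, so Prometheus's access is now scoped to `monitoring`, `kube-system`, and `default`. A hedged way to spot-check the new model, assuming a kubectl context with impersonation rights:

```sh
# Should succeed: a Role/RoleBinding covers kube-system
kubectl auth can-i list pods \
  --as=system:serviceaccount:monitoring:prometheus-k8s \
  --namespace kube-system

# Should be denied: no Role/RoleBinding covers this namespace
kubectl auth can-i list pods \
  --as=system:serviceaccount:monitoring:prometheus-k8s \
  --namespace some-other-namespace
```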
@@ -582,6 +582,38 @@ data:
         summary = "node-exporter cannot be scraped",
         description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
       }
+    ALERT K8SNodeOutOfDisk
+      IF kube_node_status_out_of_disk{condition="true"} == 1
+      LABELS {
+        service = "k8s",
+        severity = "critical"
+      }
+      ANNOTATIONS {
+        summary = "Node ran out of disk space.",
+        description = "{{ $labels.node }} has run out of disk space.",
+      }
+
+    ALERT K8SNodeMemoryPressure
+      IF kube_node_status_memory_pressure{condition="true"} == 1
+      LABELS {
+        service = "k8s",
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "Node is under memory pressure.",
+        description = "{{ $labels.node }} is under memory pressure.",
+      }
+
+    ALERT K8SNodeDiskPressure
+      IF kube_node_status_disk_pressure{condition="true"} == 1
+      LABELS {
+        service = "k8s",
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "Node is under disk pressure.",
+        description = "{{ $labels.node }} is under disk pressure.",
+      }
   prometheus.rules: |+
     ALERT FailedReload
       IF prometheus_config_last_reload_successful == 0
@@ -9,6 +9,8 @@ spec:
   endpoints:
   - port: http-metrics
     interval: 30s
+  - port: cadvisor
+    interval: 30s
+    honorLabels: true
   selector:
     matchLabels:
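The new `cadvisor` endpoint sets `honorLabels: true`, so on label conflicts the values exposed by cadvisor itself win over the target labels Prometheus would otherwise attach. A hedged way to confirm cadvisor metrics are flowing, assuming the Prometheus API has been made reachable on `localhost:9090` (e.g. via `kubectl port-forward`):

```sh
# container_memory_usage_bytes is exported by cadvisor; a non-empty result
# means the new endpoint is being scraped
curl -s 'http://localhost:9090/api/v1/query?query=container_memory_usage_bytes' | head -c 300; echo
```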