Merge remote-tracking branch 'upstream/master'

Eduardo Gonzalez
2017-08-02 11:19:35 +02:00
13 changed files with 221 additions and 49 deletions

Makefile (new file)

@@ -0,0 +1,3 @@
generate:
	@echo ">> Compiling assets and generating Kubernetes manifests"
	@hack/scripts/generate-manifests.sh

@@ -8,3 +8,35 @@ ALERT NodeExporterDown
  summary = "node-exporter cannot be scraped",
  description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
}
ALERT K8SNodeOutOfDisk
  IF kube_node_status_out_of_disk{condition="true"} == 1
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Node ran out of disk space.",
    description = "{{ $labels.node }} has run out of disk space.",
  }

ALERT K8SNodeMemoryPressure
  IF kube_node_status_memory_pressure{condition="true"} == 1
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Node is under memory pressure.",
    description = "{{ $labels.node }} is under memory pressure.",
  }

ALERT K8SNodeDiskPressure
  IF kube_node_status_disk_pressure{condition="true"} == 1
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Node is under disk pressure.",
    description = "{{ $labels.node }} is under disk pressure.",
  }

@@ -0,0 +1,27 @@
# Developing Alerts and Dashboards
`kube-prometheus` ships with a set of default alerting rules and dashboards. At some point one may want to extend them. This document explains the workflow for adding further alerting rules and dashboards.
For both the Prometheus alerting rules and the Grafana dashboards there are Kubernetes `ConfigMap`s that are generated from content in the `assets/` directory.
The source of truth for the alerts and dashboards is the set of files in the `assets/` directory. The respective files are changed there, and then the `make generate` target is run to re-generate the Kubernetes manifests.
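Roughly, the generation maps asset directories to generated manifests as follows (a sketch; the sections below describe each case):

```sh
# assets/prometheus/rules/*.rules  ->  manifests/prometheus/prometheus-k8s-rules.yaml
# assets/grafana/*.json            ->  manifests/grafana/grafana-dashboards.yaml
make generate
```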
## Alerts
The `ConfigMap` that is generated and holds the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`.
It is generated by collecting all the `*.rules` files in the `assets/prometheus/rules/` directory into a single `ConfigMap`.
To extend the alerting rules, add a new `.rules` file to the `assets/prometheus/rules/` directory and re-generate the manifests. To modify existing rules, edit the respective `.rules` file and re-generate the manifest.
Then the generated manifest can be applied against a Kubernetes cluster.
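For example, adding a new rule could look like this (a sketch; the file name `example.rules` and the `monitoring` namespace are assumptions, not part of the repository):

```sh
# Hypothetical new rule file; any name ending in .rules in this directory is picked up.
cat > assets/prometheus/rules/example.rules <<'EOF'
ALERT ExampleTargetDown
  IF up == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Target is down.",
    description = "{{ $labels.instance }} has been unreachable for 10 minutes.",
  }
EOF

# Re-generate the rules ConfigMap and apply it (namespace assumed to be "monitoring").
make generate
kubectl --namespace monitoring apply -f manifests/prometheus/prometheus-k8s-rules.yaml
```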
## Dashboards
The `ConfigMap` that is generated and holds the dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`.
As Grafana's support for applying dashboards from files is limited, a sidecar called "grafana-watcher" was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database stays in sync with them.
To create or edit a dashboard, log in to Grafana, modify the dashboard, and save it. Then download the dashboard definition through `Share` -> `Export` -> `Save to file`. Move the file to `assets/grafana/` and re-generate the manifests.
Then the generated manifest can be applied against a Kubernetes cluster.
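A possible sequence, assuming the exported file is called `my-dashboard.json` and the stack runs in the `monitoring` namespace (both names are illustrative):

```sh
# Move the dashboard definition exported from Grafana into the assets directory.
mv ~/Downloads/my-dashboard.json assets/grafana/

# Re-generate the dashboards ConfigMap and apply it; grafana-watcher picks up the change.
make generate
kubectl --namespace monitoring apply -f manifests/grafana/grafana-dashboards.yaml
```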

@@ -27,6 +27,8 @@ kctl apply -f manifests/node-exporter
kctl apply -f manifests/kube-state-metrics
kctl apply -f manifests/grafana/grafana-credentials.yaml
kctl apply -f manifests/grafana
kctl apply -f manifests/prometheus/
find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \;
kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml
kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
kctl apply -f manifests/alertmanager/

@@ -15,7 +15,9 @@ kctl() {
kctl delete -f manifests/node-exporter
kctl delete -f manifests/kube-state-metrics
kctl delete -f manifests/grafana
kctl delete -f manifests/prometheus
find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \;
kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml
kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
kctl delete -f manifests/alertmanager
# Hack: wait a bit to let the controller delete the deployed Prometheus server.

@@ -11,7 +11,7 @@ spec:
    spec:
      containers:
      - name: grafana
        image: grafana/grafana:4.1.1
        image: grafana/grafana:4.4.1
        env:
        - name: GF_AUTH_BASIC_ENABLED
          value: "true"
@@ -41,7 +41,7 @@ spec:
            memory: 200Mi
            cpu: 200m
      - name: grafana-watcher
        image: quay.io/coreos/grafana-watcher:v0.0.5
        image: quay.io/coreos/grafana-watcher:v0.0.6
        args:
        - '--watch-dir=/var/grafana-dashboards'
        - '--grafana-url=http://localhost:3000'
@@ -56,9 +56,6 @@ spec:
            secretKeyRef:
              name: grafana-credentials
              key: password
        volumeMounts:
        - name: grafana-dashboards
          mountPath: /var/grafana-dashboards
        resources:
          requests:
            memory: "16Mi"

@@ -1,9 +1,9 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: prometheus-operator
  labels:
    k8s-app: prometheus-operator
  name: prometheus-operator
spec:
  replicas: 1
  template:
@@ -11,20 +11,20 @@ spec:
      labels:
        k8s-app: prometheus-operator
    spec:
      serviceAccountName: prometheus-operator
      containers:
      - name: prometheus-operator
        image: quay.io/coreos/prometheus-operator:v0.10.1
        args:
        - "--kubelet-service=kube-system/kubelet"
        - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
      - args:
        - --kubelet-service=kube-system/kubelet
        - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1
        image: quay.io/coreos/prometheus-operator:v0.11.1
        name: prometheus-operator
        ports:
        - name: http
          containerPort: 8080
        - containerPort: 8080
          name: http
        resources:
          limits:
            cpu: 200m
            memory: 100Mi
          requests:
            cpu: 100m
            memory: 50Mi
          limits:
            cpu: 200m
            memory: 300Mi
      serviceAccountName: prometheus-operator

@@ -1,12 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring

@@ -1,18 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["get"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]

@@ -0,0 +1,54 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus-k8s
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring

@@ -0,0 +1,51 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: monitoring
rules:
- apiGroups: [""]
  resources:
  - nodes
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: kube-system
rules:
- apiGroups: [""]
  resources:
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: default
rules:
- apiGroups: [""]
  resources:
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus-k8s
rules:
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]

@@ -582,6 +582,38 @@ data:
        summary = "node-exporter cannot be scraped",
        description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
      }
    ALERT K8SNodeOutOfDisk
      IF kube_node_status_out_of_disk{condition="true"} == 1
      LABELS {
        service = "k8s",
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "Node ran out of disk space.",
        description = "{{ $labels.node }} has run out of disk space.",
      }

    ALERT K8SNodeMemoryPressure
      IF kube_node_status_memory_pressure{condition="true"} == 1
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Node is under memory pressure.",
        description = "{{ $labels.node }} is under memory pressure.",
      }

    ALERT K8SNodeDiskPressure
      IF kube_node_status_disk_pressure{condition="true"} == 1
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Node is under disk pressure.",
        description = "{{ $labels.node }} is under disk pressure.",
      }
  prometheus.rules: |+
    ALERT FailedReload
      IF prometheus_config_last_reload_successful == 0

@@ -9,6 +9,8 @@ spec:
  endpoints:
  - port: http-metrics
    interval: 30s
  - port: cadvisor
    interval: 30s
    honorLabels: true
  selector:
    matchLabels: