Add 'contrib/kube-prometheus/' from commit '81c0d2f4d30f63a4e274c2870c5afc89241827b0'
git-subtree-dir: contrib/kube-prometheus
git-subtree-mainline: 050ca21276696c8603375c699513ec487301ed62
git-subtree-split: 81c0d2f4d3
manifests/alertmanager/alertmanager-config.yaml (new file)
@@ -0,0 +1,18 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-main
data:
  alertmanager.yaml: |-
    global:
      resolve_timeout: 5m
    route:
      group_by: ['job']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 12h
      receiver: 'webhook'
    receivers:
    - name: 'webhook'
      webhook_configs:
      - url: 'http://alertmanagerwh:30500/'
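The webhook receiver above posts to http://alertmanagerwh:30500/, a target that is not defined anywhere in this commit. A minimal sketch of a Service such a URL could resolve to, assuming a hypothetical webhook pod labelled app: alertmanagerwh listening on port 30500:

apiVersion: v1
kind: Service
metadata:
  name: alertmanagerwh        # hypothetical, chosen to match the URL above
spec:
  selector:
    app: alertmanagerwh       # assumed label on the receiving pod
  ports:
  - name: webhook
    port: 30500               # matches the port in the webhook URL
    targetPort: 30500
    protocol: TCP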
manifests/alertmanager/alertmanager-service.yaml (new file)
@@ -0,0 +1,14 @@
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-main
spec:
  type: NodePort
  ports:
  - name: web
    nodePort: 30903
    port: 9093
    protocol: TCP
    targetPort: web
  selector:
    alertmanager: main
manifests/alertmanager/alertmanager.yaml (new file)
@@ -0,0 +1,9 @@
apiVersion: "monitoring.coreos.com/v1alpha1"
kind: "Alertmanager"
metadata:
  name: "main"
  labels:
    alertmanager: "main"
spec:
  replicas: 3
  version: v0.5.1
manifests/etcd/etcd-bootkube-gce.yaml (new file)
@@ -0,0 +1,28 @@
apiVersion: v1
kind: Service
metadata:
  name: etcd-k8s
  labels:
    k8s-app: etcd
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: api
    port: 2379
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  name: etcd-k8s
  labels:
    k8s-app: etcd
subsets:
- addresses:
  - ip: 10.142.0.2
    nodeName: 10.142.0.2
  ports:
  - name: api
    port: 2379
    protocol: TCP
manifests/etcd/etcd-bootkube-vagrant-multi.yaml (new file)
@@ -0,0 +1,28 @@
apiVersion: v1
kind: Service
metadata:
  name: etcd-k8s
  labels:
    k8s-app: etcd
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: api
    port: 2379
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  name: etcd-k8s
  labels:
    k8s-app: etcd
subsets:
- addresses:
  - ip: 172.17.4.51
    nodeName: 172.17.4.51
  ports:
  - name: api
    port: 2379
    protocol: TCP
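Both etcd files above only declare a headless Service and hand-maintained Endpoints; nothing in this commit scrapes them yet, while the etcd2.rules further down expect a job named etcd-k8s. A sketch of a ServiceMonitor that could close that gap, written in the same v1alpha1 style as the ServiceMonitors in this commit; the object itself is not part of the commit and the interval is an arbitrary choice:

apiVersion: monitoring.coreos.com/v1alpha1
kind: ServiceMonitor
metadata:
  name: etcd-k8s              # illustrative name only
  labels:
    k8s-apps: http            # label key the prometheus-k8s instance selects on
spec:
  selector:
    matchLabels:
      k8s-app: etcd           # matches the etcd-k8s Service above
  endpoints:
  - port: api
    interval: 30s

With no jobLabel set, the scrape job name should fall back to the Service name, etcd-k8s, which is the job the etcd rules reference.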
manifests/examples/example-app/example-app.yaml (new file)
@@ -0,0 +1,34 @@
kind: Service
apiVersion: v1
metadata:
  name: example-app
  labels:
    tier: frontend
spec:
  selector:
    app: example-app
  ports:
  - name: web
    protocol: TCP
    port: 8080
    targetPort: web
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: example-app
spec:
  replicas: 4
  template:
    metadata:
      labels:
        app: example-app
        version: 1.1.3
    spec:
      containers:
      - name: example-app
        image: quay.io/fabxc/prometheus_demo_service
        ports:
        - name: web
          containerPort: 8080
          protocol: TCP
manifests/examples/example-app/prometheus-frontend-svc.yaml (new file)
@@ -0,0 +1,14 @@
apiVersion: v1
kind: Service
metadata:
  name: prometheus-frontend
spec:
  type: NodePort
  ports:
  - name: web
    nodePort: 30100
    port: 9090
    protocol: TCP
    targetPort: web
  selector:
    prometheus: prometheus-frontend
manifests/examples/example-app/prometheus-frontend.yaml (new file)
@@ -0,0 +1,24 @@
apiVersion: monitoring.coreos.com/v1alpha1
kind: Prometheus
metadata:
  name: prometheus-frontend
  namespace: default
  labels:
    prometheus: frontend
spec:
  version: v1.5.2
  serviceMonitorSelector:
    matchLabels:
      tier: frontend
  resources:
    requests:
      # 2Gi is default, but won't schedule if you don't have a node with >2Gi
      # memory. Modify based on your target and time-series count for
      # production use. This value is mainly meant for demonstration/testing
      # purposes.
      memory: 400Mi
  alerting:
    alertmanagers:
    - namespace: monitoring
      name: alertmanager-main
      port: web
manifests/examples/example-app/servicemonitor-frontend.yaml (new file)
@@ -0,0 +1,13 @@
apiVersion: monitoring.coreos.com/v1alpha1
kind: ServiceMonitor
metadata:
  name: frontend
  labels:
    tier: frontend
spec:
  selector:
    matchLabels:
      tier: frontend
  endpoints:
  - port: web
    interval: 10s
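The three example-app manifests are wired together purely by labels: the ServiceMonitor selects Services labelled tier: frontend, and the prometheus-frontend instance defined above selects ServiceMonitors labelled tier: frontend. A condensed view of the chain, excerpted from the files above rather than defining anything new:

# Service (example-app.yaml) carries the label the ServiceMonitor selects:
metadata:
  labels:
    tier: frontend
# ServiceMonitor (servicemonitor-frontend.yaml) selects that Service and carries the same label itself:
spec:
  selector:
    matchLabels:
      tier: frontend
# Prometheus (prometheus-frontend.yaml) selects the ServiceMonitor by that label:
spec:
  serviceMonitorSelector:
    matchLabels:
      tier: frontend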
manifests/exporters/kube-state-metrics-deployment.yaml (new file)
@@ -0,0 +1,25 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: kube-state-metrics
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      containers:
      - name: kube-state-metrics
        image: gcr.io/google_containers/kube-state-metrics:v0.4.1
        ports:
        - name: metrics
          containerPort: 8080
        resources:
          requests:
            memory: 30Mi
            cpu: 100m
          limits:
            memory: 50Mi
            cpu: 200m
manifests/exporters/kube-state-metrics-service.yaml (new file)
@@ -0,0 +1,18 @@
apiVersion: v1
kind: Service
metadata:
  labels:
    app: kube-state-metrics
    k8s-app: kube-state-metrics
  annotations:
    alpha.monitoring.coreos.com/non-namespaced: "true"
  name: kube-state-metrics
spec:
  ports:
  - name: http-metrics
    port: 8080
    targetPort: metrics
    protocol: TCP
  selector:
    app: kube-state-metrics
manifests/exporters/node-exporter-daemonset.yaml (new file)
@@ -0,0 +1,45 @@
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: node-exporter
spec:
  template:
    metadata:
      labels:
        app: node-exporter
      name: node-exporter
    spec:
      hostNetwork: true
      hostPID: true
      containers:
      - image: quay.io/prometheus/node-exporter:v0.13.0
        args:
        - "-collector.procfs=/host/proc"
        - "-collector.sysfs=/host/sys"
        name: node-exporter
        ports:
        - containerPort: 9100
          hostPort: 9100
          name: scrape
        resources:
          requests:
            memory: 30Mi
            cpu: 100m
          limits:
            memory: 50Mi
            cpu: 200m
        volumeMounts:
        - name: proc
          readOnly: true
          mountPath: /host/proc
        - name: sys
          readOnly: true
          mountPath: /host/sys
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: sys
        hostPath:
          path: /sys
manifests/exporters/node-exporter-service.yaml (new file)
@@ -0,0 +1,17 @@
apiVersion: v1
kind: Service
metadata:
  labels:
    app: node-exporter
    k8s-app: node-exporter
  name: node-exporter
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics
    port: 9100
    protocol: TCP
  selector:
    app: node-exporter
manifests/grafana/grafana-dashboards.yaml (new file, 2984 lines)
Diff suppressed because the file is too large.
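Although its diff is suppressed, the grafana-dashboards ConfigMap is what the grafana-watcher sidecar in the next file mounts at /var/grafana-dashboards. Its rough shape is sketched below; the key name and JSON body are placeholders, not the suppressed content:

apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
data:
  example-dashboard.json: |-    # placeholder; the real ConfigMap carries full Grafana dashboard JSON
    {"title": "example"}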
manifests/grafana/grafana-deployment.yaml (new file)
@@ -0,0 +1,56 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: grafana
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
      - name: grafana
        image: grafana/grafana:4.1.1
        env:
        - name: GF_AUTH_BASIC_ENABLED
          value: "true"
        - name: GF_AUTH_ANONYMOUS_ENABLED
          value: "true"
        volumeMounts:
        - name: grafana-storage
          mountPath: /var/grafana-storage
        ports:
        - name: web
          containerPort: 3000
        resources:
          requests:
            memory: 100Mi
            cpu: 100m
          limits:
            memory: 300Mi
            cpu: 300m
      - name: grafana-watcher
        image: quay.io/coreos/grafana-watcher:latest
        args:
        - '--watch-dir=/var/grafana-dashboards'
        - '--grafana-url=http://admin:admin@localhost:3000'
        volumeMounts:
        - name: grafana-dashboards
          mountPath: /var/grafana-dashboards
        resources:
          requests:
            memory: "16Mi"
            cpu: "50m"
          limits:
            memory: "32Mi"
            cpu: "100m"
        volumeMounts:
        - name: grafana-dashboards
          mountPath: /var/grafana-dashboards
      volumes:
      - name: grafana-storage
        emptyDir: {}
      - name: grafana-dashboards
        configMap:
          name: grafana-dashboards
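The grafana-watcher container above lists volumeMounts twice with identical content. A YAML mapping cannot repeat a key, so parsers keep only one of the two entries; harmless here, but presumably unintended, and a single stanza is sufficient:

        volumeMounts:
        - name: grafana-dashboards
          mountPath: /var/grafana-dashboards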
manifests/grafana/grafana-service.yaml (new file)
@@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
  name: grafana
  labels:
    app: grafana
spec:
  type: NodePort
  ports:
  - name: web
    port: 3000
    protocol: TCP
    nodePort: 30902
  selector:
    app: grafana
manifests/k8s/minikube/kube-controller-manager.yaml (new file)
@@ -0,0 +1,28 @@
apiVersion: v1
kind: Service
metadata:
  name: kube-controller-manager-prometheus-discovery
  labels:
    k8s-app: kube-controller-manager
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics
    port: 10252
    targetPort: 10252
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  name: kube-controller-manager-prometheus-discovery
  labels:
    k8s-app: kube-controller-manager
subsets:
- addresses:
  - ip: MINIKUBE_IP
  ports:
  - name: http-metrics
    port: 10252
    protocol: TCP
manifests/k8s/minikube/kube-scheduler.yaml (new file)
@@ -0,0 +1,28 @@
apiVersion: v1
kind: Service
metadata:
  name: kube-scheduler-prometheus-discovery
  labels:
    k8s-app: kube-scheduler
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics
    port: 10251
    targetPort: 10251
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  name: kube-scheduler-prometheus-discovery
  labels:
    k8s-app: kube-scheduler
subsets:
- addresses:
  - ip: MINIKUBE_IP
  ports:
  - name: http-metrics
    port: 10251
    protocol: TCP
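MINIKUBE_IP in the two minikube Endpoints objects above is a placeholder, not a valid address, so these manifests will not apply as-is; it has to be replaced with the minikube VM's IP (the output of minikube ip) first. For example, using the classic VirtualBox default, which is only an assumption about the local setup:

subsets:
- addresses:
  - ip: 192.168.99.100        # example value; substitute your own minikube ip output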
manifests/k8s/self-hosted/kube-controller-manager.yaml (new file)
@@ -0,0 +1,16 @@
apiVersion: v1
kind: Service
metadata:
  name: kube-controller-manager-prometheus-discovery
  labels:
    k8s-app: kube-controller-manager
spec:
  selector:
    k8s-app: kube-controller-manager
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics
    port: 10252
    targetPort: 10252
    protocol: TCP
manifests/k8s/self-hosted/kube-dns.yaml (new file)
@@ -0,0 +1,20 @@
apiVersion: v1
kind: Service
metadata:
  name: kube-dns-prometheus-discovery
  labels:
    k8s-app: kube-dns
spec:
  selector:
    k8s-app: kube-dns
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics-skydns
    port: 10055
    targetPort: 10055
    protocol: TCP
  - name: http-metrics-dnsmasq
    port: 10054
    targetPort: 10054
    protocol: TCP
manifests/k8s/self-hosted/kube-scheduler.yaml (new file)
@@ -0,0 +1,16 @@
apiVersion: v1
kind: Service
metadata:
  name: kube-scheduler-prometheus-discovery
  labels:
    k8s-app: kube-scheduler
spec:
  selector:
    k8s-app: kube-scheduler
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics
    port: 10251
    targetPort: 10251
    protocol: TCP
manifests/prometheus-operator.yaml (new file)
@@ -0,0 +1,26 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: prometheus-operator
  labels:
    operator: prometheus
spec:
  replicas: 1
  template:
    metadata:
      labels:
        operator: prometheus
    spec:
      containers:
      - name: prometheus-operator
        image: quay.io/coreos/prometheus-operator:v0.6.0
        args:
        - "--kubelet-object=kube-system/kubelet"
        - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
        resources:
          requests:
            cpu: 100m
            memory: 50Mi
          limits:
            cpu: 200m
            memory: 300Mi
manifests/prometheus/prometheus-k8s-rules.yaml (new file)
@@ -0,0 +1,447 @@
apiVersion: v1
data:
  etcd2.rules: |
    ### General cluster availability ###

    # alert if another failed peer will result in an unavailable cluster
    ALERT InsufficientPeers
      IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
      FOR 3m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "Etcd cluster small",
        description = "If one more etcd peer goes down the cluster will be unavailable",
      }

    ### HTTP requests alerts ###

    # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
        / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
        / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if 50% of requests get a 4xx response
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
        / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
      }

    # alert if the 99th percentile of HTTP requests take more than 150ms
    ALERT HTTPRequestsSlow
      IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "slow HTTP requests",
        description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
      }

    ### File descriptor alerts ###

    instance:fd_utilization = process_open_fds / process_max_fds

    # alert if file descriptors are likely to exhaust within the next 4 hours
    ALERT FdExhaustionClose
      IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "file descriptors soon exhausted",
        description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
      }

    # alert if file descriptors are likely to exhaust within the next hour
    ALERT FdExhaustionClose
      IF predict_linear(instance:fd_utilization[10m], 3600) > 1
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "file descriptors soon exhausted",
        description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
      }

    ### etcd proposal alerts ###

    # alert if there are several failed proposals within an hour
    ALERT HighNumberOfFailedProposals
      IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of failed proposals within the etcd cluster are happening",
        description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
      }

    ### etcd disk io latency alerts ###

    # alert if 99th percentile of fsync durations is higher than 500ms
    ALERT HighFsyncDurations
      IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "high fsync durations",
        description = "etcd instance {{ $labels.instance }} fsync durations are high",
      }
  kubernetes.rules: |+
    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.

    ### Container resources ###

    cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(
          container_spec_memory_limit_bytes{container_name!=""},
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    cluster_namespace_controller_pod_container:spec_cpu_shares =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(
          container_spec_cpu_shares{container_name!=""},
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    cluster_namespace_controller_pod_container:cpu_usage:rate =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(
          irate(
            container_cpu_usage_seconds_total{container_name!=""}[5m]
          ),
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    cluster_namespace_controller_pod_container:memory_usage:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(
          container_memory_usage_bytes{container_name!=""},
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    cluster_namespace_controller_pod_container:memory_working_set:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(
          container_memory_working_set_bytes{container_name!=""},
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    cluster_namespace_controller_pod_container:memory_rss:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(
          container_memory_rss{container_name!=""},
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    cluster_namespace_controller_pod_container:memory_cache:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(
          container_memory_cache{container_name!=""},
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    cluster_namespace_controller_pod_container:disk_usage:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(
          container_disk_usage_bytes{container_name!=""},
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    cluster_namespace_controller_pod_container:memory_pagefaults:rate =
      sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
        label_replace(
          irate(
            container_memory_failures_total{container_name!=""}[5m]
          ),
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    cluster_namespace_controller_pod_container:memory_oom:rate =
      sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
        label_replace(
          irate(
            container_memory_failcnt{container_name!=""}[5m]
          ),
          "controller", "$1",
          "pod_name", "^(.*)-[a-z0-9]+"
        )
      )

    ### Cluster resources ###

    cluster:memory_allocation:percent =
      100 * sum by (cluster) (
        container_spec_memory_limit_bytes{pod_name!=""}
      ) / sum by (cluster) (
        machine_memory_bytes
      )

    cluster:memory_used:percent =
      100 * sum by (cluster) (
        container_memory_usage_bytes{pod_name!=""}
      ) / sum by (cluster) (
        machine_memory_bytes
      )

    cluster:cpu_allocation:percent =
      100 * sum by (cluster) (
        container_spec_cpu_shares{pod_name!=""}
      ) / sum by (cluster) (
        container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
      )

    cluster:node_cpu_use:percent =
      100 * sum by (cluster) (
        rate(node_cpu{mode!="idle"}[5m])
      ) / sum by (cluster) (
        machine_cpu_cores
      )

    ### API latency ###

    # Raw metrics are in microseconds. Convert to seconds.
    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(
        0.99,
        sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
      ) / 1e6
    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(
        0.9,
        sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
      ) / 1e6
    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(
        0.5,
        sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
      ) / 1e6

    ### Scheduling latency ###

    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6

    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6

    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6

    ALERT K8SNodeDown
      IF up{job="kubelet"} == 0
      FOR 1h
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Kubelet cannot be scraped",
        description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
      }

    ALERT K8SNodeNotReady
      IF kube_node_status_ready{condition="true"} == 0
      FOR 1h
      LABELS {
        service = "k8s",
        severity = "warning",
      }
      ANNOTATIONS {
        summary = "Node status is NotReady",
        description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
      }

    ALERT K8SManyNodesNotReady
      IF
        count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
        AND
          (
            count by (cluster) (kube_node_status_ready{condition="true"} == 0)
          /
            count by (cluster) (kube_node_status_ready{condition="true"})
          ) > 0.2
      FOR 1m
      LABELS {
        service = "k8s",
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Many K8s nodes are Not Ready",
        description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
      }

    ALERT K8SKubeletNodeExporterDown
      IF up{job="node-exporter"} == 0
      FOR 15m
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Kubelet node_exporter cannot be scraped",
        description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
      }

    ALERT K8SKubeletDown
      IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
      FOR 1h
      LABELS {
        service = "k8s",
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "Many Kubelets cannot be scraped",
        description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
      }

    ALERT K8SApiserverDown
      IF up{job="kubernetes"} == 0
      FOR 15m
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "API server unreachable",
        description = "An API server could not be scraped.",
      }

    # Disable for non HA kubernetes setups.
    ALERT K8SApiserverDown
      IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
      FOR 5m
      LABELS {
        service = "k8s",
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "API server unreachable",
        description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
      }

    ALERT K8SSchedulerDown
      IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
      FOR 5m
      LABELS {
        service = "k8s",
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Scheduler is down",
        description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
      }

    ALERT K8SControllerManagerDown
      IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
      FOR 5m
      LABELS {
        service = "k8s",
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Controller manager is down",
        description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
      }

    ALERT K8SConntrackTableFull
      IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
      FOR 10m
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Number of tracked connections is near the limit",
        description = "The nf_conntrack table is {{ $value }}% full.",
      }

    ALERT K8SConntrackTableFull
      IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
      LABELS {
        service = "k8s",
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "Number of tracked connections is near the limit",
        description = "The nf_conntrack table is {{ $value }}% full.",
      }

    # To catch the conntrack sysctl de-tuning when it happens
    ALERT K8SConntrackTuningMissing
      IF node_nf_conntrack_udp_timeout > 10
      FOR 10m
      LABELS {
        service = "k8s",
        severity = "warning",
      }
      ANNOTATIONS {
        summary = "Node does not have the correct conntrack tunings",
        description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
      }

    ALERT K8STooManyOpenFiles
      IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50
      FOR 10m
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "{{ $labels.job }} has too many open file descriptors",
        description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
      }

    ALERT K8STooManyOpenFiles
      IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80
      FOR 10m
      LABELS {
        service = "k8s",
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "{{ $labels.job }} has too many open file descriptors",
        description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
      }

    # Some verbs excluded because they are expected to be long-lasting:
    # WATCHLIST is long-poll, CONNECT is `kubectl exec`.
    ALERT K8SApiServerLatency
      IF histogram_quantile(
          0.99,
          sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
        ) / 1e6 > 1.0
      FOR 10m
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Kubernetes apiserver latency is high",
        description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
      }

    ALERT K8SApiServerEtcdAccessLatency
      IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
      FOR 15m
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Access to etcd is slow",
        description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
      }

    ALERT K8SKubeletTooManyPods
      IF kubelet_running_pod_count > 100
      LABELS {
        service = "k8s",
        severity = "warning",
      }
      ANNOTATIONS {
        summary = "Kubelet is close to pod limit",
        description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
      }

kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-k8s-rules
manifests/prometheus/prometheus-k8s-service.yaml (new file)
@@ -0,0 +1,14 @@
apiVersion: v1
kind: Service
metadata:
  name: prometheus-k8s
spec:
  type: NodePort
  ports:
  - name: web
    nodePort: 30900
    port: 9090
    protocol: TCP
    targetPort: web
  selector:
    prometheus: k8s
manifests/prometheus/prometheus-k8s-servicemonitors.yaml (new file)
@@ -0,0 +1,69 @@
apiVersion: monitoring.coreos.com/v1alpha1
kind: ServiceMonitor
metadata:
  name: kube-apiserver
  labels:
    k8s-apps: https
spec:
  jobLabel: provider
  selector:
    matchLabels:
      component: apiserver
      provider: kubernetes
  namespaceSelector:
    matchNames:
    - default
  endpoints:
  - port: https
    interval: 15s
    scheme: https
    tlsConfig:
      caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      serverName: kubernetes
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
---
apiVersion: monitoring.coreos.com/v1alpha1
kind: ServiceMonitor
metadata:
  name: k8s-apps-https
  labels:
    k8s-apps: https
spec:
  jobLabel: k8s-app
  selector:
    matchExpressions:
    - {key: k8s-app, operator: Exists}
  namespaceSelector:
    matchNames:
    - kube-system
  endpoints:
  - port: https-metrics
    interval: 15s
    scheme: https
    tlsConfig:
      caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
---
apiVersion: monitoring.coreos.com/v1alpha1
kind: ServiceMonitor
metadata:
  name: k8s-apps-http
  labels:
    k8s-apps: http
spec:
  jobLabel: k8s-app
  selector:
    matchExpressions:
    - {key: k8s-app, operator: Exists}
  namespaceSelector:
    matchNames:
    - kube-system
    - monitoring
  endpoints:
  - port: http-metrics
    interval: 15s
  - port: http-metrics-dnsmasq
    interval: 15s
  - port: http-metrics-skydns
    interval: 15s
manifests/prometheus/prometheus-k8s.yaml (new file)
@@ -0,0 +1,24 @@
apiVersion: monitoring.coreos.com/v1alpha1
kind: Prometheus
metadata:
  name: k8s
  labels:
    prometheus: k8s
spec:
  replicas: 2
  version: v1.5.2
  serviceMonitorSelector:
    matchExpression:
    - {key: k8s-apps, operator: Exists}
  resources:
    requests:
      # 2Gi is default, but won't schedule if you don't have a node with >2Gi
      # memory. Modify based on your target and time-series count for
      # production use. This value is mainly meant for demonstration/testing
      # purposes.
      memory: 400Mi
  alerting:
    alertmanagers:
    - namespace: monitoring
      name: alertmanager-main
      port: web
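The serviceMonitorSelector above uses matchExpression, which looks like a typo: a Kubernetes label selector spells the field matchExpressions (plural), as the ServiceMonitor manifests earlier in this commit do, and the singular form is not a recognized field. The intended stanza is presumably:

  serviceMonitorSelector:
    matchExpressions:
    - {key: k8s-apps, operator: Exists}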