Merge branch 'master' of github.com:coreos/kube-prometheus

This commit is contained in:
Fabian Reinartz
2016-11-04 13:04:51 -07:00
6 changed files with 88 additions and 14 deletions

View File

@@ -0,0 +1,68 @@
# Prometheus configuration for monitoring a Kubernetes cluster.
# Every job discovers its targets through the Kubernetes API and authenticates
# with the service-account CA certificate and bearer token found under
# /var/run/secrets/kubernetes.io/serviceaccount/.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - /etc/prometheus/rules/*.rules

scrape_configs:
  # Scrapes the kubelet on every node (one target per discovered node).
  - job_name: kubelets
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      # Skip verification until we have resolved why the certificate validation
      # for the kubelet on API server nodes fails.
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    kubernetes_sd_configs:
      - role: node

  # Scrapes the endpoint lists for Prometheus itself, the Kubernetes API
  # server, node-exporter, kube-state-metrics, and etcd, which we all consider
  # part of a default setup.
  - job_name: standard-endpoints
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      # As for kubelets, certificate validation fails for the API server (node)
      # and we circumvent it for now.
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    kubernetes_sd_configs:
      - role: endpoints
    relabel_configs:
      # Drop every endpoint whose service is not one of the listed names.
      - action: keep
        source_labels: [__meta_kubernetes_service_name]
        regex: prometheus|kubernetes|node-exporter|kube-state-metrics|etcd-k8s
      # Use the service name as the job label.
      - action: replace
        source_labels: [__meta_kubernetes_service_name]
        target_label: job
      # Only the "kubernetes" service (the API server) is scraped over HTTPS;
      # the other services keep the default scheme.
      - action: replace
        source_labels: [__meta_kubernetes_service_name]
        regex: kubernetes
        target_label: __scheme__
        replacement: https

  # Scrapes the endpoint lists of services named
  # kube-<component>-prometheus-discovery, which we consider part of a default
  # setup. Only ports named "prometheus" are kept.
  - job_name: kube-components
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    kubernetes_sd_configs:
      - role: endpoints
    relabel_configs:
      # Derive the job label from the service name:
      # kube-<component>-prometheus-discovery becomes kube-<component>.
      # Services that do not match are left untouched here and dropped below.
      - action: replace
        source_labels: [__meta_kubernetes_service_name]
        target_label: job
        regex: "kube-(.*)-prometheus-discovery"
        replacement: "kube-${1}"
      # Keep only the discovery services ...
      - action: keep
        source_labels: [__meta_kubernetes_service_name]
        regex: "kube-(.*)-prometheus-discovery"
      # ... and of those, only the endpoint port named "prometheus".
      - action: keep
        source_labels: [__meta_kubernetes_endpoint_port_name]
        regex: "prometheus"

View File

@@ -1,3 +1,5 @@
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
### Container resources ### ### Container resources ###
cluster_namespace_controller_pod_container:spec_memory_limit_bytes = cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
@@ -249,7 +251,7 @@ ALERT K8SApiserverDown
# Disable for non HA kubernetes setups. # Disable for non HA kubernetes setups.
ALERT K8SApiserverDown ALERT K8SApiserverDown
IF absent({job="kubernetes"}) or count by(cluster) (up{job="kubernetes"} == 1) < 2 IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
FOR 5m FOR 5m
LABELS { LABELS {
service = "k8s", service = "k8s",
@@ -361,7 +363,7 @@ ALERT K8STooManyOpenFiles
ALERT K8SApiServerLatency ALERT K8SApiServerLatency
IF histogram_quantile( IF histogram_quantile(
0.99, 0.99,
sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST"}) sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
) / 1e6 > 1.0 ) / 1e6 > 1.0
FOR 10m FOR 10m
LABELS { LABELS {

View File

@@ -1,7 +1,10 @@
#!/bin/bash #!/bin/bash
# Generate Prometheus configuration ConfigMap
kubectl create configmap --dry-run=true prometheus-k8s --from-file=assets/prometheus/prometheus.yaml -oyaml > manifests/prometheus/prometheus-k8s-cm.yaml
# Generate Alert Rules ConfigMap # Generate Alert Rules ConfigMap
kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/alerts/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml
# Generate Dashboard ConfigMap # Generate Dashboard ConfigMap
kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-cm.yaml kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-cm.yaml

View File

@@ -1,18 +1,15 @@
apiVersion: v1 apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-k8s
data: data:
prometheus.yaml: | prometheus.yaml: |
global: global:
evaluation_interval: 30s scrape_interval: 15s
evaluation_interval: 15s
rule_files: rule_files:
- /etc/prometheus/rules/*.rules - /etc/prometheus/rules/*.rules
scrape_configs: scrape_configs:
- job_name: kubelets - job_name: kubelets
scrape_interval: 20s
scheme: https scheme: https
tls_config: tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
@@ -27,7 +24,6 @@ data:
# Scrapes the endpoint lists for the Kubernetes API server, kube-state-metrics, # Scrapes the endpoint lists for the Kubernetes API server, kube-state-metrics,
# and node-exporter, which we all consider part of a default setup. # and node-exporter, which we all consider part of a default setup.
- job_name: standard-endpoints - job_name: standard-endpoints
scrape_interval: 20s
tls_config: tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# As for kubelets, certificate validation fails for the API server (node) # As for kubelets, certificate validation fails for the API server (node)
@@ -41,7 +37,7 @@ data:
relabel_configs: relabel_configs:
- action: keep - action: keep
source_labels: [__meta_kubernetes_service_name] source_labels: [__meta_kubernetes_service_name]
regex: kubernetes|node-exporter|kube-state-metrics|etcd-k8s regex: prometheus|kubernetes|node-exporter|kube-state-metrics|etcd-k8s
- action: replace - action: replace
source_labels: [__meta_kubernetes_service_name] source_labels: [__meta_kubernetes_service_name]
target_label: job target_label: job
@@ -54,7 +50,6 @@ data:
# Scrapes the endpoint lists for the kube-dns server. Which we consider # Scrapes the endpoint lists for the kube-dns server. Which we consider
# part of a default setup. # part of a default setup.
- job_name: kube-components - job_name: kube-components
scrape_interval: 20s
tls_config: tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
@@ -74,3 +69,7 @@ data:
- action: keep - action: keep
source_labels: [__meta_kubernetes_endpoint_port_name] source_labels: [__meta_kubernetes_endpoint_port_name]
regex: "prometheus" regex: "prometheus"
kind: ConfigMap
metadata:
creationTimestamp: null
name: prometheus-k8s

View File

@@ -53,6 +53,8 @@ data:
\ summary = \"high fsync durations\",\n description = \"ectd instance {{ \ summary = \"high fsync durations\",\n description = \"ectd instance {{
$labels.instance }} fync durations are high\",\n }\n" $labels.instance }} fync durations are high\",\n }\n"
kubernetes.rules: |+ kubernetes.rules: |+
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
### Container resources ### ### Container resources ###
cluster_namespace_controller_pod_container:spec_memory_limit_bytes = cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
@@ -304,7 +306,7 @@ data:
# Disable for non HA kubernetes setups. # Disable for non HA kubernetes setups.
ALERT K8SApiserverDown ALERT K8SApiserverDown
IF absent({job="kubernetes"}) or count by(cluster) (up{job="kubernetes"} == 1) < 2 IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
FOR 5m FOR 5m
LABELS { LABELS {
service = "k8s", service = "k8s",
@@ -416,7 +418,7 @@ data:
ALERT K8SApiServerLatency ALERT K8SApiServerLatency
IF histogram_quantile( IF histogram_quantile(
0.99, 0.99,
sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST"}) sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"})
) / 1e6 > 1.0 ) / 1e6 > 1.0
FOR 10m FOR 10m
LABELS { LABELS {