From b6e7d708c5822baa1aa713575c568e2279941390 Mon Sep 17 00:00:00 2001 From: seph Date: Fri, 13 Jul 2018 11:48:27 -0400 Subject: [PATCH 01/11] Configure kube-state-metrics As I work with kube-state-metrics in a large cluster, I found I needed to make some adjustments. - Expose the collectors, allowing one to configure exclusions. - Expose the addon_resizer parameters, facilitating reproduce adjustments - Allow adjusting scrapeTimeout and scrapeInterval --- README.md | 21 ++++++++++++ .../kube-state-metrics.libsonnet | 33 ++++++++++++++----- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index bf61d9ae..a9a42e54 100644 --- a/README.md +++ b/README.md @@ -369,3 +369,24 @@ The Prometheus `/targets` page will show the kubelet job with the error `403 Una #### Authorization problem The Prometheus `/targets` page will show the kubelet job with the error `401 Unauthorized`, when token authorization is not enabled. Ensure that the `--authorization-mode=Webhook` flag is enabled on all kubelet configurations. +### kube-state-metrics resource usaged + +In some environments, kube-state-metrics may need additional +resources. One driver for more resource needs, is a high number of +namespaces. There may be others. + +kube-state-metrics has it's resources using an +[addon-resizer](https://github.com/kubernetes/autoscaler/tree/master/addon-resizer/nanny) +You can control it's parameters by setting variables in the +config. 
They default to: + +``` jsonnet + resizer+:: { + kubeStateMetrics+:: { + cpu: '100m', + extraCpu: '2m', + memory: '150Mi', + extraMemory: '30Mi', + }, + } +``` diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index c36f293b..f9065282 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -4,6 +4,22 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; _config+:: { namespace: 'default', + kubeStateMetrics+:: { + // when this is an empty string, you get the default set + collectors: '', + scrapeTimeout: '', + scrapeInterval: '30s', + }, + + resizer+:: { + kubeStateMetrics+:: { + cpu: '100m', + extraCpu: '2m', + memory: '150Mi', + extraMemory: '30Mi', + }, + }, + versions+:: { kubeStateMetrics: 'v1.3.1', kubeRbacProxy: 'v0.3.1', @@ -137,19 +153,20 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082', + '--collectors=' + $._config.kubeStateMetrics.collectors, ]) + - container.mixin.resources.withRequests({ cpu: '102m', memory: '180Mi' }) + - container.mixin.resources.withLimits({ cpu: '102m', memory: '180Mi' }); + container.mixin.resources.withRequests({ cpu: $._config.resizer.kubeStateMetrics.cpu, memory: $._config.resizer.kubeStateMetrics.memory }) + + container.mixin.resources.withLimits({ cpu: $._config.resizer.kubeStateMetrics.cpu, memory: $._config.resizer.kubeStateMetrics.memory }); local addonResizer = container.new('addon-resizer', $._config.imageRepos.addonResizer + ':' + $._config.versions.addonResizer) + container.withCommand([ '/pod_nanny', '--container=kube-state-metrics', - '--cpu=100m', - '--extra-cpu=2m', - '--memory=150Mi', - '--extra-memory=30Mi', + '--cpu=' + $._config.resizer.kubeStateMetrics.cpu, + '--extra-cpu=' + 
$._config.resizer.kubeStateMetrics.extraCpu, + '--memory=' + $._config.resizer.kubeStateMetrics.memory, + '--extra-memory=' + $._config.resizer.kubeStateMetrics.extraMemory, '--threshold=5', '--deployment=kube-state-metrics', ]) + @@ -258,13 +275,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; { port: 'https-main', scheme: 'https', - interval: '30s', + interval: $._config.kubeStateMetrics.scrapeInterval, honorLabels: true, bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', tlsConfig: { insecureSkipVerify: true, }, - }, + } + if $._config.kubeStateMetrics.scrapeTimeout != '' then { scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout } else {}, { port: 'https-self', scheme: 'https', From dabfca595bed9142a1850dca958a7e18037a18b2 Mon Sep 17 00:00:00 2001 From: Max Inden Date: Tue, 17 Jul 2018 15:10:38 +0200 Subject: [PATCH 02/11] Makefile: Properly rebuild po-docgen on src changes (#1625) --- manifests/grafana-dashboardDefinitions.yaml | 34 +++++++++++++++++++-- manifests/prometheus-rules.yaml | 2 +- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index a0dba292..1143970e 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -2501,6 +2501,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -2509,6 +2510,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2517,6 +2519,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2525,6 +2528,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2533,6 +2537,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -2861,6 +2866,7 @@ items: "instant": true, "intervalFactor": 2, 
"legendFormat": "", + "refId": "A", "step": 10 }, { @@ -2869,6 +2875,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2877,6 +2884,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2885,6 +2893,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2893,6 +2902,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3303,6 +3313,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3311,6 +3322,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3319,6 +3331,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3327,6 +3340,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3335,6 +3349,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3663,6 +3678,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3671,6 +3687,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3679,6 +3696,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3687,6 +3705,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3695,6 +3714,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4132,6 +4152,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4140,6 +4161,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4148,6 +4170,7 @@ items: "instant": 
true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4156,6 +4179,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -4164,6 +4188,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4492,6 +4517,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4500,6 +4526,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4508,6 +4535,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4516,6 +4544,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -4524,6 +4553,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -5696,14 +5726,14 @@ items: "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 
75d5f36e..49c4a995 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -388,7 +388,7 @@ spec: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"} - for: 15m + for: 1h labels: severity: critical - alert: KubeStatefulSetReplicasMismatch From efe686c0c09efd920ff6d60461842a8a1e2c48d9 Mon Sep 17 00:00:00 2001 From: Max Inden Date: Tue, 17 Jul 2018 15:11:46 +0200 Subject: [PATCH 03/11] security: Enforce nobody user and read only / (#1393) * Make the Prometheus Operator Docker image run as `nobody` by default. * Disallow privilege escalation via K8s * Enforce read only root filesystem --- manifests/0prometheus-operator-deployment.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index faca5a84..5a193a35 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -18,6 +18,7 @@ spec: containers: - args: - --kubelet-service=kube-system/kubelet + - -logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.0 image: quay.io/coreos/prometheus-operator:v0.22.0 @@ -32,6 +33,9 @@ spec: requests: cpu: 100m memory: 50Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true nodeSelector: beta.kubernetes.io/os: linux securityContext: From 358c8477eae9429cf5a8b889c72176ca95caee6d Mon Sep 17 00:00:00 2001 From: seph Date: Tue, 17 Jul 2018 09:52:30 -0400 Subject: [PATCH 04/11] Resource config now in config.kubeStateMetrics As requested, this updates the resource specification to live directly in config.kubeStateMetrics. It also clarifies the config variables. These names are what Google uses in some of their tooling. 
(And a slight tweak to the way collectors are specified) --- README.md | 17 +++++----- .../kube-state-metrics.libsonnet | 32 ++++++++----------- 2 files changed, 21 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index a9a42e54..67cbd4c1 100644 --- a/README.md +++ b/README.md @@ -369,24 +369,23 @@ The Prometheus `/targets` page will show the kubelet job with the error `403 Una #### Authorization problem The Prometheus `/targets` page will show the kubelet job with the error `401 Unauthorized`, when token authorization is not enabled. Ensure that the `--authorization-mode=Webhook` flag is enabled on all kubelet configurations. -### kube-state-metrics resource usaged + +### kube-state-metrics resource usage In some environments, kube-state-metrics may need additional resources. One driver for more resource needs, is a high number of namespaces. There may be others. -kube-state-metrics has it's resources using an +kube-state-metrics resource allocation is managed by [addon-resizer](https://github.com/kubernetes/autoscaler/tree/master/addon-resizer/nanny) You can control it's parameters by setting variables in the config. 
They default to: ``` jsonnet - resizer+:: { - kubeStateMetrics+:: { - cpu: '100m', - extraCpu: '2m', - memory: '150Mi', - extraMemory: '30Mi', - }, + kubeStateMetrics+:: { + baseCPU: '100m', + cpuPerNode: '2m', + baseMemory: '150Mi', + memoryPerNode: '30Mi', } ``` diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index f9065282..59c0104a 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -5,19 +5,14 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', kubeStateMetrics+:: { - // when this is an empty string, you get the default set - collectors: '', - scrapeTimeout: '', + collectors: '', // empty string gets a default set scrapeInterval: '30s', - }, + scrapeTimeout: '', - resizer+:: { - kubeStateMetrics+:: { - cpu: '100m', - extraCpu: '2m', - memory: '150Mi', - extraMemory: '30Mi', - }, + baseCPU: '100m', + baseMemory: '150Mi', + cpuPerNode: '2m', + memoryPerNode: '30Mi', }, versions+:: { @@ -153,20 +148,19 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082', - '--collectors=' + $._config.kubeStateMetrics.collectors, - ]) + - container.mixin.resources.withRequests({ cpu: $._config.resizer.kubeStateMetrics.cpu, memory: $._config.resizer.kubeStateMetrics.memory }) + - container.mixin.resources.withLimits({ cpu: $._config.resizer.kubeStateMetrics.cpu, memory: $._config.resizer.kubeStateMetrics.memory }); + ] + if $._config.kubeStateMetrics.collectors != '' then ['--collectors=' + $._config.kubeStateMetrics.collectors] else []) + + container.mixin.resources.withRequests({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory }) + + container.mixin.resources.withLimits({ cpu: $._config.kubeStateMetrics.baseCPU, memory: 
$._config.kubeStateMetrics.baseMemory }); local addonResizer = container.new('addon-resizer', $._config.imageRepos.addonResizer + ':' + $._config.versions.addonResizer) + container.withCommand([ '/pod_nanny', '--container=kube-state-metrics', - '--cpu=' + $._config.resizer.kubeStateMetrics.cpu, - '--extra-cpu=' + $._config.resizer.kubeStateMetrics.extraCpu, - '--memory=' + $._config.resizer.kubeStateMetrics.memory, - '--extra-memory=' + $._config.resizer.kubeStateMetrics.extraMemory, + '--cpu=' + $._config.kubeStateMetrics.baseCPU, + '--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode, + '--memory=' + $._config.kubeStateMetrics.baseMemory, + '--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode, '--threshold=5', '--deployment=kube-state-metrics', ]) + From 596b8697d005fa388515a675346ca054f29f17e2 Mon Sep 17 00:00:00 2001 From: seph Date: Tue, 17 Jul 2018 10:13:18 -0400 Subject: [PATCH 05/11] Set default scrape values We default to a 30s scrapeInterval, we may as well also set scrapeTimeout to the same. 
--- .../kube-state-metrics/kube-state-metrics.libsonnet | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 59c0104a..2805fc9d 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -7,7 +7,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; kubeStateMetrics+:: { collectors: '', // empty string gets a default set scrapeInterval: '30s', - scrapeTimeout: '', + scrapeTimeout: '30s', baseCPU: '100m', baseMemory: '150Mi', @@ -270,12 +270,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; port: 'https-main', scheme: 'https', interval: $._config.kubeStateMetrics.scrapeInterval, + scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout, honorLabels: true, bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', tlsConfig: { insecureSkipVerify: true, }, - } + if $._config.kubeStateMetrics.scrapeTimeout != '' then { scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout } else {}, + }, { port: 'https-self', scheme: 'https', From 04cf9ce35a6dd66006d1a58d2b7d720bf55efcef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 17 Jul 2018 19:49:42 +0200 Subject: [PATCH 06/11] *: Re-generate --- manifests/grafana-dashboardDefinitions.yaml | 34 +++++++++++++++++-- manifests/kube-state-metrics-deployment.yaml | 8 ++--- .../kube-state-metrics-serviceMonitor.yaml | 1 + manifests/prometheus-rules.yaml | 2 +- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index a0dba292..1143970e 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -2501,6 +2501,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": 
"", + "refId": "A", "step": 10 }, { @@ -2509,6 +2510,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2517,6 +2519,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2525,6 +2528,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2533,6 +2537,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -2861,6 +2866,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -2869,6 +2875,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2877,6 +2884,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2885,6 +2893,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2893,6 +2902,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3303,6 +3313,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3311,6 +3322,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3319,6 +3331,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3327,6 +3340,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3335,6 +3349,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3663,6 +3678,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3671,6 +3687,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3679,6 +3696,7 @@ items: "instant": true, 
"intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3687,6 +3705,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3695,6 +3714,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4132,6 +4152,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4140,6 +4161,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4148,6 +4170,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4156,6 +4179,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -4164,6 +4188,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4492,6 +4517,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4500,6 +4526,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4508,6 +4535,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4516,6 +4544,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -4524,6 +4553,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -5696,14 +5726,14 @@ items: "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: 
{{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index c7bb25c6..065c87a9 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -55,11 +55,11 @@ spec: name: kube-state-metrics resources: limits: - cpu: 102m - memory: 180Mi + cpu: 100m + memory: 150Mi requests: - cpu: 102m - memory: 180Mi + cpu: 100m + memory: 150Mi - command: - /pod_nanny - --container=kube-state-metrics diff --git a/manifests/kube-state-metrics-serviceMonitor.yaml b/manifests/kube-state-metrics-serviceMonitor.yaml index 3d1073ad..2100449d 100644 --- a/manifests/kube-state-metrics-serviceMonitor.yaml +++ b/manifests/kube-state-metrics-serviceMonitor.yaml @@ -12,6 +12,7 @@ spec: interval: 30s port: https-main scheme: https + scrapeTimeout: 30s tlsConfig: insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 75d5f36e..49c4a995 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -388,7 +388,7 @@ spec: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"} - for: 15m + for: 1h labels: severity: critical - alert: KubeStatefulSetReplicasMismatch From ade7b88d654d698def489e860c5e36d522db6c44 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 18 Jul 2018 10:25:09 +0200 
Subject: [PATCH 07/11] Update jsonnet dependencies --- manifests/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 4b00b004..cb8cc9d8 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: containers: - - image: grafana/grafana:5.1.0 + - image: grafana/grafana:5.2.1 name: grafana ports: - containerPort: 3000 From 06df9fb67d6b3b6e124df6622ba8eb6f8aead47a Mon Sep 17 00:00:00 2001 From: Max Inden Date: Fri, 20 Jul 2018 15:09:17 +0200 Subject: [PATCH 08/11] bundle.yaml: Bump Prometheus Operator memory request and limit (#1622) When handling big Kubernetes objects, marshalling objects is memory intensive. This can be reproduced with the end-to-end test `TestPrometheusRulesExceedingConfigMapLimit`. This patch doubles the memory request and limit of the Prometheus Operator deployment to 100Mi and 200Mi. --- manifests/0prometheus-operator-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index b5be341e..06b295f2 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -29,10 +29,10 @@ spec: resources: limits: cpu: 200m - memory: 100Mi + memory: 200Mi requests: cpu: 100m - memory: 50Mi + memory: 100Mi securityContext: allowPrivilegeEscalation: false readOnlyRootFilesystem: true From d728ab5511ea839448499998358c4e5c53c86d24 Mon Sep 17 00:00:00 2001 From: Ali Rizwan Date: Mon, 23 Jul 2018 12:20:54 +0200 Subject: [PATCH 09/11] Only alert for nodes that currently exist (#1661) --- jsonnet/kube-prometheus/alerts/node.libsonnet | 4 ++-- manifests/prometheus-rules.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet 
b/jsonnet/kube-prometheus/alerts/node.libsonnet index 46a5e36d..5c24f09f 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -11,7 +11,7 @@ summary: 'Node disk is running full within 24 hours', }, expr: ||| - predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 + predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 and on(instance) up{%(nodeExporterSelector)s} ||| % $._config, 'for': '30m', labels: { @@ -25,7 +25,7 @@ summary: 'Node disk is running full within 2 hours', }, expr: ||| - predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 + predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 and on(instance) up{%(nodeExporterSelector)s} ||| % $._config, 'for': '10m', labels: { diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 49c4a995..5af7d2fa 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -744,7 +744,7 @@ spec: full within the next 24 hours (mounted at {{$labels.mountpoint}}) summary: Node disk is running full within 24 hours expr: | - predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 + predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 and on(instance) up{job="node-exporter"} for: 30m labels: severity: warning @@ -754,7 +754,7 @@ spec: full within the next 2 hours (mounted at {{$labels.mountpoint}}) summary: Node disk is running full within 2 hours expr: | - 
predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 + predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 and on(instance) up{job="node-exporter"} for: 10m labels: severity: critical From d1cd95190303c2dd4ce05d7236124c3c707e8948 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Mon, 23 Jul 2018 12:57:03 +0200 Subject: [PATCH 10/11] *: regenerate --- ...erator-0alertmanagerCustomResourceDefinition.yaml | 10 +++++----- ...operator-0prometheusCustomResourceDefinition.yaml | 12 ++++++------ ...ator-0servicemonitorCustomResourceDefinition.yaml | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 6f30397a..9d782f51 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'Specification of the desired behavior of the Alertmanager - cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'AlertmanagerSpec is a specification of the desired behavior + of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: affinity: description: Affinity is a group of affinity scheduling rules. @@ -2372,9 +2372,9 @@ spec: description: Version the cluster should be on. type: string status: - description: 'Most recent observed status of the Alertmanager cluster. Read-only. 
- Not included when requesting from the apiserver, only from the Prometheus - Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'AlertmanagerStatus is the most recent observed status of the + Alertmanager cluster. Read-only. Not included when requesting from the + apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 140deffa..df1274eb 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'Specification of the desired behavior of the Prometheus cluster. - More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'PrometheusSpec is a specification of the desired behavior + of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: additionalAlertManagerConfigs: description: SecretKeySelector selects a key of a Secret. @@ -2862,7 +2862,7 @@ spec: description: Peers is a DNS name for Thanos to discover peers through. type: string s3: - description: ThanosSpec defines parameters for of AWS Simple Storage + description: ThanosS3Spec defines parameters for of AWS Simple Storage Service (S3) with Thanos. 
(S3 compatible services apply as well) properties: accessKey: @@ -2961,9 +2961,9 @@ spec: description: Version of Prometheus to be deployed. type: string status: - description: 'Most recent observed status of the Prometheus cluster. Read-only. - Not included when requesting from the apiserver, only from the Prometheus - Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'PrometheusStatus is the most recent observed status of the + Prometheus cluster. Read-only. Not included when requesting from the apiserver, + only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index f3068cf8..9d96bfeb 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -169,7 +169,7 @@ spec: description: The label to use to retrieve the job name from. type: string namespaceSelector: - description: A selector for selecting namespaces either selecting all + description: NamespaceSelector is a selector for selecting either all namespaces or a list of namespaces. 
properties: any: From e47243b413a0e6b657407029eeb13f4de80051a1 Mon Sep 17 00:00:00 2001 From: Saverio Proto Date: Tue, 24 Jul 2018 12:58:40 +0200 Subject: [PATCH 11/11] metrics-server: enable access to nodes/stats Without this access the logs of metrics-server will show the following error line: ``` unable to fully scrape metrics from source kubelet_summary:k8s-1: unable to fetch metrics from Kubelet k8s-1 (10.8.10.14): request failed - "403 Forbidden", response: "Forbidden (user=system:serviceaccount:kube-system:metrics-server, verb=get, resource=nodes, subresource=stats)", ``` and `kubectl top nodes` will give no results --- experimental/metrics-server/metrics-server-cluster-role.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/experimental/metrics-server/metrics-server-cluster-role.yaml b/experimental/metrics-server/metrics-server-cluster-role.yaml index 6976f5ce..38844d9a 100644 --- a/experimental/metrics-server/metrics-server-cluster-role.yaml +++ b/experimental/metrics-server/metrics-server-cluster-role.yaml @@ -8,6 +8,7 @@ rules: resources: - pods - nodes + - nodes/stats - namespaces verbs: - get